In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.offline as py
import scipy.spatial.distance as distance
import sklearn.metrics as metrics
import sklearn.metrics.pairwise as pairwise
from scipy import stats
from sklearn.decomposition import PCA

import rogers as r
from rogers.logger import init_logging, get_logger

plotly.offline.init_notebook_mode()

%matplotlib inline

Increase max cell height for network graph visualization

In [None]:
%%html
<style>
/* Narrow the notebook container and let output areas grow up to 1000px tall. */
.container { width:70% !important; }
.output_wrapper, .output {
    height:auto !important;
    max-height:1000px;
}
/* Remove the shadow on scrolling output areas. The vendor-prefixed property needs its
   leading hyphen; without it the declaration is not a valid property and is ignored. */
.output_scroll {
    -webkit-box-shadow:none !important;
    box-shadow:none !important;
}
</style>

# Toy Examples

| Sample | File Size (kB) | # of export symbols | # of import symbols | # of import DLLs |
|----|-----|----|---- |----- |
|$x_1$|100|3|10 |2 |
|$x_2$|10234|4|4|2 |
|$x_3$|3453|6|2 |1 | 

In [None]:
# Toy feature vectors; column order follows the table above:
# [file size (kB), export symbols, import symbols, import DLLs]
x_1, x_2, x_3 = (
    np.array(features)
    for features in (
        [100, 3, 10, 2],
        [10234, 4, 4, 2],
        [3453, 6, 2, 1],
    )
)

In [None]:
x_1

In [None]:
x_1.shape

In [None]:
pairwise.euclidean_distances([x_1, x_2, x_3])

In [None]:
pairwise.manhattan_distances([x_1, x_2, x_3])

In [None]:
pairwise.cosine_distances([x_1, x_2, x_3])

In [None]:
pairwise.cosine_similarity([x_1, x_2, x_3])

In [None]:
# Jaccard similarity, obtained as 1 - Jaccard distance.
# NOTE(review): scipy documents `jaccard` as a dissimilarity between boolean vectors; these
# are count vectors, so confirm the value is meaningful for the installed scipy version.
1 - distance.jaccard(x_1, x_2)

In [None]:
distance.jaccard(x_1, x_3)

In [None]:
distance.jaccard(x_2, x_3)

# Command Line Usage

Install rogers by following the installation instructions: https://github.com/cylance/rogers/blob/develop/README.md#installation

In [None]:
! rogers -h

# Index

Interact with NN indexes:

+ Extract raw feature data
+ Transform data into vectors with pipeline
+ Fit a specific NN index
+ Query NN index

In [None]:
! rogers index -h

Extract samples in a directory into the database

In [None]:
! rogers index --dir "../samples/00/FD" extract -f

Transform samples using vectorizer pipeline

In [None]:
! rogers index transform

Fit different indexes

In [None]:
! rogers index fit pruning_tree

In [None]:
! rogers index fit lsh_forest

In [None]:
! rogers index fit hnsw

In [None]:
! rogers index fit ctph

Query a fitted index

In [None]:
! rogers --print index query lsh_forest "4B8C17F0F8BF27755DDAADB9B33B17FFBD6F785D7833300676AD70F43334D7F4"

In [None]:
! rogers --print index query ctph "4B8C17F0F8BF27755DDAADB9B33B17FFBD6F785D7833300676AD70F43334D7F4"

In [None]:
! rogers --print index query pruning_tree "4B8C17F0F8BF27755DDAADB9B33B17FFBD6F785D7833300676AD70F43334D7F4" --k 10

# Datasets

## Variant 2015

`variant2015` is a small dataset curated specifically for comparing malware similarity tools.

+ `group 1`: Ziyang RAT samples, originally packed with Armadillo v1.71
+ `group 2`: LinseningSvr samples, originally packed with Armadillo or Aspack
+ `group 3`: BeepService samples, originally packed with Armadillo v1.71
+ `group 4`: SimpleFileMover samples, originally packed with Armadillo v1.71
+ `group 5`: DD Keylogger samples
+ `group 6`: PUP by McAfee samples, originally packed with Armadillo v1.71 or InstallShield 2000
+ `group 7`: Backdoors by McAfee samples, originally packed with Armadillo v1.xx - v2.xx
+ `group 8`: SvcInstaller samples, originally packed with Armadillo v1.71

In [None]:
# Read the variant2015 sample listing and report its row count.
variant_df = pd.read_csv("../datasets/variant_2015.csv")
print("Number of samples %s" % (variant_df.shape[0],))

In [None]:
# First five SHA-256 hashes whose `variant_group` is 1.
variant_group_1 = (
    variant_df.loc[variant_df['variant_group'] == 1, 'sha256']
    .head(5)
    .tolist()
)

In [None]:
variant_df.groupby(['variant_group']).size()

## VT Cluster Jan 2018

`vtcluster_jan2018` is a moderately sized dataset pulled from the VT clusters endpoint in January 2018.

In [None]:
# Read the VT cluster sample listing and report its row count.
vt_df = pd.read_csv("../datasets/vt_clusters_jan2018.csv")
print("Number of samples %s" % (vt_df.shape[0],))

In [None]:
vt_df.groupby(['vt_cluster']).size()

In [None]:
# Draw the evaluation subset with a fixed seed so that every cell derived from it gives the
# same result after Restart & Run All. The size is capped at the frame length because
# sampling without replacement raises when n exceeds the number of rows.
sample_vt_df = vt_df.sample(n=min(100, len(vt_df)), random_state=42)

In [None]:
# Create the rogers database handle and load stored samples by SHA-256 for each dataset.
# NOTE(review): load_samples presumably returns an indexable collection of sample objects
# (it is indexed with [0] / [1] below) — confirm ordering matches the requested hashes.
db = r.store.Database()

variant_seed_samples = db.load_samples(variant_df['sha256'])
variant_group_1_samples = db.load_samples(variant_group_1)
vt_seed_samples = db.load_samples(sample_vt_df['sha256'])

# Individual samples reused by the feature-inspection and single-query cells.
x = variant_seed_samples[0]
x1 = vt_seed_samples[0]
x2 = vt_seed_samples[1]

# Features

In [None]:
x1.features

In [None]:
x2.contextual_features()

In [None]:
def _init_and_load(index_name):
    """Create the index registered under ``index_name`` and call its ``load()``."""
    idx = r.index.init(index_name)
    idx.load()
    return idx


# One handle per index type, created and loaded in the same order as before.
hnsw_idx = _init_and_load('hnsw')
lsh_idx = _init_and_load('lsh_forest')
pruning_idx = _init_and_load('pruning_tree')
ssdeep_idx = _init_and_load('ctph')
brute_idx = _init_and_load('bruteforce')

# Vector Plots

Plot the PCA-projected vectors of the `variant2015` seed samples to understand cosine similarity.

In [None]:
# Vectorize the variant2015 seed samples with the hnsw index's transform, project them onto
# two principal components, and draw each sample as an arrow from the origin.
# NOTE(review): PCA mean-centres its input, so the angles between arrows only approximate
# the cosine similarity of the original vectors — confirm this is the intended illustration.
xs, _ = hnsw_idx.transform(variant_seed_samples)

pca = PCA(n_components=2)
xs_t = pca.fit_transform(xs)

# (U, V) are the arrow components; (X, Y) are the arrow tails, all at the origin.
U, V = xs_t[:, 0], xs_t[:, 1]
X, Y = np.zeros_like(U), np.zeros_like(V)

fig, ax = plt.subplots()
ax.quiver(X, Y, U, V, angles='xy', scale_units='xy', scale=1)
ax.set_xlim([-1, 1.1])
ax.set_ylim([-1, 1])
ax.set_title("PCA projection of variant2015 sample vectors")
ax.set_xlabel("Principal component 1")
ax.set_ylabel("Principal component 2")
plt.show()

# Index Usage

In [None]:
# Query the `lsh_forest` index with the single variant sample `x` (k=5), print the sample's
# known variant group, and plot the returned neighbour graph.
# NOTE(review): include_neighbors=False presumably limits the graph to direct hits — confirm.
neighbors = lsh_idx.query_samples([x], k=5, include_neighbors=False)
print("%s is in variant group %s" % (x.sha256, x.contextual_features()['variant_group']))
r.visualize.plt_neighbor_graph(neighbors)

In [None]:
# Query the `hnsw` index with the single variant sample `x` (k=5), print the sample's
# known variant group, and plot the returned neighbour graph.
# NOTE(review): include_neighbors=False presumably limits the graph to direct hits — confirm.
neighbors = hnsw_idx.query_samples([x], k=5, include_neighbors=False)
print("%s is in variant group %s" % (x.sha256, x.contextual_features()['variant_group']))
r.visualize.plt_neighbor_graph(neighbors)

In [None]:
# Query the `lsh_forest` index with every variant seed sample (k=5) and plot the neighbour graph.
neighbors = lsh_idx.query_samples(variant_seed_samples, k=5, include_neighbors=False)
r.visualize.plt_neighbor_graph(neighbors)

In [None]:
# Query the `pruning_tree` index with every variant seed sample (k=5, include_neighbors=True)
# and plot the neighbour graph.
neighbors = pruning_idx.query_samples(variant_seed_samples, k=5, include_neighbors=True)
r.visualize.plt_neighbor_graph(neighbors)

In [None]:
# Query the `hnsw` index with every variant seed sample (k=5, include_neighbors=True)
# and plot the neighbour graph.
neighbors = hnsw_idx.query_samples(variant_seed_samples, k=5, include_neighbors=True)
r.visualize.plt_neighbor_graph(neighbors)

In [None]:
# Query the `ctph` index with every variant seed sample (k=5, include_neighbors=True)
# and plot the neighbour graph without normalization.
# NOTE(review): normalize=False is presumably needed because ctph scores are on a different
# scale than the vector-index distances — confirm against plt_neighbor_graph.
neighbors = ssdeep_idx.query_samples(variant_seed_samples, k=5, include_neighbors=True)
r.visualize.plt_neighbor_graph(neighbors, normalize=False)

In [None]:
# Query the `ctph` index with the sampled VT cluster seed samples (k=5, include_neighbors=True)
# and plot the neighbour graph without normalization.
neighbors = ssdeep_idx.query_samples(vt_seed_samples, k=5, include_neighbors=True)
r.visualize.plt_neighbor_graph(neighbors, normalize=False)

# Experiment

Precision@k is the fraction of the top-$k$ retrieved documents that are relevant to the query.

$Precision@k = \frac{|\textrm{relevant} \ \cap \ \textrm{retrieved}_k|}{k}$

In [None]:
def precision_at_k_method(method, samples=None, ks=(5, 10, 50, 100), label_key='vt_cluster'):
    """Print and return the mean precision@k of an index over a set of query samples.

    For every ``k`` the index is queried once with all samples. A returned neighbour
    counts as relevant when its contextual features contain ``label_key`` with the same
    value (compared as strings) as the query sample. Each query's relevant count is
    divided by ``k`` itself, so a query that yields fewer than ``k`` neighbours scores lower.

    NOTE(review): if an index returns the query sample itself as a neighbour, that hit
    counts as relevant and inflates the score — confirm how ``query_samples`` behaves.

    Args:
        method: Index exposing ``name`` and ``query_samples(samples, k=...)``; each result
            must provide ``'query'`` and ``'neighbors'`` (pairs whose first item is a sample).
        samples: Query samples. Defaults to the notebook-level ``vt_seed_samples``.
        ks: Neighbourhood sizes to evaluate.
        label_key: Contextual-feature key that holds the ground-truth label.

    Returns:
        dict mapping each ``k`` to the mean precision over all queries, or ``nan`` when
        the index returned no results for that ``k``.

    Raises:
        KeyError: If a query sample has no ``label_key`` in its contextual features.
    """
    if samples is None:
        # Backward-compatible default: evaluate on the notebook's VT seed samples.
        samples = vt_seed_samples
    # Materialize once so generators survive being queried for several values of k.
    samples = list(samples)

    mean_precision = {}
    for k in ks:
        per_query = []
        # Hand the index a fresh list each time, as the original call did.
        for result in method.query_samples(list(samples), k=k):
            query_label = str(result['query'].contextual_features()[label_key])

            relevant = 0
            for neighbor, _ in result['neighbors']:
                context = neighbor.contextual_features()
                if label_key in context and str(context[label_key]) == query_label:
                    relevant += 1
            per_query.append(relevant / float(k))

        # np.mean([]) warns and yields nan; make the empty case explicit instead.
        mean_precision[k] = float(np.mean(per_query)) if per_query else float('nan')
        print(method.name, k, mean_precision[k])
    return mean_precision

In [None]:
# Run the precision@k evaluation once for each loaded index.
indexes_to_compare = (pruning_idx, hnsw_idx, lsh_idx, ssdeep_idx, brute_idx)
for method in indexes_to_compare:
    precision_at_k_method(method)