In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
from rogers.logger import init_logging, logging, get_logger

# Initialise logging at DEBUG level.
init_logging(level=logging.DEBUG)

# Logger handle registered under the name "rogers.notebook".
log = get_logger("rogers.notebook")

In [None]:
import numpy as np
import rogers as rgr
import rogers.config as cfg
import pandas as pd
import plotly
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone package and only fall back to the vendored copy
# on old installations where `joblib` is not importable on its own.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

from rogers.index.hnsw import Index as HNSW
from rogers.index.pdci import Index as PDCI

# Load project settings from ./config.ini.
cfg.configure("./config.ini")

# Database handle reused by the cells below.
db = rgr.store.Database()

# Enable plotly's offline notebook mode.
plotly.offline.init_notebook_mode()

%matplotlib inline

# Configuration

# Datasets

In [None]:
df = pd.read_csv("dataset.csv.gz")
# A bare expression in the middle of a cell is evaluated and thrown away; only
# the last expression is rendered. Print the per-source label counts explicitly
# so they are actually shown alongside the preview below.
print(df.groupby(['source'])['label'].count())
df.head(5)

Add label data from df as contextual features

In [None]:
# Hand df to feature_add with the 'CATEGORICAL' and 'CONTEXTUAL' tags.
# NOTE(review): the meaning of the two tags is defined by rgr.api.feature_add,
# which is not visible here.
rgr.api.feature_add(df, 'CATEGORICAL', 'CONTEXTUAL')

# Extract and Transform

Perform feature extraction on file samples

In [None]:
# Pass the dataset's sha256 values as the hash filter for extraction.
rgr.api.extract(filter_hashvals=df['sha256'].tolist())

Load samples, transform, and export

In [None]:
# Materialise the samples into a list; later cells index into it.
samples = list(db.get_samples())
pipeline = rgr.vectorizer.online_pe_pipeline()
# Fit the pipeline on the samples and keep the transformed output.
xs = pipeline.fit_transform(samples)
# Hashes are collected in the same order as `samples`.
hashvals = [s.sha256 for s in samples]
# Persist the hashes together with their vectors, and the fitted pipeline.
joblib.dump([hashvals, xs], 'data.pk.gz')
joblib.dump(pipeline, 'pipeline.pk.gz')

# Reload existing data and pipeline (uncomment to reuse the files written above).
# NOTE: joblib.load unpickles its input; only load files you produced yourself.
# hashvals, xs = joblib.load('data.pk.gz')
# pipeline = joblib.load('pipeline.pk.gz')

# Indexing

Fit the HNSW and PDCI indexes and persist them. Parameters were selected from a basic grid search using a 90/10 split of the dataset.

In [None]:
# Construct the HNSW index with the parameters chosen by the grid search
# mentioned in the text above.
hnsw_idx = HNSW(db=db, pipeline=pipeline, efConstruction=400, M=16)
# Fit on the vectors and their hashes, then persist the index.
hnsw_idx.fit(xs, hashvals)
hnsw_idx.save()
# Alternative to fitting: load a previously saved index.
# hnsw_idx.load()

In [None]:
# NOTE(review): `simple_indicies` and `composite_indices` spell "indices"
# differently; confirm both keyword names against the PDCI Index signature.
pdci_idx = PDCI(db=db, pipeline=pipeline, simple_indicies=20, composite_indices=2)
# Fit on the vectors and their hashes, then persist the index.
pdci_idx.fit(xs, hashvals)
pdci_idx.save()
# Alternative to fitting: load a previously saved index.
# pdci_idx.load()

# Visualization

Select a random sample and visualize it using plotly. The neighbor graph weights edges by similarity. Increase `k` to bring back more results. Setting `include_neighbors` to true also queries the neighbors returned by the initial query.

In [None]:
# Pick one sample uniformly at random. randint draws the index directly rather
# than materialising range(len(samples)) as an array just to choose one entry.
sample = samples[np.random.randint(len(samples))]

## hnsw

In [None]:
# Query the HNSW index for the chosen sample with k=5.
neighbors = hnsw_idx.query_samples([sample], k=5, include_neighbors=True)
print("%s has label  %s" % (sample.sha256, sample.contextual_features()['label']))
print()
# neighbors[0] is the result for the single query sample. Each entry is indexed
# positionally: [0] exposes .sha256, [1] is printed next to it.
# NOTE(review): [1] is presumably a similarity or distance value — confirm.
for ret in neighbors[0]['neighbors']:
    print( ret[0].sha256, ret[1])
rgr.visualize.plt_neighbor_graph(neighbors)

## pdci

Query parameters have been selected from a basic grid search. `d` is the intrinsic dimensionality of the samples and is used as a parameter for the worst-case bounds in `pdci`.

In [None]:
# Compute k_retrieve and k_visit from k and d using the index's omega_* helpers.
# NOTE(review): include_neighbors is True here but False in the query below —
# confirm the mismatch is intended.
k_retrieve = pdci_idx.index.omega_k_retrieve(k=5, d=8, include_neighbors=True)
k_visit = pdci_idx.index.omega_k_visit(k=5, d=8)

# Query the PDCI index for the chosen sample, passing the values computed above.
neighbors = pdci_idx.query_samples([sample], k=5, include_neighbors=False, k_retrieve=k_retrieve, k_visit=k_visit)
print("%s is in label  %s" % (sample.sha256, sample.contextual_features()['label']))
print()
# neighbors[0] is the result for the single query sample. Each entry is indexed
# positionally: [0] exposes .sha256, [1] is printed next to it.
for ret in neighbors[0]['neighbors']:
    print( ret[0].sha256, ret[1])
rgr.visualize.plt_neighbor_graph(neighbors)

# Xori Feature Extraction and Vectorization

In [None]:
# Work on at most 500 rows. Capping n at len(df) avoids the ValueError that
# DataFrame.sample raises (with the default replace=False) when asked for more
# rows than the frame contains; with 500 or more rows the behaviour is unchanged.
df = df.sample(n=min(500, len(df)))
# Row counts per (source, label) pair in the subsample.
df.groupby(['source', 'label'])['label'].count()

## Feature Extraction

Example mnemonics bag of words extraction

In [None]:
from rogers.sample import Sample
from rogers.sample.xori import Xori

# Xori.process returns a pair; only the second element (msg) is used here.
_, msg = Xori.process("var/samples/00/01/0D/97/00010D97E3B9BA14D1A1EB21197918A42DA58B1291B810A68FC7DC17D1BAF3A2")
# Rebuild a Sample from msg (this rebinds `sample`) and show its 'mnemonics'
# feature as the cell output.
sample = Sample.deserialize(msg)
sample.get('mnemonics')

Perform feature extraction on file samples

In [None]:
# rgr.api.extract(filter_hashvals=df['sha256'].tolist(), sample_class=Xori)

## Vectorization Pipeline

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from rogers.vectorizer.BaseVectorizer import BaseVectorizer
from rogers.vectorizer import SignatureVectorizer, HeaderVectorizer, SymImportsVectorizer, SymExportsVectorizer

class MnemonicVectorizer(BaseVectorizer):
    """Vectorizer that reads each sample's 'mnemonics' feature."""

    def explode(self, s):
        """ Pull the mnemonics mapping out of a sample for vectorization
        :param s: Sample instance
        :return: the value of ``s.get('mnemonics')`` when it is a dict,
                 otherwise an empty dict
        """
        found = s.get('mnemonics')
        # Anything that is not a dict (e.g. a missing feature) is treated as empty.
        if not isinstance(found, dict):
            return {}
        return found

# Steps of each FeatureUnion branch, declared in the order they appear in the
# union. Both TF-IDF branches use the same settings but get their own
# TfidfVectorizer instance.
signature_steps = [
    ('vectorizer', SignatureVectorizer(TfidfVectorizer(sublinear_tf=True, min_df=2, max_df=0.90))),
]
mnemonic_steps = [
    ('vectorizer', MnemonicVectorizer()),
    ('normalize', Normalizer()),
]
header_steps = [
    ('vectorizer', HeaderVectorizer()),
    ('normalize', Normalizer()),
]
sym_import_steps = [
    ('vectorizer', SymImportsVectorizer(TfidfVectorizer(sublinear_tf=True, min_df=2, max_df=0.90))),
    ('projection', TruncatedSVD(n_components=256)),
]

# Combine the four branches with a FeatureUnion.
feature_union = FeatureUnion(
    transformer_list=[
        ('signatures', Pipeline(signature_steps)),
        ('mnemonics', Pipeline(mnemonic_steps)),
        ('header', Pipeline(header_steps)),
        ('sym_imports', Pipeline(sym_import_steps)),
    ],
)

# Full pipeline: the union followed by a 128-component truncated SVD.
pipeline = Pipeline([
    ('vectorize', feature_union),
    ('projection', TruncatedSVD(n_components=128)),
])

In [None]:
# NOTE(review): get_samples() is called without a filter, so this is not
# restricted to the 500-row df subset taken earlier — confirm that is intended.
samples = list(db.get_samples())
# Fit the current `pipeline` on the samples and keep the transformed output.
xs = pipeline.fit_transform(samples)
# NOTE(review): hashvals is a numpy array here but a plain list in the earlier
# export cell — confirm the index's fit() accepts both.
hashvals = np.array([s.sha256 for s in samples])

In [None]:
## Indexing

In [None]:
# NOTE(review): `n_esimators` looks like a misspelling (of `n_estimators`?), and
# the earlier HNSW cell configures the index with efConstruction/M instead —
# confirm which keyword arguments HNSW actually accepts.
idx = HNSW(db=db, pipeline=pipeline, n_esimators=20)
# fit the index and save
idx.fit(xs, hashvals)
idx.save()
# Alternative to fitting: load a previously saved index.
# idx.load()

## Visualization

In [None]:
# Select a random sample. randint draws the index directly rather than
# materialising range(len(samples)) as an array just to choose one entry.
sample = samples[np.random.randint(len(samples))]

In [None]:
# Query the index for the chosen sample with k=10.
neighbors = idx.query_samples([sample], k=10, include_neighbors=True)
print("%s has label  %s" % (sample.sha256, sample.contextual_features()['label']))
print()
# neighbors[0] is the result for the single query sample. Each entry is indexed
# positionally: [0] exposes .sha256, [1] is printed next to it.
for ret in neighbors[0]['neighbors']:
    print( ret[0].sha256, ret[1])
rgr.visualize.plt_neighbor_graph(neighbors)