In [1]:
import os,sys,inspect
# Put the notebook's parent directory on sys.path so modules located there can
# be imported by later cells.
# The working directory is used instead of inspect.getfile(inspect.currentframe()):
# inside a notebook the frame's "file" is a synthetic per-cell name (newer
# ipykernel releases place it under a temp directory), so it does not reliably
# point at the folder that holds the notebook.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guarded so that re-running the cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)

In [2]:
import collections
import collections.abc
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from scipy import sparse
from tqdm import tqdm
%matplotlib inline

In [3]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix

In [4]:
import logging

# Configure the 'debug' logger to write INFO-and-above records to ./debug.log.
logger = logging.getLogger('debug')
# Close and detach handlers left over from an earlier run of this cell;
# otherwise every re-run adds another FileHandler and each record is written
# once per handler.
for stale_handler in list(logger.handlers):
    stale_handler.close()
    logger.removeHandler(stale_handler)
# mode='w' creates or truncates the file, so the log starts empty without an
# explicit os.remove (which raised FileNotFoundError when the file was absent).
hdlr = logging.FileHandler('./debug.log', mode='w')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.INFO)

In [5]:
# Load the three reduced training matrices from the pipeline_output directory.
path = '/datasets/dsc180a-wi20-public/Malware/group_data/group_01/pipeline_output'
A, B_tr, P_tr = (
    sparse.load_npz(os.path.join(path, fname))
    for fname in ('A_reduced_tr.npz', 'B_reduced_tr.npz', 'P_reduced_tr.npz')
)
# B and P are converted to CSR; A is kept in the format it was stored in.
B_tr = B_tr.tocsr()
P_tr = P_tr.tocsr()
# A_csr is an alias of the loaded matrix; A_csc is an independent CSC copy.
A_csr = A
A_csc = A.tocsc(copy=True)

In [6]:
# Display the sparse-matrix summary of A (shape, dtype, stored elements, format).
A

<1335x1000 sparse matrix of type '<class 'numpy.uint32'>'
	with 238038 stored elements in Compressed Sparse Row format>

In [7]:
# Inspect a single column (index 951) of A to see how many entries it stores.
A[:, 951]

<1335x1 sparse matrix of type '<class 'numpy.uint32'>'
	with 3 stored elements in Compressed Sparse Row format>

# Create metapath

In [8]:
import m2v
# autoreload mode 2 re-imports edited modules before each cell executes, so
# changes to m2v are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [9]:
# Instantiate m2v.Metapath2Vec with the three training matrices loaded above.
model = m2v.Metapath2Vec(A, B_tr, P_tr)

In [10]:
%%time
# test one iteration of metapath2vec
# os.remove('./debug.log')
logger = logging.getLogger('debug')
hdlr = logging.FileHandler('./debug.log', mode='w')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
hdlr.setFormatter(formatter)
logger.addHandler(hdlr) 
logger.setLevel(logging.INFO)

for i in range(10):
    metapaths = list('ABA')
    print(model.metapath2vec(metapaths, 65))
# print(path)

['app_65', 'api_49', 'api_144', 'app_534']
['app_65', 'api_151', 'api_72', 'app_703']
['app_65', 'api_121', 'api_507', 'app_298']
['app_65', 'api_140', 'api_177', 'app_1063']
['app_65', 'api_158', 'api_258', 'app_154']
['app_65', 'api_104', 'api_158', 'app_37']
['app_65', 'api_22', 'api_59', 'app_212']
['app_65', 'api_79', 'api_590', 'app_773']
['app_65', 'api_126', 'api_62', 'app_1010']
['app_65', 'api_189', 'api_400', 'app_655']
CPU times: user 9.18 ms, sys: 6.26 ms, total: 15.4 ms
Wall time: 12.7 ms


# Create Corpus

In [None]:
# Check that a sampled walk is iterable.
# The ABC is referenced through collections.abc: the collections.Iterable alias
# was removed in Python 3.10.
isinstance(model.metapath2vec(metapaths, 65), collections.abc.Iterable)

In [None]:
# Metapath patterns for which a walk corpus is generated in the next cell.
metas = ('AA','ABA', 'APA', 'ABPBA','APBPA')

In [None]:
# Build one corpus per metapath pattern, printing each pattern as a progress marker.
for meta in metas:
    print(meta)
    model.create_corpus(meta)

In [11]:
%%time
# Build the walk corpus for metapath 'ABPBPBBPA', passing '_tst' as the second argument.
# NOTE(review): `model` was constructed from the training matrices (A, B_tr, P_tr).
# Presumably create_corpus switches to test data when given '_tst' — confirm in
# m2v; otherwise this "test" corpus is sampled from the training graph.
model.create_corpus('ABPBPBBPA', '_tst')

CPU times: user 24min 4s, sys: 22.4 s, total: 24min 27s
Wall time: 24min 27s


In [11]:
%%time
# Build the walk corpus for metapath 'ABPBPBBPA' using create_corpus' default
# second argument. The recorded run of this cell took roughly 26.5 minutes.
model.create_corpus('ABPBPBBPA')

CPU times: user 26min 12s, sys: 24.4 s, total: 26min 36s
Wall time: 26min 37s


# Train word embeddings

unable to import 'smart_open.gcs', disabling that module


In [13]:
def check_tst_file(CORPUS_TEST, n_train=None):
    """Shift the app ids in a test corpus file so they follow the training ids.

    The file holds one walk per line with tokens separated by single spaces.
    The numeric suffix of the first token on the first line decides what
    happens:

    * below ``n_train``: the file is rewritten in place, adding ``n_train`` to
      the numeric suffix of every token that starts with ``app``;
    * otherwise: the file is assumed to be shifted already and is left as is.

    Parameters
    ----------
    CORPUS_TEST : str
        Path of the corpus file to check and, if needed, rewrite.
    n_train : int, optional
        Threshold and offset for the app ids. Defaults to ``A.shape[0]`` (the
        notebook-level matrix), which is 1335 for the data loaded here, the
        value that was previously hard-coded as the offset.

    Returns
    -------
    None
        Prints 'changing' when the file is rewritten and 'changed' when it is
        already shifted.
    """
    if n_train is None:
        n_train = A.shape[0]

    # Read everything up front; the context manager closes the handle before
    # the same path is reopened for writing.
    with open(CORPUS_TEST) as src:
        lines = src.readlines()

    first_tokens = lines[0].split() if lines else []
    if not first_tokens:
        # Nothing to inspect: an empty file (or empty first line) used to
        # surface as an IndexError.
        print('empty corpus, nothing to change')
        return

    first_app = int(first_tokens[0].split('_')[1])
    if first_app >= n_train:
        print('changed')
        return

    print('changing')
    walks = [
        [
            f"app_{int(node.split('_')[-1]) + n_train}"
            if node.startswith('app') else node
            for node in line.strip().split(' ')
        ]
        for line in lines
    ]

    with open(CORPUS_TEST, "w") as dst:
        dst.writelines(' '.join(walk) + '\n' for walk in walks)

In [20]:
def prediction(metapath):
    """Embed apps with Word2Vec over a metapath corpus and score an SVM on them.

    Steps, in order:

    1. Locate ``meta_<metapath>.cor`` and ``meta_<metapath>_tst.cor`` in the
       corpus directory and pass the test corpus through ``check_tst_file``.
    2. Train a Word2Vec model on the lines of both corpora together.
    3. Read ``meta_tr.csv`` / ``meta_tst.csv`` from the notebook-level ``path``
       and treat ``label == 'class1'`` as the positive class.
    4. Fit an RBF SVM on the training app vectors, then print train accuracy,
       test accuracy and the confusion-matrix counts (tn, fp, fn, tp).

    Parameters
    ----------
    metapath : str
        Metapath pattern used in the corpus file names, e.g. ``'ABPBPBBPA'``.

    Returns
    -------
    None
        Results are printed, not returned.

    Notes
    -----
    Uses the notebook-level names ``path``, ``check_tst_file``, ``tqdm``,
    ``pd``, ``np`` and ``confusion_matrix``.
    """
    # Named corpus_dir (not `fp`) so it cannot be confused with the
    # false-positive count unpacked from the confusion matrix below.
    corpus_dir = '/datasets/dsc180a-wi20-public/Malware/group_data/group_01/metapath_corpus'
    CORPUS = os.path.join(corpus_dir, 'meta_%s.cor' % metapath)
    CORPUS_TEST = os.path.join(corpus_dir, 'meta_%s_tst.cor' % metapath)
    check_tst_file(CORPUS_TEST)

    import gensim.models

    class MyCorpus(object):
        """A re-iterable corpus that yields sentences (lists of str)."""

        def __init__(self, CORPUS, CORPUS_TEST):
            # Training walks first, then test walks; both are kept in memory
            # because Word2Vec iterates over the corpus several times.
            self.lines = []
            for corpus_file in (CORPUS, CORPUS_TEST):
                with open(corpus_file) as handle:
                    self.lines += handle.readlines()

        def __iter__(self):
            for line in tqdm(self.lines):
                # One walk per line, tokens separated by single spaces.
                yield line.strip().split(' ')

    sentences = MyCorpus(CORPUS, CORPUS_TEST)
    # NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to
    # `vector_size`) — confirm against the installed version.
    model = gensim.models.Word2Vec(sentences=sentences, min_count=1, size=200, window=2)

    meta_tr = pd.read_csv(os.path.join(path, 'meta_tr.csv'), index_col=0)
    meta_tst = pd.read_csv(os.path.join(path, 'meta_tst.csv'), index_col=0)

    y_train = meta_tr.label == 'class1'
    y_test = meta_tst.label == 'class1'

    # Training apps are app_0 .. app_{n_train-1}; test apps follow directly.
    # NOTE(review): check_tst_file offsets test ids by A.shape[0] by default,
    # which is presumably equal to len(meta_tr) — confirm.
    n_train = len(meta_tr)
    n_test = len(meta_tst)
    app_vec = np.array([model.wv[f'app_{i}'] for i in range(n_train)])
    app_vec_tst = np.array([model.wv[f'app_{i}'] for i in range(n_train, n_train + n_test)])

    print('training')
    from sklearn.svm import SVC
    svm = SVC(kernel='rbf', C=10, gamma=0.1)
    svm.fit(app_vec, y_train)

    y_pred = svm.predict(app_vec_tst)
    print('train_acc: ', svm.score(app_vec, y_train), '\n')
    print('test_acc: ', svm.score(app_vec_tst, y_test), '\n')
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(tn, fp, fn, tp)
    
    

In [21]:
%%time
# Run the full embed-and-classify evaluation for metapath 'ABPBPBBPA'.
# NOTE(review): the recorded output shows train_acc 1.0 but test_acc ~0.498 with
# confusion counts (tn, fp, fn, tp) = (1, 670, 0, 664): almost every test app is
# predicted positive, so this run does not show that the embeddings generalise.
prediction('ABPBPBBPA')

changed


100%|██████████| 2670000/2670000 [00:06<00:00, 406782.27it/s]
100%|██████████| 2670000/2670000 [00:38<00:00, 69894.43it/s]
100%|██████████| 2670000/2670000 [00:36<00:00, 72246.86it/s]
100%|██████████| 2670000/2670000 [00:36<00:00, 72799.89it/s]
100%|██████████| 2670000/2670000 [00:36<00:00, 73759.11it/s]
100%|██████████| 2670000/2670000 [00:35<00:00, 74447.08it/s] 


training
train_acc:  1.0 

test_acc:  0.49812734082397003 

1 670 0 664
CPU times: user 8min, sys: 6.99 s, total: 8min 7s
Wall time: 3min 13s


In [None]:
# Load the training metadata that reduce_dimensions reads to label each app.
# NOTE(review): `path` repeats the value assigned in the data-loading cell, and
# this read omits index_col=0 (unlike prediction()), so positional column 1 is
# presumably the label column — confirm against the CSV.
path = '/datasets/dsc180a-wi20-public/Malware/group_data/group_01/pipeline_output'
label = pd.read_csv(os.path.join(path, 'meta_tr.csv'))

In [None]:
def reduce_dimensions(model, labels_df=None):
    """Project the app embeddings of ``model`` to two dimensions with t-SNE.

    Parameters
    ----------
    model : object
        Embedding model exposing ``model.wv.vocab`` (iterable of tokens) and
        ``model.wv[token]`` (vector lookup).
    labels_df : pandas.DataFrame, optional
        Frame whose row ``i``, positional column 1, holds the class name of
        ``app_i``. Defaults to the notebook-level ``label`` frame.

    Returns
    -------
    x_vals, y_vals : list
        t-SNE coordinates of each app, in vocabulary order.
    labels : numpy.ndarray
        1 where the app's class is ``'class1'``, else 0, in the same order.
    """
    if labels_df is None:
        labels_df = label

    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    vectors = []  # positions in vector space
    labels = []   # binary class of each kept app, aligned with `vectors`
    for word in model.wv.vocab:
        # Prefix test (as check_tst_file does) instead of a substring test, so
        # only tokens of the form app_<n> are parsed for an index.
        if not word.startswith('app_'):
            continue
        app_idx = int(word.split('_')[1])
        # NOTE(review): an app index beyond the rows of labels_df (e.g. shifted
        # test-app ids) raises IndexError here — confirm which apps the model
        # passed in contains.
        labels.append(1 if labels_df.iloc[app_idx, 1] == 'class1' else 0)
        vectors.append(model.wv[word])

    # convert both lists into numpy arrays for reduction
    vectors = np.asarray(vectors)
    labels = np.asarray(labels)

    # reduce using t-SNE; fixed random_state keeps the layout reproducible
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    embedded = tsne.fit_transform(vectors)

    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels


def plot_with_plotly(x_vals, y_vals, labels, metapath, plot_in_notebook=True):
    """Draw a scatter plot of 2-D app embeddings, coloured by label.

    Parameters
    ----------
    x_vals, y_vals : sequence of float
        Coordinates of each point.
    labels : sequence
        Per-point values used both as hover text and as marker colour.
    metapath : str
        Metapath name appended to the figure title.
    plot_in_notebook : bool, default True
        When True the figure is rendered with ``fig.show()``. When False it is
        written to ``word-embedding-plot.html`` via ``plotly.offline.plot``,
        the behaviour the previously commented-out code described; the flag
        used to be ignored.

    Returns
    -------
    None
    """
    from plotly.offline import plot
    import plotly.graph_objs as go

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x_vals, y=y_vals, mode='markers',
                             text=labels, marker=dict(size=5, color=labels)))

    fig.update_layout(
        title="Scatter graph of metapath: " + metapath,
        font=dict(
            family="Courier New, monospace",
            size=18,
            color="#7f7f7f"
        )
    )

    if plot_in_notebook:
        fig.show()
    else:
        plot(fig, filename='word-embedding-plot.html')


In [None]:
# Reduce the app embeddings to 2-D and plot them under the title for metapath 'AA'.
# NOTE(review): at notebook level `model` is the m2v.Metapath2Vec object; the
# Word2Vec model trained in prediction() is local to that function.
# reduce_dimensions reads model.wv, so confirm an embedding model is in scope
# before running this cell.
x_vals, y_vals, labels = reduce_dimensions(model)
plot_with_plotly(x_vals, y_vals, labels, 'AA')

# Prediction