In [None]:
import json
from scipy import sparse
import numpy as np
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from tqdm import tqdm
from node2vec import Node2Vec
import networkx as nx
from itertools import cycle
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import matplotlib.pyplot as plt
from sklearn import svm
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
from plotly import tools
from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                              AdaBoostClassifier)
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load IPython's autoreload extension ...
%load_ext autoreload

# ... and enable mode 2, which re-imports all modules before each cell executes,
# so edits to project source files are picked up without restarting the kernel.
%autoreload 2

In [None]:
import sys

# Put the parent directory first on the import path so that the project's `src`
# package (imported further down as `src.features.w2v`) can be found from this
# notebook. The membership check keeps the cell idempotent: re-running it does
# not keep prepending duplicate '../' entries to sys.path.
if '../' not in sys.path:
    sys.path.insert(0, '../')

In [None]:
#!pip install gensim --user
#!pip install plotly --user

In [None]:
from src.features.w2v import word2vec

In [None]:
# Inputs handed to the word2vec wrapper constructed in the next cell.
# NOTE(review): matrix_path is an absolute path on a shared dataset volume, so the
# notebook only runs where that mount exists — consider making it configurable.
matrix_path = "/datasets/dsc180a-wi20-public/Malware/group_data/group_01/matrix"
# Corpus file location, relative to the notebook's working directory.
corpus_path = "../data/processed/ABPBA.cor"

In [None]:
%%time
# Build the project's word2vec wrapper from the two paths configured above and
# load its matrix. `model` is the object every later cell works with.
# NOTE(review): what load_matrix() reads and stores is defined in src.features.w2v,
# not in this notebook — presumably the matrix under matrix_path; confirm there.
model = word2vec(matrix_path, corpus_path)
model.load_matrix()

In [None]:
%%time
# Optional step, disabled by default: uncomment the call below to (re)generate the corpus.
# NOTE(review): the arguments ("APA", 10, 10) are not explained in this notebook, and
# corpus_path points at "ABPBA.cor" — check that the first argument matches the corpus
# you intend to build before re-enabling this.
#model.generate_corpus("APA", 10, 10)

In [None]:
%%time
# Build the embedding model on the wrapper object.
# NOTE(review): presumably trains on the corpus at corpus_path — confirm in src.features.w2v.
model.create_model()

In [None]:
%%time
# Compute embeddings on the wrapper object.
# NOTE(review): later cells read model.train_embeddings / model.test_embeddings and the
# matching *_labels attributes; presumably this call is what populates them — confirm.
model.predict_embeddings()

In [None]:
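# Disabled experiment: would store each test embedding in the model's word-vector
# table (model.model.wv) under the corresponding entry of model.test_labels.
# %%time
# for i in range(len(model.test_labels)):
#     model.model.wv[model.test_labels[i]] = model.test_embeddings[i]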

In [None]:
# Change train/predict -> generate embeddings
# Add another method for training the SVC and getting the data ready for plotting
# Try using gensim lockdown to train, generate the corpus and see how the embedding looks
# Try David's corpus text and plot it
# Plot the Hindroid classification on the same graph

# Find out which apps are misclassified by Hindroid: are they the same as the ones in the graph below that look benign?

# Try coloring the different types of benign apps and see how they relate to the different types of malware

# Try plotting the APIs corresponding to the malware that looks benign, to see which APIs play a big role

In [None]:
%%time
# Draw the embeddings using the wrapper's own plotting helper.
# NOTE(review): the figure type and styling are defined in src.features.w2v, not here.
model.plot_embeddings()

In [None]:
%%time
tsne = TSNE(n_components=2, random_state=0)

train_X = tsne.fit_transform(np.array(model.train_embeddings))
train_Y = np.array(model.train_labels)

test_X = tsne.fit_transform(np.array(model.test_embeddings))
test_Y = np.array(model.test_labels)

X = np.vstack((np.array(model.train_embeddings),np.array(model.test_embeddings)))
X = tsne.fit_transform(X)

Y = np.array(model.train_labels + model.test_labels)

In [None]:
%%time
h = .02  # step size in the mesh

C = 1.0  # SVM regularization parameter

# create a mesh to plot in
x_min, x_max = train_X[:, 0].min() - 1, train_X[:, 0].max() + 1
y_min, y_max = train_X[:, 1].min() - 1, train_X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

In [None]:
%%time
svc = svm.SVC(kernel='linear', C=C).fit(train_X, train_Y)

Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

y_ = np.arange(y_min, y_max, h)

fig = tools.make_subplots(rows=1, cols=2,
                          subplot_titles=("Random Forest (Depth = 4)",
                                          "Random Forest (Depth = 50)")
                         )

trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                  colorscale=[[0, '#8DE5F3'], [1, '#EA937A']],
                  showscale=True)

trace1 = go.Scatter(x=X.T[0].T, y=X.T[1].T, mode='markers', text=Y, 
                   marker=dict(size=5, color=Y))

fig.append_trace(trace, 1, 1)
fig.append_trace(trace1, 1, 1)

data = [trace, trace1]

init_notebook_mode(connected=True)
iplot(data, filename='word-embedding-plot')

In [None]:
%%time
svc = svm.SVC(kernel='linear', C=C).fit(train_X, train_Y)

Z = svc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

y_ = np.arange(y_min, y_max, h)

fig = tools.make_subplots(rows=1, cols=2,
                          subplot_titles=("Random Forest (Depth = 4)",
                                          "Random Forest (Depth = 50)")
                         )

trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                  colorscale=[[0, '#8DE5F3'], [1, '#EA937A']],
                  showscale=True)

trace1 = go.Scatter(x=X.T[0].T, y=X.T[1].T, mode='markers', text=Y, 
                   marker=dict(size=5, color=Y))

fig.append_trace(trace, 1, 1)
fig.append_trace(trace1, 1, 1)

data = [trace, trace1]

init_notebook_mode(connected=True)
iplot(data, filename='word-embedding-plot')

In [None]:
%%time
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(train_X, train_Y)

Z = rbf_svc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

y_ = np.arange(y_min, y_max, h)

fig = tools.make_subplots(rows=1, cols=2,
                          subplot_titles=("Random Forest (Depth = 4)",
                                          "Random Forest (Depth = 50)")
                         )

trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                  colorscale=[[0, 'green'], [1, 'red']],
                  showscale=True)

trace1 = go.Scatter(x=X.T[0].T, y=X.T[1].T, mode='markers', text=Y, 
                   marker=dict(size=5, color=Y))

fig.append_trace(trace, 1, 1)
fig.append_trace(trace1, 1, 1)

data = [trace, trace1]

init_notebook_mode(connected=True)
iplot(data, filename='word-embedding-plot')

In [None]:
%%time
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, Y)

In [None]:
%%time
lin_svc = svm.LinearSVC(C=C).fit(X, Y)

In [None]:
%%time

Z = lin_svc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

y_ = np.arange(y_min, y_max, h)

fig = tools.make_subplots(rows=1, cols=2,
                          subplot_titles=("Random Forest (Depth = 4)",
                                          "Random Forest (Depth = 50)")
                         )

trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                  colorscale=[[0, 'green'], [1, 'red']],
                  showscale=True)

trace1 = go.Scatter(x=X.T[0].T, y=X.T[1].T, mode='markers', text=Y, 
                   marker=dict(size=5, color=Y))

fig.append_trace(trace, 1, 1)
fig.append_trace(trace1, 1, 1)

data = [trace, trace1]

init_notebook_mode(connected=True)
iplot(data, filename='word-embedding-plot')

In [None]:
# Intentionally empty. A bare `%%time` with no cell body raises
# "UsageError: %%time is a cell magic, but the cell body is empty",
# which breaks Restart & Run All, so the magic has been removed.


In [None]:
%%time

Z = poly_svc.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)

y_ = np.arange(y_min, y_max, h)

fig = tools.make_subplots(rows=1, cols=2,
                          subplot_titles=("Random Forest (Depth = 4)",
                                          "Random Forest (Depth = 50)")
                         )

trace = go.Heatmap(x=xx[0], y=y_, z=Z,
                  colorscale=[[0, 'green'], [1, 'red']],
                  showscale=True)

trace1 = go.Scatter(x=X.T[0].T, y=X.T[1].T, mode='markers', text=Y, 
                   marker=dict(size=5, color=Y))

fig.append_trace(trace, 1, 1)
fig.append_trace(trace1, 1, 1)

data = [trace, trace1]

init_notebook_mode(connected=True)
iplot(data, filename='word-embedding-plot')