In [None]:
import json
from scipy import sparse
import numpy as np
from gensim.test.utils import datapath
from gensim import utils
import gensim.models
from tqdm import tqdm
from node2vec import Node2Vec
import networkx as nx
from itertools import cycle
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction

In [None]:
#!pip install gensim --user
#!pip install plotly --user

In [None]:
# Read the data parameters; `with` closes the file handle as soon as it is parsed.
with open('../config/data-params.json') as config_file:
    config = json.load(config_file)

# NOTE(review): presumably the configured path is relative to the project root
# while this notebook runs one directory below it, hence the extra leading '.'.
config['master_matrix'] = '.' + config['master_matrix']


In [None]:
%%time

# All matrices are read from the directory named by config['master_matrix'].
# train_A is kept in both CSR (row slicing) and CSC (column slicing) form.
train_A = sparse.load_npz(f"{config['master_matrix']}/train_A.npz").tocsr()
train_B = sparse.load_npz(f"{config['master_matrix']}/train_B.npz").tocsc()
train_P = sparse.load_npz(f"{config['master_matrix']}/train_P.npz").tocsc()
train_A_csc = train_A.tocsc()

test_A = sparse.load_npz(f"{config['master_matrix']}/test_A.npz").tocsr()

In [None]:
# Number of non-zero entries in train_A, train_B and train_P, one per line.
print(*(matrix.count_nonzero() for matrix in (train_A, train_B, train_P)), sep='\n')

In [None]:
# NOTE(review): the literals look like the non-zero counts printed by the
# previous cell (A + B/2 + P/2) — TODO confirm and derive them from the
# matrices instead of hard-coding.
7851572 + 36949323/2 + 30264149/2

In [None]:
# ABA calculation
# NOTE(review): the first line presumably uses the printed non-zero counts of
# train_A and train_B (A + B/2) — TODO confirm.
print(7851572 + 36949323/2)
# 2000 walks x walk_length 30000, the sizes used when ABA_long.cor is generated.
print(2000*30000)

In [None]:
# NOTE(review): presumably walks x walk_length for another corpus size; no
# visible cell uses a walk_length of 50000 — TODO confirm or remove.
2000*50000

In [None]:
# Ratio of the two values printed by the "ABA calculation" cell:
# 60000000 == 2000*30000 and 26326233.5 == 7851572 + 36949323/2.
60000000/26326233.5

In [None]:
%%time
# A . P . A^T against the training apps: train rows first, then test rows.
train_APA_T = train_A @ train_P @ train_A.T
test_APA_T = test_A @ train_P @ train_A.T

In [None]:
# ABPBA
def generate_corpus(walk_length=5000):
    """Endlessly yield random ABPBA walks as space-separated token strings.

    Every walk starts at a uniformly sampled row index of ``train_A`` and
    repeats one ABPBA step up to ``walk_length`` times. A step samples, in
    order: a non-zero column of the app's row in ``train_A``, then a non-zero
    row of that column in ``train_B``, ``train_P``, ``train_B`` and finally
    ``train_A_csc`` (which gives the next app).

    Args:
        walk_length: maximum number of ABPBA steps per walk.

    Yields:
        str: ``'app_<i>'`` followed by ``' api_<a> api_<b> api_<c> api_<d>
        app_<j>'`` for every completed step. A walk that reaches a node with
        no candidates ends early (its incomplete step is dropped) instead of
        letting ``np.random.choice`` raise ``ValueError`` on an empty array.
    """
    # Hops of one ABPBA step, in order: (token prefix, candidate lookup).
    hops = (
        ('api', lambda app: np.nonzero(train_A[app])[1]),
        ('api', lambda api: np.nonzero(train_B[:, api])[0]),
        ('api', lambda api: np.nonzero(train_P[:, api])[0]),
        ('api', lambda api: np.nonzero(train_B[:, api])[0]),
        ('app', lambda api: np.nonzero(train_A_csc[:, api])[0]),
    )

    def sample_step(app):
        """Return (tokens, next_app) for one full step, or None at a dead end."""
        node, step_tokens = app, []
        for prefix, neighbours in hops:
            candidates = neighbours(node)
            if len(candidates) == 0:
                return None
            node = np.random.choice(candidates)
            step_tokens.append(f'{prefix}_{node}')
        return step_tokens, node

    while True:
        app = np.random.choice(np.arange(train_A.shape[0]))
        # Collect tokens in a list and join once instead of growing a string.
        tokens = [f'app_{app}']

        for _ in range(walk_length):
            step = sample_step(app)
            if step is None:
                break
            step_tokens, app = step
            tokens.extend(step_tokens)

        yield ' '.join(tokens)

In [None]:
#ABA
def generate_corpus_ABA(walk_length=5000):
    """Endlessly yield random ABA walks as space-separated token strings.

    Every walk starts at a uniformly sampled row index of ``train_A`` and
    repeats one ABA step up to ``walk_length`` times. A step samples a
    non-zero column of the app's row in ``train_A``, then a non-zero row of
    that column in ``train_B``, then a non-zero row of that column in
    ``train_A_csc`` (which gives the next app).

    Args:
        walk_length: maximum number of ABA steps per walk.

    Yields:
        str: ``'app_<i>'`` followed by ``' api_<a> api_<b> app_<j>'`` for
        every completed step. A walk that reaches a node with no candidates
        ends early (its incomplete step is dropped) instead of letting
        ``np.random.choice`` raise ``ValueError`` on an empty array.
    """
    # Hops of one ABA step, in order: (token prefix, candidate lookup).
    hops = (
        ('api', lambda app: np.nonzero(train_A[app])[1]),
        ('api', lambda api: np.nonzero(train_B[:, api])[0]),
        ('app', lambda api: np.nonzero(train_A_csc[:, api])[0]),
    )

    def sample_step(app):
        """Return (tokens, next_app) for one full step, or None at a dead end."""
        node, step_tokens = app, []
        for prefix, neighbours in hops:
            candidates = neighbours(node)
            if len(candidates) == 0:
                return None
            node = np.random.choice(candidates)
            step_tokens.append(f'{prefix}_{node}')
        return step_tokens, node

    while True:
        app = np.random.choice(np.arange(train_A.shape[0]))
        # Collect tokens in a list and join once instead of growing a string.
        tokens = [f'app_{app}']

        for _ in range(walk_length):
            step = sample_step(app)
            if step is None:
                break
            step_tokens, app = step
            tokens.extend(step_tokens)

        yield ' '.join(tokens)

In [None]:
# APA
def generate_corpus_APA(walk_length=5000):
    """Endlessly yield random APA walks as space-separated token strings.

    Every walk starts at a uniformly sampled row index of ``train_A`` and
    repeats one APA step up to ``walk_length`` times. A step samples a
    non-zero column of the app's row in ``train_A``, then a non-zero row of
    that column in ``train_P``, then a non-zero row of that column in
    ``train_A_csc`` (which gives the next app).

    Args:
        walk_length: maximum number of APA steps per walk.

    Yields:
        str: ``'app_<i>'`` followed by ``' api_<a> api_<p> app_<j>'`` for
        every completed step. A walk that reaches a node with no candidates
        ends early (its incomplete step is dropped) instead of letting
        ``np.random.choice`` raise ``ValueError`` on an empty array.
    """
    # Hops of one APA step, in order: (token prefix, candidate lookup).
    hops = (
        ('api', lambda app: np.nonzero(train_A[app])[1]),
        ('api', lambda api: np.nonzero(train_P[:, api])[0]),
        ('app', lambda api: np.nonzero(train_A_csc[:, api])[0]),
    )

    def sample_step(app):
        """Return (tokens, next_app) for one full step, or None at a dead end."""
        node, step_tokens = app, []
        for prefix, neighbours in hops:
            candidates = neighbours(node)
            if len(candidates) == 0:
                return None
            node = np.random.choice(candidates)
            step_tokens.append(f'{prefix}_{node}')
        return step_tokens, node

    while True:
        app = np.random.choice(np.arange(train_A.shape[0]))
        # Collect tokens in a list and join once instead of growing a string.
        tokens = [f'app_{app}']

        for _ in range(walk_length):
            step = sample_step(app)
            if step is None:
                break
            step_tokens, app = step
            tokens.extend(step_tokens)

        yield ' '.join(tokens)

In [None]:
# APBPA

In [None]:
# ABPBA
def generate_corpus_limit_length():
    """Endlessly yield single-step ABPBA walks of exactly six tokens.

    Each walk samples, in order: a row index of ``train_A``, a non-zero
    column of that row, then a non-zero row of the current column in
    ``train_B``, ``train_P``, ``train_B`` and ``train_A_csc``.

    Yields:
        str: ``'app_<i> api_<a> api_<b> api_<p> api_<c> app_<j>'``.
    """
    def draw(candidates):
        # Uniform pick among the candidate indices.
        return np.random.choice(candidates)

    while True:
        start_app = draw(np.arange(train_A.shape[0]))
        first_api = draw(np.nonzero(train_A[start_app])[1])
        block_in = draw(np.nonzero(train_B[:, first_api])[0])
        package_api = draw(np.nonzero(train_P[:, block_in])[0])
        block_out = draw(np.nonzero(train_B[:, package_api])[0])
        end_app = draw(np.nonzero(train_A_csc[:, block_out])[0])

        tokens = (
            f'app_{start_app}',
            f'api_{first_api}',
            f'api_{block_in}',
            f'api_{package_api}',
            f'api_{block_out}',
            f'app_{end_app}',
        )
        yield ' '.join(tokens)
        

In [None]:
# Bind an ABA walk generator (default walk_length). The following cell rebinds
# the same name, so this binding only survives if that cell is skipped.
corpus_function = generate_corpus_ABA()

In [None]:
# Replaces the ABA generator bound in the previous cell with an APA generator
# (default walk_length).
corpus_function = generate_corpus_APA()

In [None]:
# Preview one walk. This consumes it from the generator, and with the default
# walk_length of 5000 the displayed string is very long.
next(corpus_function)

In [None]:
%%time
# Create the ABA generator here so the file content does not depend on which
# `corpus_function = ...` cell happened to run last.
corpus_function = generate_corpus_ABA()
# `with` closes the file even if walk generation raises part-way through.
with open('ABA.cor', 'w') as corpus_file:
    for _ in tqdm(range(1000)):
        corpus_file.write(next(corpus_function) + '\n')

In [None]:
%%time
# Create the APA generator here so the file content does not depend on which
# `corpus_function = ...` cell happened to run last.
corpus_function = generate_corpus_APA()
# `with` closes the file even if walk generation raises part-way through.
with open('APA.cor', 'w') as corpus_file:
    for _ in tqdm(range(1000)):
        corpus_file.write(next(corpus_function) + '\n')

In [None]:
# NOTE(review): no visible cell writes longcorpus.cor (the notebook writes
# ABA.cor, APA.cor and ABA_long.cor) — confirm the file exists before running.
!head longcorpus.cor

In [None]:
class MyCorpus(object):
    """A re-iterable corpus that yields sentences (lists of str) from a file.

    The file is reopened on every ``__iter__`` call, so the same object can be
    iterated several times (gensim's Word2Vec makes more than one pass over
    its ``sentences`` argument).
    """

    def __init__(self, corpus_path):
        """Store the path of the corpus file; nothing is opened yet."""
        self.corpus_path = corpus_path

    def __iter__(self):
        # `with` closes the handle once the file is exhausted or the generator
        # is discarded, instead of leaving that to garbage collection.
        with open(self.corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by a single space
                yield line.strip().split(' ')

In [None]:
%%time
# MyCorpus requires the corpus file path; calling it with no argument raises
# TypeError.
# NOTE(review): './APA.cor' is the corpus written most recently above —
# confirm this is the intended training input.
sentences = MyCorpus('./APA.cor')
model = gensim.models.Word2Vec(sentences=sentences, size=100)

In [None]:
# Show the first sentence (token list) of the corpus.
next(iter(sentences))

In [None]:
# Similarity score between the embeddings of two app tokens.
# NOTE(review): the ids 101 and 400 are hard-coded; both tokens must be in the
# model vocabulary — TODO confirm.
model.wv.similarity('app_101', 'app_400')

In [None]:
# Ten tokens whose embeddings are most similar to that of 'app_334'.
print(model.wv.most_similar(positive=['app_334'], topn=10))

In [None]:
# Skip-gram model (sg=1) without hierarchical softmax (hs=0) that keeps every
# token (min_count=1) and records its training loss.
# NOTE(review): per the gensim docs, seed alone does not make training
# reproducible unless workers=1 is also set — confirm whether that matters here.
model_with_loss = gensim.models.Word2Vec(
    sentences=sentences,
    sg=1,
    hs=0,
    min_count=1,
    seed=42,
    compute_loss=True,
)

# getting the training loss value
training_loss = model_with_loss.get_latest_training_loss()
print(training_loss)

In [None]:
# Display the learned embedding vector of the token 'app_100'.
model.wv['app_100']

In [None]:
def reduce_dimensions(model, label_threshold=332):
    """Project the app embeddings of ``model`` to 2-D with t-SNE.

    Only vocabulary words containing ``'app'`` are used; the number after the
    first underscore of each word decides its label.

    Args:
        model: trained Word2Vec model; ``model.wv.vocab`` and
            ``model.wv[word]`` are read.
        label_threshold: apps whose numeric id is greater than this value get
            label 1, all others label 0.
            NOTE(review): the default 332 is presumably the boundary between
            two groups of apps — TODO confirm.

    Returns:
        tuple: ``(x_vals, y_vals, labels)`` where ``x_vals`` and ``y_vals``
        are lists holding the two t-SNE coordinates of every app and
        ``labels`` is a numpy array of 0/1 labels in the same order.
    """
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    app_words = [word for word in model.wv.vocab if 'app' in word]

    # convert both sequences into numpy arrays for reduction
    vectors = np.asarray([model.wv[word] for word in app_words])
    labels = np.asarray(
        [1 if int(word.split('_')[1]) > label_threshold else 0 for word in app_words]
    )

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = list(vectors[:, 0])
    y_vals = list(vectors[:, 1])
    return x_vals, y_vals, labels



def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the points as a plotly marker scatter, coloured by ``labels``.

    Args:
        x_vals: x coordinates of the points.
        y_vals: y coordinates of the points.
        labels: per-point values used both as hover text and as marker colour.
        plot_in_notebook: when true, render inline with ``iplot``; otherwise
            write 'word-embedding-plot.html' with ``plot``.
    """
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    marker_style = dict(size=5, color=labels)
    scatter = go.Scatter(x=x_vals, y=y_vals, mode='markers', text=labels, marker=marker_style)
    figure_data = [scatter]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, num_annotations=25):
    """Scatter-plot the points with matplotlib and annotate a random subset.

    Args:
        x_vals: x coordinates of the points.
        y_vals: y coordinates of the points.
        labels: per-point labels; a random subset is drawn next to its point.
        num_annotations: maximum number of points to annotate. Fewer are
            annotated when there are fewer points, because ``random.sample``
            raises ``ValueError`` if asked for more items than exist.
    """
    import matplotlib.pyplot as plt
    import random

    # Fixed seed so the same points are annotated on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label a random subsample of at most `num_annotations` data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, min(num_annotations, len(indices)))
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))
    plt.show()

# Choose the plotting backend: if calling get_ipython() raises (e.g. the name
# is undefined outside IPython), fall back to matplotlib; otherwise use plotly.
# NOTE(review): the visible cells call plot_with_plotly directly and never use
# plot_function.
try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly



In [None]:
class MyCorpus(object):
    """A re-iterable corpus that yields sentences (lists of str) from a file.

    The file is reopened on every ``__iter__`` call, so the same object can be
    iterated several times (gensim's Word2Vec makes more than one pass over
    its ``sentences`` argument).
    """

    def __init__(self, corpus_path):
        """Store the path of the corpus file; nothing is opened yet."""
        self.corpus_path = corpus_path

    def __iter__(self):
        # `with` closes the handle once the file is exhausted or the generator
        # is discarded, instead of leaving that to garbage collection.
        with open(self.corpus_path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by a single space
                yield line.strip().split(' ')

In [None]:
# Train Word2Vec (default hyper-parameters) on the ABPBA corpus.
# NOTE(review): no visible cell writes ./ABPBA.cor — confirm the file exists.
sentences = MyCorpus('./ABPBA.cor')
model = gensim.models.Word2Vec(sentences=sentences)

In [7]:
# 2-D t-SNE projection of the app embeddings of whichever `model` was trained
# last, plotted inline and coloured by label.
x_vals, y_vals, labels = reduce_dimensions(model)

plot_with_plotly(x_vals, y_vals, labels)


In [None]:
# Train Word2Vec (default hyper-parameters) on the ABA corpus; this rebinds
# `sentences` and `model`.
sentences = MyCorpus('./ABA.cor')
model = gensim.models.Word2Vec(sentences=sentences)

In [9]:
# 2-D t-SNE projection of the app embeddings of whichever `model` was trained
# last, plotted inline and coloured by label.
x_vals, y_vals, labels = reduce_dimensions(model)

plot_with_plotly(x_vals, y_vals, labels)

In [None]:
%%time
# Large ABA corpus: 2000 walks of up to 30000 steps each.
corpus_function = generate_corpus_ABA(walk_length=30000)
# `with` closes the file even if walk generation raises part-way through.
with open('ABA_long.cor', 'w') as corpus_file:
    for _ in tqdm(range(2000)):
        corpus_file.write(next(corpus_function) + '\n')

In [None]:
%%time
# Train Word2Vec (default hyper-parameters) on the APA corpus; this rebinds
# `sentences` and `model`.
sentences = MyCorpus('./APA.cor')
model = gensim.models.Word2Vec(sentences=sentences)

In [11]:
# 2-D t-SNE projection of the app embeddings of whichever `model` was trained
# last, plotted inline and coloured by label.
x_vals, y_vals, labels = reduce_dimensions(model)

plot_with_plotly(x_vals, y_vals, labels)

In [None]:
# FNN that takes in the list of embedding vectors of all APIs present in an
# application and outputs the embedding of the application