# url2vec

In [1]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))

from sklearn import metrics
from hdbscan import HDBSCAN
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from gensim.models import Word2Vec
from url2vec.util.metrics import *
from url2vec.util.plotter import *
from url2vec.model.urlembed import *
from url2vec.util.seqmanager import *

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

### loading files

In [2]:
# Locate the no-constraint crawl of cs.illinois.edu relative to the notebook's
# working directory and derive the three input files that live inside it.
nocostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_NoConstraint.words1000.depth10/"
vertex_path      = nocostraint_path + "vertex.txt"
map_path         = nocostraint_path + "urlsMap.txt"
sequences        = nocostraint_path + "sequenceIDs.txt"

# Load each file through the helpers pulled in by the wildcard imports above.
codecontent_map  = get_content_map(vertex_path)
codeurl_map      = get_urlmap(map_path)
seq              = get_sequences(sequences)

# One integer ground-truth label per key of codeurl_map, in its iteration order.
gt = GroundTruth()
ground_truth = [
    int(gt.get_groundtruth(codeurl_map[code]))
    for code in codeurl_map
]

# Model instance that the next cells train and evaluate.
url2vec = Url2Vec(codeurl_map)

In [4]:
# Train on the loaded sequences and page contents, keeping whatever train()
# returns as the predicted labels (presumably one cluster id per URL -- confirm
# against Url2Vec.train), then report how many labels came back.
labels = url2vec.train(
    codecontent_map=codecontent_map,
    sequences_list=seq
)
print(len(labels))

728


In [5]:
# Tabulate the result of url2vec.test(): rows are labelled with the distinct
# ground-truth classes, columns with the distinct predicted labels.
#
# The distinct labels are materialised as lists because a raw `set` is not a
# valid axis: recent pandas releases raise ValueError ("index cannot be a
# set"). list(set(...)) keeps exactly the iteration order the set had, so the
# row/column labelling is unchanged on versions that accepted a set.
# NOTE(review): the labelling is only correct if url2vec.test() orders its
# rows/columns the same way -- confirm against its implementation.
df = pd.DataFrame(
    url2vec.test(ground_truth),
    index=list(set(ground_truth)),
    columns=list(set(labels))
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,-1
0,0,0,0,0,0,0,17,0,0,0,1
1,0,0,0,0,0,0,8,0,0,0,4
2,0,0,0,0,0,0,0,0,47,0,0
3,0,0,0,0,0,0,0,0,0,27,2
4,0,0,0,0,0,8,0,0,0,0,6
6,0,0,0,0,0,0,0,0,0,0,6
8,0,0,0,65,0,0,0,0,0,0,10
10,0,0,0,0,0,289,0,0,0,0,36
11,0,0,0,0,0,3,0,0,0,0,4
12,0,0,0,16,0,1,0,0,0,0,2


In [8]:
# Second dataset load; every name carries the `_lc` suffix so it does not
# clobber the no-constraint inputs loaded earlier.
# NOTE(review): this path is character-for-character the same as the
# NoConstraint directory used above. The `_lc` / "listcostraint" naming
# suggests a list-constraint dataset was intended -- confirm the directory
# name and update the literal below.
listcostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_NoConstraint.words1000.depth10/"
vertex_path_lc      = listcostraint_path + "vertex.txt"
codecontent_map_lc  = get_content_map(vertex_path_lc)

map_path_lc         = listcostraint_path + "urlsMap.txt"
codeurl_map_lc      = get_urlmap(map_path_lc)

sequences_lc        = listcostraint_path + "sequenceIDs.txt"
seq_lc              = get_sequences(sequences_lc)


# Build the ground truth and the model from the map loaded in THIS cell.
# The original used `codeurl_map` (the first dataset's map) here, which left
# `codeurl_map_lc` loaded but never used.
gt = GroundTruth()
ground_truth = [int(gt.get_groundtruth(codeurl_map_lc[code])) for code in codeurl_map_lc]

url2vecList = Url2Vec(codeurl_map_lc)

In [None]:
# Re-read both inputs from disk, then train with an explicit 15-cluster KMeans
# and keep the returned memberships in k_memb.
# NOTE(review): `u2v` is not created in any cell visible in this export; it
# must already exist in the kernel before this cell runs.
codecontent_map = get_content_map(vertex_path)
seq = get_sequences(sequences)

k_memb = u2v.train(
    algorithm=KMeans(n_clusters=15),
    codecontent_map=codecontent_map,
    sequences_list=seq
)

In [None]:
# Train a second model without passing `algorithm`, on a fresh read of the
# sequence file; the returned memberships are kept in hd_memb.
# NOTE(review): `u2v2` is not created in any cell visible in this export.
seq2 = get_sequences(sequences)
hd_memb = u2v2.train(
    codecontent_map=codecontent_map,
    sequences_list=seq2
)

In [None]:
# Rebuild the ground truth and tabulate it against u2v's predicted labels.
# NOTE(review): `rmfile` is not defined in any cell visible in this export,
# and the earlier cells call GroundTruth() with no argument -- confirm which
# form is intended.
gt = GroundTruth(rmfile)
ground_truth = [int(gt.get_groundtruth(codeurl_map[code])) for code in codeurl_map]

confusion_table = u2v.test(ground_truth)
# Axis labels are passed as lists: recent pandas releases raise ValueError when
# index/columns is a raw set. list(set(...)) preserves the set's own iteration
# order, so the labelling matches what a set produced on older versions.
pd.DataFrame(
    confusion_table,
    index=list(set(ground_truth)),
    columns=list(set(u2v.labels_))
)

In [None]:
# Tabulate the ground truth against u2v2's predicted labels.
confusion_table_hdbscan = u2v2.test(ground_truth)
# Axis labels are passed as lists: recent pandas releases raise ValueError when
# index/columns is a raw set. list(set(...)) preserves the set's own iteration
# order, so the labelling matches what a set produced on older versions.
pd.DataFrame(
    confusion_table_hdbscan,
    index=list(set(ground_truth)),
    columns=list(set(u2v2.labels_))
)

In [None]:
# Predicted labels fed to the metric cells below.
# k_memb entries are cast to int. The result is wrapped in list() because on
# Python 3 a bare map() is a one-shot iterator, while labels_pred_k is consumed
# by four separate metric calls; on Python 2 this yields the same list as before.
labels_pred_k = list(map(int, k_memb))
labels_pred_h = hd_memb

In [None]:
# Clustering quality of the KMeans memberships against the ground truth.
# Written as print(<single string>) so the cell runs on both Python 2 and 3;
# the old `print "label", value` statement is a SyntaxError on Python 3.
# No separator is added between label and value: the Python 2 statement did
# not emit one either, because each label ends in a tab.
print("Homogeneity:\t\t%s" % metrics.homogeneity_score(ground_truth, labels_pred_k))
print("Completeness:\t\t%s" % metrics.completeness_score(ground_truth, labels_pred_k))
print("V Measure Score:\t%s" % metrics.v_measure_score(ground_truth, labels_pred_k))
print("adjusted rand score:\t%s" % metrics.adjusted_rand_score(ground_truth, labels_pred_k))

In [None]:
# Clustering quality of the hd_memb memberships against the ground truth.
# Written as print(<single string>) so the cell runs on both Python 2 and 3;
# the old `print "label", value` statement is a SyntaxError on Python 3.
# No separator is added between label and value: the Python 2 statement did
# not emit one either, because each label ends in a tab.
print("Homogeneity:\t\t%s" % metrics.homogeneity_score(ground_truth, labels_pred_h))  # best 1
print("Completeness:\t\t%s" % metrics.completeness_score(ground_truth, labels_pred_h))  # best 1
print("V Measure Score:\t%s" % metrics.v_measure_score(ground_truth, labels_pred_h))  # best 1
print("adjusted rand score:\t%s" % metrics.adjusted_rand_score(ground_truth, labels_pred_h))  # best 1