# url2vec

In [1]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))

#from sklearn import metrics
from hdbscan import HDBSCAN
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
#from gensim.models import Word2Vec
#from url2vec.util.metrics import *
from url2vec.util.plotter import *
from url2vec.model.urlembed import *
from url2vec.util.seqmanager import *

#from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import TfidfVectorizer

### Loading files

In [None]:
# Directory holding the "NoConstraint" crawl, relative to the current
# working directory of the notebook.
nocostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_NoConstraint.words1000.depth10/"

# Input files inside that directory.
vertex_path    = nocostraint_path + "vertex.txt"
map_path       = nocostraint_path + "urlsMap.txt"
sequences_path = nocostraint_path + "sequenceIDs.txt"

# Parse each file (helpers presumably come from the star imports at the top
# of the notebook).
codecontent_map = get_content_map(vertex_path)
codeurl_map     = get_urlmap(map_path)
sequences       = get_sequences(sequences_path)

# One integer ground-truth label per key of codeurl_map, in iteration order.
gt = GroundTruth()
ground_truth = [
    int(gt.get_groundtruth(codeurl_map[code]))
    for code in codeurl_map
]

In [None]:
# Build the model from the map returned by get_urlmap() in the loading cell
# (presumably code -> URL; confirm against get_urlmap).
model = Url2Vec(codeurl_map)

# print [codeurl_map[i] for i in codeurl_map]

In [None]:
# Minimal hand-made example: four URLs, two navigation sequences built from
# them, and four short texts.
urls = [
    "https://cs.illinois.edu/news/professor-josep-torrellas-receives-darpa-perfect-award/",
    "https://cs.illinois.edu/news/cs-illinois-has-strong-showing-sc13/",
    "https://cs.illinois.edu/news/torrellas-co-leads-workshop-popular-parallel-programming/",
    "https://cs.illinois.edu/prospective-students/undergraduates/undergraduate-scholarships-and-awards/state-farm-computer-scienc/",
]

# Each sequence is a list of URLs; every entry is one of the items in `urls`.
seq = [
    [urls[3], urls[2], urls[3]],
    [urls[2], urls[1], urls[0]],
]

# NOTE(review): presumably aligned with `urls` by position — confirm how
# Url2Vec.train() pairs a list of contents with the URLs.
contents = ["bene bene bene", "si bene bene", "no ciao ciao", "come come come ciao"]

u = Url2Vec(urls)
u.train(sequences=seq, contents=contents, d_dim_red=2)

In [None]:
# Train the full-dataset model and keep the labels returned by train().
# `model` is the Url2Vec instance built from codeurl_map; the name `url2vec`
# used previously is never bound in this notebook.
# The dataset's own `sequences` are passed (not the toy `seq` of full URLs)
# so that they belong to the same dataset as codecontent_map.
labels = model.train(sequences=sequences, contents=codecontent_map)
print(len(labels))

In [None]:
# Tabulate the result of model.test() with ground-truth classes as rows and
# predicted labels as columns. `model` replaces the unbound name `url2vec`.
# NOTE(review): set iteration order is arbitrary — confirm that it matches
# the row/column order of the table returned by test().
df = pd.DataFrame(model.test(ground_truth), index=set(ground_truth), columns=set(labels))
df

In [None]:
# Load a second copy of the dataset into the *_lc variables.
# NOTE(review): despite the "listcostraint" name, this path is the same
# NoConstraint directory loaded earlier — confirm the intended dataset.
listcostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_NoConstraint.words1000.depth10/"
vertex_path_lc      = listcostraint_path + "vertex.txt"
codecontent_map_lc  = get_content_map(vertex_path_lc)

map_path_lc         = listcostraint_path + "urlsMap.txt"
codeurl_map_lc      = get_urlmap(map_path_lc)

# sequences_lc is the path to the sequence file; seq_lc is its parsed content.
sequences_lc        = listcostraint_path + "sequenceIDs.txt"
seq_lc              = get_sequences(sequences_lc)


# Ground truth is derived from the map loaded in this cell (codeurl_map_lc),
# not from the map of the earlier loading cell.
gt = GroundTruth()
ground_truth = [int(gt.get_groundtruth(codeurl_map_lc[code])) for code in codeurl_map_lc]

**class url2vec.model.urlembed.Url2Vec(urls)**

The model must be initialized with either a dictionary that associates a unique code with every URL, or a list of URLs. The former is preferable because it reduces training time on large datasets.


---
    train(self, algorithm=HDBSCAN(min_cluster_size=7), use_w2v=True, use_tfidf=True, 
          e_sg=0, e_min_count=1, e_window=10, e_negative=5, e_size=48, sequences=None, 
          d_max_df=0.9, d_max_features=200000, d_min_df=0.05, d_dim_red=50, d_tfidf=True, 
          contents=None)
        
        algorithm: 
            - default = HDBSCAN(min_cluster_size=7)
            - the chosen clustering algorithm. The fit_predict method of the passed object (here HDBSCAN)
              is invoked to return the predicted labels.

        use_w2v: 
            - default = True
            - with True word2vec is used to cluster the URLs according to their position 
              in the sequences passed (sequences argument)
            
        use_tfidf :
            - default = True
            - with True, documents (contents of the URLs) are transformed into a
              document-term matrix

        e_sg:
            - default = 0
            - word2vec argument, defines the training algorithm for word2vec,
              if 0 CBOW is used. With sg=1 skipgram is used

        e_min_count:
            - default = 1
            - word2vec argument, ignore all words with total frequency lower than this

        e_window: 
            - default = 10
            - word2vec argument, is the maximum distance between the current and 
              predicted word within a sentence.

        e_negative: 
            - default = 5
            - word2vec argument, if > 0, negative sampling will be used, the int for 
              negative specifies how many “noise words” should be drawn. If set to 0,
              no negative sampling is used

        e_size: 
            - default = 48
            - word2vec argument, is the dimensionality of the feature vectors

        sequences: 
            - default = None
            - word2vec argument, iterable of sentences. Each sentence is a list of words
              that will be used for training.

        d_max_df: 
            - default = 0.9
            - document argument, this is the maximum frequency within the documents a given
              feature can have to be used in the tf-idf matrix.

        d_max_features: 
            - default = 200000

        d_min_df: 
            - default = 0.05
            - document argument, a term must appear in at least 5% of the documents 
              to be considered.

        d_dim_red: 
            - default = 50
            - document argument, is the dimensionality of the feature vectors

        d_tfidf: 
            - default = True
            - document argument, with True term frequency-inverse document frequency is applied to the 
              document-term matrix obtained (from the contents argument)

        contents: 
            - default = None
            - document argument. Must be a dict with the URLs as keys and relevant
              page content as values

---
    test(ground_truth, pred_membership=None)
        ground_truth:
        
        pred_membership:
            - default = None

---
    homogeneity_score(ground_truth=None, pred_membership=None)
        ground_truth:
            - default = None
        
        pred_membership:
            - default = None

---
    completeness_score(ground_truth=None, pred_membership=None)
        ground_truth:
            - default = None
        
        pred_membership:
            - default = None
---
    v_measure_score(ground_truth=None, pred_membership=None)
        ground_truth:
            - default = None
        
        pred_membership:
            - default = None
---
    adjusted_rand_score(ground_truth=None, pred_membership=None)
        ground_truth:
            - default = None
        
        pred_membership:
            - default = None
---
    adjusted_mutual_info_score(ground_truth=None, pred_membership=None)
        ground_truth:
            - default = None
        
        pred_membership:
            - default = None
---
    silhouette_score(pred_membership=None)
        pred_membership:
            - default = None



In [None]:
# K-Means run on the full dataset.
# The model is created here because `u2v` is not defined anywhere else in
# the notebook.
u2v = Url2Vec(codeurl_map)

# get_sequences() takes the file path (as in the loading cell), not the
# already-parsed `sequences`.
seq = get_sequences(sequences_path)
codecontent_map = get_content_map(vertex_path)

# Keyword names follow the train() signature documented above
# (sequences=, contents=).
k_memb = u2v.train(algorithm=KMeans(n_clusters=15), sequences=seq, contents=codecontent_map)

In [None]:
# Second run with train()'s default clustering algorithm (documented above
# as HDBSCAN). The model is created here because `u2v2` is not defined
# anywhere else in the notebook.
u2v2 = Url2Vec(codeurl_map)

# get_sequences() takes the file path, not the already-parsed `sequences`;
# keyword names follow the documented train() signature.
seq2 = get_sequences(sequences_path)
hd_memb = u2v2.train(sequences=seq2, contents=codecontent_map)

In [None]:
# Recompute the integer ground-truth labels, one per key of codeurl_map.
gt = GroundTruth()
ground_truth = [
    int(gt.get_groundtruth(codeurl_map[code]))
    for code in codeurl_map
]

# Compare the K-Means model's result with the ground truth and show it as a
# table (rows: ground-truth classes, columns: the model's labels).
confusion_table = u2v.test(ground_truth)
pd.DataFrame(
    data=confusion_table,
    index=set(ground_truth),
    columns=set(u2v.labels_),
)

In [None]:
# Same comparison for the second model (u2v2), displayed as a table with
# ground-truth classes as rows and the model's labels as columns.
confusion_table_hdbscan = u2v2.test(ground_truth)
pd.DataFrame(
    data=confusion_table_hdbscan,
    index=set(ground_truth),
    columns=set(u2v2.labels_),
)

In [None]:
# Predicted labels for the two runs.
# k_memb is converted to a real list of ints: on Python 3 `map` returns a
# one-shot iterator, but labels_pred_k is consumed by several metric calls.
labels_pred_k = [int(label) for label in k_memb]
labels_pred_h = hd_memb

In [None]:
# sklearn's metrics module is imported here because the notebook-level
# import is commented out.
from sklearn import metrics

# External clustering scores for the K-Means labels against the ground truth.
# %-formatting inside print(...) gives the same text on Python 2 and 3.
print("Homogeneity:\t\t%s" % metrics.homogeneity_score(ground_truth, labels_pred_k))
print("Completeness:\t\t%s" % metrics.completeness_score(ground_truth, labels_pred_k))
print("V Measure Score:\t%s" % metrics.v_measure_score(ground_truth, labels_pred_k))
print("adjusted rand score:\t%s" % metrics.adjusted_rand_score(ground_truth, labels_pred_k))

In [None]:
# sklearn's metrics module is imported here because the notebook-level
# import is commented out.
from sklearn import metrics

# External clustering scores for the labels in labels_pred_h against the
# ground truth.
# %-formatting inside print(...) gives the same text on Python 2 and 3.
print("Homogeneity:\t\t%s" % metrics.homogeneity_score(ground_truth, labels_pred_h))  # best 1
print("Completeness:\t\t%s" % metrics.completeness_score(ground_truth, labels_pred_h))  # best 1
print("V Measure Score:\t%s" % metrics.v_measure_score(ground_truth, labels_pred_h))  # best 1
print("adjusted rand score:\t%s" % metrics.adjusted_rand_score(ground_truth, labels_pred_h))  # best 1