# url2vec

In [1]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))
from itertools import tee

import plotly.plotly as py
from hdbscan import HDBSCAN
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from url2vec.util.plotter import *
from url2vec.model.urlembed import *
from url2vec.util.seqmanager import *

from __future__ import print_function

### Shared settings

In [2]:
nc_path = os.getcwd() + "/../dataset/cs.ox.ac.uk/no_constraint/words10000_depth5/"
vertex_path_nc = nc_path + "vertex.txt"
map_path_nc    = nc_path + "urlsMap.txt"
walks_path_nc  = nc_path + "sequenceIDs.txt"
urlsmap_nc     = get_urlmap(map_path_nc)
# for fit_predict
documents_nc   = get_content_map(vertex_path_nc)
walksembedding_nc, walkscombined_nc = tee(get_sequences(walks_path_nc))

lc_path = os.getcwd() + "/../dataset/cs.ox.ac.uk/list_constraint/words10000_depth5/"
vertex_path_lc = lc_path + "vertex.txt"
map_path_lc    = lc_path + "urlsMap.txt"
walks_path_lc  = lc_path + "sequenceIDs.txt"
urlsmap_lc     = get_urlmap(map_path_lc)
# for fit_predict
documents_lc   = get_content_map(vertex_path_lc)
walksembedding_lc, walkscombined_lc = tee(get_sequences(walks_path_lc))

ground_truth_path = os.getcwd() + "/../dataset/cs.ox.ac.uk/ground_truth/urlToMembership.txt"
gt = GroundTruth(ground_truth_path)
n_clusters = len(gt.get_labelset())

algorithm = KMeans(n_clusters=n_clusters)

In [3]:
embeddingmodel_nc = Url2Vec(
    use_embedding=True, # default True
    sg=0, # default 0
    min_count=1, # default 1
    window=10, # default 10
    negative=5, # default 5
    size=100, # default 100
    normalize=True, # default True
    use_text=False, # default True
    max_df=0.9, # default 0.9
    max_features=200000, # default 200000
    min_df=0.05, # default 0.05
    dim_red=100, # default 100
    tfidf=True, # default True
    svd=True # default True
)
embeddinglabel_nc = embeddingmodel_nc.fit_predict(algorithm=algorithm, walks=walksembedding_nc, documents=documents_nc)

embeddinggt_nc = [int(gt.get_groundtruth(urlsmap_nc[code], print_missing=False)) for code in embeddingmodel_nc.urls ]
embeddingconftable_nc = embeddingmodel_nc.test(embeddinggt_nc)
# pd.DataFrame(textconftable_nc, index=set(ground_truth), columns=set(predicted_labels))

homogeneity     = embeddingmodel_nc.homogeneity_score()
completeness    = embeddingmodel_nc.completeness_score()
v_measure       = embeddingmodel_nc.v_measure_score()
adj_rand        = embeddingmodel_nc.adjusted_rand_score()
adj_mutual_info = embeddingmodel_nc.adjusted_mutual_info_score()
silhouette      = embeddingmodel_nc.silhouette_score()

embeddingdf_nc = pd.DataFrame({
        "Homogeneity": homogeneity, 
        "Completeness": completeness, 
        "V-Measure": v_measure, 
        "Adj Rand index": adj_rand, 
        "Adj Mutual info": adj_mutual_info, 
        "Silhouette": silhouette
    },
    index=["Embedding NC"]
)
embeddingdf_nc

Unnamed: 0,Adj Mutual info,Adj Rand index,Completeness,Homogeneity,Silhouette,V-Measure
Embedding NC,0.282913,0.159389,0.40555,0.303267,0.418228,0.347029


In [4]:
combinedmodel_nc = Url2Vec(
    use_embedding=True, # default True
    sg=0, # default 0
    min_count=1, # default 1
    window=10, # default 10
    negative=5, # default 5
    size=100, # default 100
    normalize=True, # default True
    use_text=True, # default True
    max_df=0.9, # default 0.9
    max_features=200000, # default 200000
    min_df=0.05, # default 0.05
    dim_red=100, # default 100
    tfidf=True, # default True
    svd=True # default True
)
combinedlabel_nc = combinedmodel_nc.fit_predict(algorithm=algorithm, walks=walkscombined_nc, documents=documents_nc)

combinedgt_nc = [int(gt.get_groundtruth(urlsmap_nc[code], print_missing=False)) for code in combinedmodel_nc.urls ]
combinedconftable_nc = combinedmodel_nc.test(combinedgt_nc)
# pd.DataFrame(textconftable_nc, index=set(ground_truth), columns=set(predicted_labels))

homogeneity     = combinedmodel_nc.homogeneity_score()
completeness    = combinedmodel_nc.completeness_score()
v_measure       = combinedmodel_nc.v_measure_score()
adj_rand        = combinedmodel_nc.adjusted_rand_score()
adj_mutual_info = combinedmodel_nc.adjusted_mutual_info_score()
silhouette      = combinedmodel_nc.silhouette_score()

combineddf_nc = pd.DataFrame({
        "Homogeneity": homogeneity, 
        "Completeness": completeness, 
        "V-Measure": v_measure, 
        "Adj Rand index": adj_rand, 
        "Adj Mutual info": adj_mutual_info, 
        "Silhouette": silhouette
    },
    index=["Combined NC"]
)
combineddf_nc

Unnamed: 0,Adj Mutual info,Adj Rand index,Completeness,Homogeneity,Silhouette,V-Measure
Combined NC,0.542933,0.376625,0.558137,0.681087,0.189058,0.613513


In [7]:
textmodel = Url2Vec(
    use_embedding=False, # default True
    sg=0, # default 0
    min_count=1, # default 1
    window=10, # default 10
    negative=5, # default 5
    size=100, # default 100
    normalize=False, # default True
    use_text=True, # default True
    max_df=0.9, # default 0.9
    max_features=200000, # default 200000
    min_df=0.05, # default 0.05
    dim_red=100, # default 100
    tfidf=True, # default True
    svd=True # default True
)
textlabel = textmodel.fit_predict(algorithm=algorithm, documents=documents_nc)

textgt = [ int(gt.get_groundtruth(urlsmap_nc[code], print_missing=False)) for code in textmodel.urls ]
textconftable = textmodel.test(textgt)
# pd.DataFrame(textconftable_nc, index=set(ground_truth), columns=set(predicted_labels))

homogeneity     = textmodel.homogeneity_score()
completeness    = textmodel.completeness_score()
v_measure       = textmodel.v_measure_score()
adj_rand        = textmodel.adjusted_rand_score()
adj_mutual_info = textmodel.adjusted_mutual_info_score()
silhouette      = textmodel.silhouette_score()

textdf = pd.DataFrame({
        "Homogeneity": homogeneity, 
        "Completeness": completeness, 
        "V-Measure": v_measure, 
        "Adj Rand index": adj_rand, 
        "Adj Mutual info": adj_mutual_info, 
        "Silhouette": silhouette
    },
    index=["Text"]
)
textdf

Unnamed: 0,Adj Mutual info,Adj Rand index,Completeness,Homogeneity,Silhouette,V-Measure
Text,0.475319,0.267942,0.491912,0.655591,0.184691,0.562078


In [8]:
embeddingmodel_lc = Url2Vec(
    use_embedding=True, # default True
    sg=0, # default 0
    min_count=1, # default 1
    window=10, # default 10
    negative=5, # default 5
    size=100, # default 100
    normalize=True, # default True
    use_text=False, # default True
    max_df=0.9, # default 0.9
    max_features=200000, # default 200000
    min_df=0.05, # default 0.05
    dim_red=100, # default 100
    tfidf=True, # default True
    svd=True # default True
)
embeddinglabel_lc = embeddingmodel_lc.fit_predict(algorithm=algorithm, walks=walksembedding_lc, documents=documents_lc)

embeddinggt_lc = [int(gt.get_groundtruth(urlsmap_lc[code], print_missing=False)) for code in embeddingmodel_lc.urls ]
embeddingconftable_lc = embeddingmodel_lc.test(embeddinggt_lc)
# pd.DataFrame(textconftable_nc, index=set(ground_truth), columns=set(predicted_labels))

homogeneity     = embeddingmodel_lc.homogeneity_score()
completeness    = embeddingmodel_lc.completeness_score()
v_measure       = embeddingmodel_lc.v_measure_score()
adj_rand        = embeddingmodel_lc.adjusted_rand_score()
adj_mutual_info = embeddingmodel_lc.adjusted_mutual_info_score()
silhouette      = embeddingmodel_lc.silhouette_score()

embeddingdf_lc = pd.DataFrame({
        "Homogeneity": homogeneity, 
        "Completeness": completeness, 
        "V-Measure": v_measure, 
        "Adj Rand index": adj_rand, 
        "Adj Mutual info": adj_mutual_info, 
        "Silhouette": silhouette
    },
    index=["Embedding LC"]
)
embeddingdf_lc

Unnamed: 0,Adj Mutual info,Adj Rand index,Completeness,Homogeneity,Silhouette,V-Measure
Embedding LC,0.31323,0.262401,0.431785,0.333863,0.502251,0.376562


In [9]:
combinedmodel_lc = Url2Vec(
    use_embedding=True, # default True
    sg=0, # default 0
    min_count=1, # default 1
    window=10, # default 10
    negative=5, # default 5
    size=100, # default 100
    normalize=True, # default True
    use_text=True, # default True
    max_df=0.9, # default 0.9
    max_features=200000, # default 200000
    min_df=0.05, # default 0.05
    dim_red=100, # default 100
    tfidf=True, # default True
    svd=True # default True
)
combinedlabel_lc = combinedmodel_lc.fit_predict(algorithm=algorithm, walks=walkscombined_lc, documents=documents_lc)

combinedgt_lc = [ int(gt.get_groundtruth(urlsmap_lc[code], print_missing=False)) for code in combinedmodel_lc.urls ]
combinedconftable_lc = combinedmodel_lc.test(combinedgt_lc)
# pd.DataFrame(textconftable_nc, index=set(ground_truth), columns=set(predicted_labels))

homogeneity     = combinedmodel_lc.homogeneity_score()
completeness    = combinedmodel_lc.completeness_score()
v_measure       = combinedmodel_lc.v_measure_score()
adj_rand        = combinedmodel_lc.adjusted_rand_score()
adj_mutual_info = combinedmodel_lc.adjusted_mutual_info_score()
silhouette      = combinedmodel_lc.silhouette_score()

combineddf_lc = pd.DataFrame({
        "Homogeneity": homogeneity, 
        "Completeness": completeness, 
        "V-Measure": v_measure, 
        "Adj Rand index": adj_rand, 
        "Adj Mutual info": adj_mutual_info, 
        "Silhouette": silhouette
    },
    index=["Combined LC"]
)
combineddf_lc

Unnamed: 0,Adj Mutual info,Adj Rand index,Completeness,Homogeneity,Silhouette,V-Measure
Combined LC,0.474875,0.340146,0.491058,0.691363,0.220455,0.574245


---

---

---

### Complete results

In [11]:
pd.concat([embeddingdf_nc, combineddf_nc, textdf, embeddingdf_lc, combineddf_lc])

Unnamed: 0,Adj Mutual info,Adj Rand index,Completeness,Homogeneity,Silhouette,V-Measure
Embedding NC,0.282913,0.159389,0.40555,0.303267,0.418228,0.347029
Combined NC,0.542933,0.376625,0.558137,0.681087,0.189058,0.613513
Text,0.475319,0.267942,0.491912,0.655591,0.184691,0.562078
Embedding LC,0.31323,0.262401,0.431785,0.333863,0.502251,0.376562
Combined LC,0.474875,0.340146,0.491058,0.691363,0.220455,0.574245
