In [98]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))

from util.sequence_manager import *
from util.sequence_plotter import *
from util.sequence_handler import *
from util.clustering_metrics import *

from sklearn.feature_extraction.text import TfidfVectorizer

# from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans
from hdbscan import HDBSCAN

from gensim.models import Word2Vec

from itertools import tee
import oset

In [9]:
# Quick manual check of the ground-truth lookup for a single URL.
# NOTE(review): RealMembership is presumably provided by one of the `util` star-imports,
# and no file path is passed here, so this relies on whatever default the class defines - confirm.
rm = RealMembership()
# Bare trailing expression: the notebook displays the membership returned for this URL.
rm.get_membership("https://cs.illinois.edu/news")



In [99]:
class Url2Vec:
    """Cluster web pages from navigation sequences and/or page content.

    Every page is identified by a code. ``train`` builds one feature vector per
    code (a Word2Vec embedding of the code inside the given sequences and/or an
    SVD-reduced TF-IDF vector of the page content), feeds the vectors to a
    clustering algorithm and stores the predicted label of every page in
    ``labels_``. ``test`` compares a membership with a ground truth through a
    confusion table.

    All per-page structures follow the iteration order of ``codeurl_map``.
    """

    # Constructor
    def __init__ (self, codeurl_map):
        """Store the code -> URL association.

        codeurl_map -- either a mapping {code: url} (any dict subclass) or a
                       list-like of URLs, in which case the codes become the
                       positions of the URLs rendered as strings ("0", "1", ...).
        """
        self.labels_ = None
        if isinstance(codeurl_map, dict):
            self.codeurl_map = codeurl_map
        else:
            # assume it is list-like, or fail on iteration
            self.codeurl_map = {str(position): url for position, url in enumerate(codeurl_map)}


    # assigns a stable matrix index to every distinct label
    def __index_labels(self, labels):
        """Return {label: index} for the distinct values in ``labels``.

        Non-negative labels come first in ascending order and negative labels
        (such as a -1 "noise" label) come last. For contiguous labels 0..k-1,
        optionally with -1, this is the layout that indexing a matrix directly
        with the label produces, so tables keep their historical shape.
        """
        distinct = set(labels)
        ordered = sorted(l for l in distinct if l >= 0) + sorted(l for l in distinct if l < 0)
        return {label: index for index, label in enumerate(ordered)}


    # matching matrix
    def __get_confusion_table(self, ground_truth, clusters_found_labels):
        """Count, for every (real cluster, found cluster) pair, the pages they share.

        ground_truth          -- integer label of the real cluster of each page
        clusters_found_labels -- integer label assigned to each page by the algorithm
        Returns a float matrix of shape (#real clusters, #found clusters).
        Raises AssertionError if the two memberships differ in length or hold
        non-integer labels.
        """
        integer_types = (int, np.integer)
        assert len(ground_truth) == len(clusters_found_labels), "Memberships have different lengths"
        assert all(isinstance(label, integer_types) for label in ground_truth), "Type is not int"
        assert all(isinstance(label, integer_types) for label in clusters_found_labels), "Type is not int"

        realLabel_index_map = self.__index_labels(ground_truth)
        # Predicted labels get their own index map: using the raw label as a column
        # index breaks as soon as the labels are not exactly 0..k-1.
        foundLabel_index_map = self.__index_labels(clusters_found_labels)
        # kept from the original implementation: lets the reader decode the rows of the table
        print(realLabel_index_map)

        # matrix(num_of real_clusters x clusters_found)
        conf_table = np.zeros((len(realLabel_index_map), len(foundLabel_index_map)))
        for real_label, found_label in zip(ground_truth, clusters_found_labels):
            conf_table[realLabel_index_map[real_label], foundLabel_index_map[found_label]] += 1
        return conf_table


    # trains word2vec with the given parameters and returns vectors for each page
    def __word_embedding(self, sequences_list, vecs_length=48):
        """Train Word2Vec on the sequences and return one embedding per code.

        sequences_list -- iterable of sequences of codes
        vecs_length    -- dimensionality of the embeddings
        Returns an array with one row per code, in ``codeurl_map`` order.
        """
        assert hasattr(sequences_list, '__iter__'), "Bad sequences argument"

        w2v_model = Word2Vec(min_count=1, negative=5, size=vecs_length)
        # `tee` is what the notebook imports (`from itertools import tee`); it yields two
        # independent iterators so that a one-shot generator can feed both passes.
        # NOTE(review): if the installed gensim iterates the corpus more than once while
        # training, a one-shot iterator only serves the first pass - confirm.
        build_seq, train_seq = tee(sequences_list)
        w2v_model.build_vocab(build_seq)
        w2v_model.train(train_seq)
        return np.array([w2v_model[code] for code in self.codeurl_map])


    # returns tfidf vector for each page
    def __tfidf(self, codecontent_map, vecs_length=50, tfidf=True):
        """Vectorize the page contents and reduce them with truncated SVD.

        codecontent_map -- mapping {code: page text}; its keys must be exactly
                           the codes of ``codeurl_map``
        vecs_length     -- number of SVD components kept per page
        tfidf           -- forwarded to the vectorizer as ``use_idf``
        Also stores ``codecontent_map``, ``pages_content``, ``codes`` and
        ``longurls`` on the instance. Returns one row per code.
        """
        assert set(codecontent_map.keys()) == set(self.codeurl_map.keys()), "NEIN"

        self.codecontent_map = codecontent_map
        self.pages_content = [self.codecontent_map[code] for code in self.codeurl_map]
        self.codes = [code for code in self.codeurl_map]
        self.longurls = [self.codeurl_map[code] for code in self.codeurl_map]

        tfidf_vectorizer = TfidfVectorizer(
            max_df = 0.9,
            max_features = 200000,
            min_df = 0.05,
            stop_words = 'english',
            use_idf = tfidf,
            tokenizer = tokenize_and_stem,
            ngram_range = (1,3)
        )
        tfidf_matrix = tfidf_vectorizer.fit_transform(self.pages_content)
        svd = TruncatedSVD(n_components=vecs_length, algorithm="arpack", random_state=1)
        return svd.fit_transform(tfidf_matrix)


    # calls the chosen algorithm with the data built from the input arguments
    def train(self, algorithm=None, use_w2v=True, use_tfidf=True,
              w2v_size=48, tfidf_size=50, sequences_list=None, codecontent_map=None):
        """Build the feature vectors, cluster them and return the labels.

        algorithm       -- object exposing ``fit_predict(data)``; when omitted a
                           fresh ``HDBSCAN(min_cluster_size=7)`` is created for this call
        use_w2v         -- include the Word2Vec embedding (needs ``sequences_list``)
        use_tfidf       -- include the TF-IDF vector (needs ``codecontent_map``)
        w2v_size        -- length of the Word2Vec part of each vector
        tfidf_size      -- length of the TF-IDF part of each vector
        Returns the list of integer labels, also stored in ``labels_``.
        Raises AssertionError when no feature source is enabled or an enabled
        source is missing its input.
        """
        # Every enabled source must come with its data; the previous single and/or
        # expression let a call through when only one of two enabled inputs was given.
        assert use_w2v or use_tfidf, "Bad arguments!"
        assert not use_w2v or sequences_list is not None, "Bad arguments!"
        assert not use_tfidf or codecontent_map is not None, "Bad arguments!"

        if algorithm is None:
            # Created per call: an estimator used as default argument would be built once,
            # when the class is defined, and then shared (and refitted) by every call.
            algorithm = HDBSCAN(min_cluster_size=7)

        # zero-width placeholder so that a disabled source contributes no columns
        empty_array = np.empty((len(self.codeurl_map), 0))
        w2v_vecs    = self.__word_embedding(sequences_list, vecs_length=w2v_size) if use_w2v else empty_array
        tfidf_vecs  = self.__tfidf(codecontent_map, vecs_length=tfidf_size) if use_tfidf else empty_array

        # one row per page: [ word2vec part | tfidf part ]
        data = np.hstack((w2v_vecs, tfidf_vecs))
        self.labels_ = [int(label) for label in algorithm.fit_predict(data)]
        return self.labels_


    # needs the real membership (ground truth) and the membership returned by the algorithm (pred_membership)
    # ...(already given if train was successful)
    # returns the confusion matrix
    def test(self, ground_truth, pred_membership=None):
        """Return the confusion table between ``ground_truth`` and a membership.

        pred_membership defaults to the labels stored by the last ``train``.
        Raises AssertionError when neither is available.
        """
        assert (pred_membership is not None or self.labels_ is not None), "No train, No test !"
        pred_membership = self.labels_ if pred_membership is None else pred_membership

        return self.__get_confusion_table(ground_truth, pred_membership)
        

In [100]:
# All inputs live under <working dir>/../dataset, so the notebook no longer depends on
# one user's absolute home directory for the ground-truth file.
dataset_dir      = os.path.join(os.getcwd(), "..", "dataset")
# the trailing "" keeps the final path separator, so `nocostraint_path + name` still works
nocostraint_path = os.path.join(dataset_dir, "new", "cs.illinois.edu.ListConstraint.words1000.depth10", "")
rmfile           = os.path.join(dataset_dir, "manual-membership", "urlToMembership.txt")

vertex_path      = os.path.join(nocostraint_path, "vertex.txt")
codecontent_map  = get_content_map(vertex_path)

map_path         = os.path.join(nocostraint_path, "urlsMap.txt")
codeurl_map      = get_urlmap(map_path)

sequences        = os.path.join(nocostraint_path, "sequenceIDs.txt")
seq              = get_seq(sequences)

# Ground truth listed in the iteration order of codeurl_map, the same order
# Url2Vec uses when it builds one feature row per code.
rm = RealMembership(rmfile)
ground_truth = [int(rm.get_membership(codeurl_map[code])) for code in codeurl_map]

u2vNEW = Url2Vec(codeurl_map)

In [102]:
# Cluster with the default algorithm on both feature sources, then report how many pages were labelled.
labels = u2vNEW.train(
    codecontent_map=codecontent_map,
    sequences_list=seq,
)
print(len(labels))

1022


In [122]:
# Scratch check: tuples support positional indexing (the last of three items is shown).
tup = 1, 2, 3
tup[-1]

3

In [103]:
# Confusion table of the last training run against the ground truth, rendered as a frame.
pd.DataFrame(data=u2vNEW.test(ground_truth=ground_truth))

{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 6: 5, 8: 6, 10: 7, 12: 8, 13: 9, 14: 10, 15: 11, -1: 12}


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0,0,0,0,0,0,0,13,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,2
2,0,0,0,0,0,0,0,0,0,49,0,0,0,19,16,85
3,0,0,0,0,28,0,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6
6,0,0,0,0,0,0,0,0,0,0,128,36,0,0,0,127
7,0,0,0,0,0,0,0,0,11,0,0,0,200,0,0,105
8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10
9,0,9,27,11,0,0,0,0,0,0,0,0,0,0,0,10


In [118]:
# Scratch check of the list -> {position-as-string: item} conversion, followed by a lookup
# in the real code -> URL map.
listi = ["0", 7, "9", "3"]
d = {str(position): item for position, item in enumerate(listi)}
codeurl_map["3"]

'http://cs.illinois.edu'

In [2]:
class Url2Vecccz:
    """File-based variant of the page clusterer.

    The code -> URL map, the sequences, the page contents and the ground truth
    are all read from files through the notebook's helper functions.
    ``train`` stores the predicted label of every page in ``labels_`` and
    ``test`` compares a membership with the ground truth read from a file.

    All per-page structures follow the iteration order of ``codeurl_map``.
    """
    # Real Membership default file path
    # doesn't work on ipython because of __file__
    # __rmfile = os.path.abspath(os.path.dirname(__file__)) + "../dataset/manual-membership/urlToMembership.txt"
    # __rmfile  = "/home/chris/workspace/jupyter-notebook/url2vec/dataset/manual-membership/urlToMembership.txt"

    # Constructor
    def __init__ (self, map_file):
        """Load the code -> URL map from ``map_file`` with ``get_urlmap``."""
        self.labels_ = None
        self.codeurl_map = get_urlmap(map_file)


    # assigns a stable matrix index to every distinct label
    def __index_labels(self, labels):
        """Return {label: index} for the distinct values in ``labels``.

        Non-negative labels come first in ascending order and negative labels
        (such as a -1 "noise" label) come last. For contiguous labels 0..k-1,
        optionally with -1, this is the layout that indexing a matrix directly
        with the label produces, so tables keep their historical shape.
        """
        distinct = set(labels)
        ordered = sorted(l for l in distinct if l >= 0) + sorted(l for l in distinct if l < 0)
        return {label: index for index, label in enumerate(ordered)}


    # matching matrix
    def __get_confusion_table(self, real_membership_list, clusters_found_labels):
        """Count, for every (real cluster, found cluster) pair, the pages they share.

        real_membership_list  -- integer label of the real cluster of each page
        clusters_found_labels -- integer label assigned to each page by the algorithm
        Returns a float matrix of shape (#real clusters, #found clusters).
        Raises AssertionError if the two memberships differ in length or hold
        non-integer labels.
        """
        integer_types = (int, np.integer)
        assert len(real_membership_list) == len(clusters_found_labels), "Memberships have different lengths"
        assert all(isinstance(label, integer_types) for label in real_membership_list), "Type is not int"
        assert all(isinstance(label, integer_types) for label in clusters_found_labels), "Type is not int"

        realLabel_index_map = self.__index_labels(real_membership_list)
        # Predicted labels get their own index map: using the raw label as a column
        # index breaks as soon as the labels are not exactly 0..k-1.
        foundLabel_index_map = self.__index_labels(clusters_found_labels)
        # kept from the original implementation: lets the reader decode the rows of the table
        print(realLabel_index_map)

        # matrix(num_of real_clusters x clusters_found)
        conf_table = np.zeros((len(realLabel_index_map), len(foundLabel_index_map)))
        for real_label, found_label in zip(real_membership_list, clusters_found_labels):
            conf_table[realLabel_index_map[real_label], foundLabel_index_map[found_label]] += 1
        return conf_table


    # trains word2vec with the given parameters and returns vectors for each page
    def __word_embedding(self, sequences_file, vecs_length=48):
        """Train Word2Vec on the sequences file and return one embedding per code.

        The file is read twice through ``get_seq`` so that vocabulary building
        and training each get their own pass over the sequences.
        """
        vocab_sequences = get_seq(sequences_file, 1)
        train_sequences = get_seq(sequences_file, 1)

        w2v_model = Word2Vec(min_count=1, negative=5, size=vecs_length)
        w2v_model.build_vocab(vocab_sequences)
        w2v_model.train(train_sequences)
        return np.array([w2v_model[code] for code in self.codeurl_map])


    # returns tfidf vector for each page
    def __tfidf(self, vertex_file, vecs_length=50, tfidf=True):
        """Vectorize the page contents read from ``vertex_file`` and reduce them with SVD.

        vecs_length -- number of SVD components kept per page
        tfidf       -- forwarded to the vectorizer as ``use_idf``
        Also stores ``codecontent_map``, ``pages_content``, ``codes`` and
        ``longurls`` on the instance. Returns one row per code.
        """
        self.codecontent_map = get_content_map(vertex_file)
        self.pages_content = [self.codecontent_map[code] for code in self.codeurl_map]
        self.codes = [code for code in self.codeurl_map]
        self.longurls = [self.codeurl_map[code] for code in self.codeurl_map]

        tfidf_vectorizer = TfidfVectorizer(
            max_df = 0.9,
            max_features = 200000,
            min_df = 0.05,
            stop_words = 'english',
            use_idf = tfidf,
            tokenizer = tokenize_and_stem,
            ngram_range = (1,3)
        )
        tfidf_matrix = tfidf_vectorizer.fit_transform(self.pages_content)
        svd = TruncatedSVD(n_components=vecs_length, algorithm="arpack", random_state=1)
        return svd.fit_transform(tfidf_matrix)


    # calls the chosen algorithm with the data built from the input arguments
    def train(self, algorithm=None, use_w2v=True, use_tfidf=True,
              w2v_size=48, tfidf_size=50, sequences_file="", vertex_file=""):
        """Build the feature vectors, cluster them and return the labels.

        algorithm      -- object exposing ``fit_predict(data)``; when omitted a
                          fresh ``HDBSCAN(min_cluster_size=10)`` is created for this call
        use_w2v        -- include the Word2Vec embedding (reads ``sequences_file``)
        use_tfidf      -- include the TF-IDF vector (reads ``vertex_file``)
        w2v_size       -- length of the Word2Vec part of each vector
        tfidf_size     -- length of the TF-IDF part of each vector
        Returns the list of integer labels, also stored in ``labels_``.
        """
        if algorithm is None:
            # Created per call: an estimator used as default argument would be built once,
            # when the class is defined, and then shared (and refitted) by every call.
            algorithm = HDBSCAN(min_cluster_size=10)

        # zero-width placeholder so that a disabled source contributes no columns
        empty_array = np.empty((len(self.codeurl_map), 0))
        w2v_vecs    = self.__word_embedding(sequences_file, vecs_length=w2v_size) if use_w2v else empty_array
        tfidf_vecs  = self.__tfidf(vertex_file, vecs_length=tfidf_size) if use_tfidf else empty_array

        # one row per page: [ word2vec part | tfidf part ]
        data = np.hstack((w2v_vecs, tfidf_vecs))
        self.labels_ = [int(label) for label in algorithm.fit_predict(data)]
        return self.labels_


    # needs the real membership file and the membership returned by the algorithm
    # (already given if train was successful)
    # returns the confusion matrix
    def test(self, realmembership_file, alg_membership=None):
        """Return the confusion table between the file's ground truth and a membership.

        alg_membership defaults to the labels stored by the last ``train``.
        Raises AssertionError when neither is available.
        """
        assert (alg_membership is not None or self.labels_ is not None), "No train, No test !"
        alg_membership = self.labels_ if alg_membership is None else alg_membership
        rm = RealMembership(fpath=realmembership_file)
        real_membership = [int(rm.get_membership(self.codeurl_map[code])) for code in self.codeurl_map]

        return self.__get_confusion_table(real_membership, alg_membership)
        

In [45]:
# All inputs live under <working dir>/../dataset, so the notebook no longer depends on
# one user's absolute home directory for the ground-truth file.
dataset_dir      = os.path.join(os.getcwd(), "..", "dataset")
# the trailing "" keeps the final path separator, so `nocostraint_path + name` still works
nocostraint_path = os.path.join(dataset_dir, "new", "cs.illinois.edu.ListConstraint.words1000.depth10", "")
rmfile           = os.path.join(dataset_dir, "manual-membership", "urlToMembership.txt")
vertex_path      = os.path.join(nocostraint_path, "vertex.txt")
map_path         = os.path.join(nocostraint_path, "urlsMap.txt")
sequences        = os.path.join(nocostraint_path, "sequenceIDs.txt")

# Url2Vecccz is the class whose constructor takes the map *file* and whose train()/test()
# accept the file arguments used by the cells below; Url2Vec expects a code -> URL mapping
# and would treat this path string as a list of characters.
u2v  = Url2Vecccz(map_path)
u2v2 = Url2Vecccz(map_path)

In [4]:
# Cluster the pages with K-Means (15 clusters) on the default Word2Vec + TF-IDF features.
k_memb = u2v.train(
    algorithm=KMeans(n_clusters=15),
    vertex_file=vertex_path,
    sequences_file=sequences,
)

NameError: name 'u2v' is not defined

In [None]:
# Same features, clustered with the default algorithm of train().
hd_memb = u2v2.train(
    vertex_file=vertex_path,
    sequences_file=sequences,
)

In [None]:
# Confusion table of the K-Means run (u2v) against the ground-truth file, shown as a frame.
confusion_table = u2v.test(rmfile)
pd.DataFrame(data=confusion_table)

In [None]:
# Confusion table of the default-algorithm run (u2v2) against the ground-truth file, shown as a frame.
confusion_table_hdbscan = u2v2.test(rmfile)
pd.DataFrame(data=confusion_table_hdbscan)

In [3]:
from sklearn import metrics

# Predicted memberships of the two runs; the K-Means labels are coerced to plain ints.
labels_pred_k = list(map(int, k_memb))
labels_pred_h = hd_memb

# Ground truth, listed in the iteration order of the code -> URL map.
codeurl_map = get_urlmap(map_path)
rm = RealMembership(rmfile)
labels_true = [
    int(rm.get_membership(codeurl_map[code]))
    for code in codeurl_map
]

NameError: name 'k_memb' is not defined

In [None]:
# External clustering scores of the K-Means membership against the ground truth.
# A single pre-formatted string prints the same text as the `print a, b` form.
print("Homogeneity: %s" % metrics.homogeneity_score(labels_true, labels_pred_k))
print("Completeness: %s" % metrics.completeness_score(labels_true, labels_pred_k))
print("V Measure Score: %s" % metrics.v_measure_score(labels_true, labels_pred_k))
print("adjusted rand score: %s" % metrics.adjusted_rand_score(labels_true, labels_pred_k))

In [None]:
# External clustering scores of the second membership (labels_pred_h) against the ground truth.
# A single pre-formatted string prints the same text as the `print a, b` form.
print("Homogeneity: %s" % metrics.homogeneity_score(labels_true, labels_pred_h)) # best 1
print("Completeness: %s" % metrics.completeness_score(labels_true, labels_pred_h))  # best 1
print("V Measure Score: %s" % metrics.v_measure_score(labels_true, labels_pred_h)) # best 1
print("adjusted rand score %s" % metrics.adjusted_rand_score(labels_true, labels_pred_h)) # best 1