In [173]:
import os
import csv
import seaborn as sns
import numpy as np
# from random import randint
from gensim.models import Word2Vec
from url_sequences.sequence_manager import *
from url_sequences.sequence_plotter import *
from url_sequences.clustering_metrics import *

import sklearn
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import hdbscan

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# from plotly.graph_objs import *
from plotly.tools import FigureFactory as FF

In [36]:
def get_colors(n):
    # 1.arancione, 2.bianco, 3.giallo, 4.azzurro, 5.verde, 6.blu, 7.fucsia, 8.viola
    colors = ["#FF8F00", "#FFFFFF", "#FFFF00", "#00E5FF", "#76FF03", "#2979FF", "#F50057", "#9C27B0"]
    c = ""
    if n < 0:
        c = "#009688"
    elif n < len(colors):
        c = colors[n]
    else:
        c = "#" + format(n**5, '06X')
    return c

# "#" + format(n + randint(0, 16777215 - n), '06X')
# print get_colors(8)

# WORD2VEC MODEL

In [3]:
path = os.getcwd() + "/dataset/cs.illinois.edu/RandomWalkLists-depth.100000.seqLen.10/"
rwwl_map_path = path + "sequencesMapUrl.txt"
rwwl_seq_path = path + "sequencesIDs.txt"

In [5]:
# because of generator
vocab_sequences = get_seq(rwwl_seq_path, 1)
train_sequences = get_seq(rwwl_seq_path, 1)

In [6]:
w2v_model = Word2Vec(min_count=1)
w2v_model.build_vocab(vocab_sequences)
w2v_model.train(train_sequences)

890323

# TSNE 2-DIM

In [7]:
seq_map = get_seq_map(rwwl_map_path)

# 100-dim vecs
wordvecs = [w2v_model[key] for key in seq_map]
hundred_dim_wordvecs = np.array(wordvecs, dtype="float64")

# long-url labels
word_labels = [seq_map[key] for key in seq_map]

# 2-dim vecs
two_dimensioner = TSNE(n_components=2)

# 2-dim vecs
two_dim_wordvecs = two_dimensioner.fit_transform(hundred_dim_wordvecs)

# DBSCAN CLUSTERING

In [41]:
dbscan_clusterer = DBSCAN(eps=2.9, min_samples=10)
dbscan_clusterer.fit(hundred_dim_wordvecs)

dbscan_colors = [get_colors(n_clust) for n_clust in dbscan.labels_]

print "Clusters found with DBSCAN:", len(set(dbscan.labels_))
print [label for label in set(dbscan.labels_)]

Clusters found with DBSCAN: 11
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1]


# DBSCAN PLOT

In [47]:
dbscan_data = sc_plot(two_dim_wordvecs, word_labels, dbscan_colors)
py.iplot(dbscan_data, filename='Word Vectors - Scatter plot')

# HDBSCAN CLUSTERING

In [54]:
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=13)
hdbscan_labels = hdbscan_clusterer.fit_predict(hundred_dim_wordvecs)

hdbscan_colors = [get_colors(n_clust) for n_clust in hdbscan_labels]

print "Clusters found with HDBSCAN:", len(set(hdbscan_labels))
print [label for label in set(hdbscan_labels)]

Clusters found with HDBSCAN: 7
[0, 1, 2, 3, 4, 5, -1]


# HDBSCAN PLOT

In [55]:
hdbscan_data = sc_plot(two_dim_wordvecs, word_labels, hdbscan_colors)
py.iplot(hdbscan_data, filename='Word Vectors - Scatter plot HDBSCAN')

# K-MEANS CLUSTERING

In [57]:
kmeans_clusterer = KMeans(n_clusters=8)
kmeans_clusters = kmeans_clusterer.fit(hundred_dim_wordvecs)

kmeans_colors = [get_colors(n_clust) for n_clust in kmeans_clusters.labels_]

print "Clusters found with K-MEANS:", len(set(kmeans_clusters.labels_))
print [label for label in set(kmeans_clusters.labels_)]

Clusters found with K-MEANS: 8
[0, 1, 2, 3, 4, 5, 6, 7]


# K-MEANS PLOT

In [59]:
kmeans_data = sc_plot(two_dim_wordvecs, word_labels, kmeans_colors)
py.iplot(kmeans_data, filename='Word Vectors - Scatter plot K-MEANS')

# MANUALLY CLUSTERING

In [120]:
clusterized_map_path = path + "sequencesMapUrl-manually-clusterized.txt"
seq_tuple_list = get_sequence_tuple_list(clusterized_map_path)

# dict{url_code: cluster_membership} - manually clusterized
real_cluster_membership = {tup[1].strip(): int(tup[2].strip()) for tup in seq_tuple_list}

# dict{longurl: cluster_membership} - manually clusterized - never used
real_cluster_longurl_membership = {tup[0].strip(): int(tup[2].strip()) for tup in seq_tuple_list}

real_cluster_colors = [get_colors(real_cluster_membership[key]) for key in seq_map]

print "Clusters found manually:", len(set(real_cluster_membership.values()))
print [label for label in set(real_cluster_membership.values())]

Clusters found manually: 8
[0, 1, 2, 3, 4, 5, 6, 7]


# MANUALLY CLUSTERING - PLOT

In [104]:
real_cluster_data = sc_plot(two_dim_wordvecs, word_labels, real_cluster_colors)
py.iplot(real_cluster_data, filename='Word Vectors - Scatter plot MANUALLY')

In [171]:
# using seq_map to keep the same order, dunno if it's right
real_membership_list = [real_cluster_membership[key] for key in seq_map]
    
real_membership_list = np.array(real_clust, dtype="int32")

print "precision: ", sklearn.metrics.precision_score(real_membership_list, kmeans_clusters.labels_)
print "recall:    ", sklearn.metrics.recall_score(real_membership_list, kmeans_clusters.labels_)

precision:  0.290719370491
recall:     0.068281938326



The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".


The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".



In [168]:
def get_confusion_table(real_membership_list, clusters_found_labels):
    # matrix(num_of real_clusters x clusters_found)
    conf_table = np.zeros((len(set(real_membership_list)), len(set(clusters_found_labels))), dtype="int8")
    real_clusters_set = set(real_membership_list)
    # errore del -1
    for current_clust in real_clusters_set:
        for i in range(len(clusters_found_labels)):
            if real_membership_list[i] == current_clust:
                cluster_found = clusters_found_labels[i]
                conf_table[current_clust][cluster_found] = conf_table[current_clust][cluster_found] + 1
    return conf_table

confusion_table = get_confusion_table(real_membership_list, hdbscan_labels)

print set(real_membership_list), set(hdbscan_labels)
print confusion_table

set([0, 1, 2, 3, 4, 5, 6, 7]) set([0, 1, 2, 3, 4, 5, -1])
[[  0   0   0  18   0   0  21]
 [  0   0  23  30   0   0  73]
 [  0  16   0   1   0   0  26]
 [  0   0   0  80  64  45  90]
 [ 15   0   0 -25   0   0  26]
 [  0   0   0  64   0   0  46]
 [  0   0   0   2   0   0  11]
 [  0   0   0   5   0   0  21]]


In [190]:
ch = [c for c in range(len(set(hdbscan_labels)))]

chh = ["real clust"] + ch
col_headings = np.array(chh)
print col_headings

table = FF.create_table(confusion_table, index=True)
py.iplot(table, filename='index_table')

['real clust' '0' '1' '2' '3' '4' '5' '6']
