In [1]:
import os
import csv
# import seaborn as sns
import numpy as np
# from random import randint
from gensim.models import Word2Vec
from url_sequences.sequence_manager import *
from url_sequences.sequence_handler import *
from url_sequences.sequence_plotter import *
from url_sequences.clustering_metrics import *

import sklearn
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import hdbscan

import plotly
import plotly.plotly as py
import plotly.graph_objs as go
# from plotly.graph_objs import *
from plotly.tools import FigureFactory as FF

# WORD2VEC MODEL

In [2]:
# Locate the random-walk dataset relative to the notebook's working directory.
# The trailing slash matters: file names are appended by plain concatenation.
path = "".join((os.getcwd(), "/dataset/depth-100k/seqLen-10/cs.illinois.eduRandomWalkLists.depth.100000.seqLen.10/"))

# The URL-map file and the sequence-id file both sit directly in that directory.
rwwl_map_path, rwwl_seq_path = (
    path + file_name for file_name in ("sequencesMapUrl.txt", "sequencesIDs.txt")
)

In [3]:
# Two independent sequence streams are created on purpose: per the original
# note, get_seq hands back a generator, which can only be consumed once, so
# the vocabulary pass and the training pass each need their own stream.
# NOTE(review): the meaning of the second argument (1) is not visible in this
# notebook - confirm against get_seq's definition.
vocab_sequences = get_seq(rwwl_seq_path, 1)
train_sequences = get_seq(rwwl_seq_path, 1)

In [4]:
# Word2Vec over the URL-id sequences. min_count=1 keeps every token no matter
# how rare, window=5 is the context size, negative=5 turns on negative
# sampling with 5 noise words.
w2v_model = Word2Vec(min_count=1, window=5, negative=5)
# First pass over the data: collect the vocabulary.
w2v_model.build_vocab(vocab_sequences)
# Second pass, on a fresh stream: train the model. This is the cell's last
# expression, so its return value is what the notebook shows as the output.
w2v_model.train(train_sequences)

890323

# TSNE 2-DIM

In [5]:
# Mapping loaded from the URL-map file. Its keys are looked up in the trained
# model below and its values are used as the long-URL labels of the points.
seq_map = get_seq_map(rwwl_map_path)

# 100-dim vecs, one row per key, in seq_map iteration order
wordvecs = [w2v_model[key] for key in seq_map]
hundred_dim_wordvecs = np.array(wordvecs, dtype="float64")

# long-url labels, collected in the same iteration order so that
# word_labels[i] describes row i of hundred_dim_wordvecs
word_labels = [seq_map[key] for key in seq_map]

# 2-dim vecs
# t-SNE is stochastic: without a fixed random_state every run produces a
# different layout, so plots could not be compared across runs.
two_dimensioner = TSNE(n_components=2, random_state=0)
two_dim_wordvecs = two_dimensioner.fit_transform(hundred_dim_wordvecs)

# DBSCAN CLUSTERING

In [6]:
dbscan_clusterer = DBSCAN(eps=0.9, min_samples=4)
dbscan_clusterer.fit(hundred_dim_wordvecs)

dbscan_colors = [get_color(n_clust) for n_clust in dbscan_clusterer.labels_]

print "Clusters found with DBSCAN:", len(set(dbscan_clusterer.labels_))
print [label for label in set(dbscan_clusterer.labels_)]

Clusters found with DBSCAN: 16
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1]


# DBSCAN PLOT

In [7]:
# Scatter data built from the 2-D t-SNE coordinates, the long-URL labels and
# the per-sample DBSCAN colours.
dbscan_data = scatter_plot(two_dim_wordvecs, word_labels, dbscan_colors)
# py.iplot sends the figure to the plotly account under this file name (see
# the message printed below the cell) and displays it in the notebook.
py.iplot(dbscan_data, filename='Word Vectors - Scatter plot DBSCAN')

High five! You successfuly sent some data to your account on plotly. View your plot in your browser at https://plot.ly/~chrispolo/0 or inside your plot.ly account where it is named 'Word Vectors - Scatter plot DBSCAN'


# HDBSCAN CLUSTERING

In [8]:
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=6)
hdbscan_labels = hdbscan_clusterer.fit_predict(hundred_dim_wordvecs)

hdbscan_colors = [get_color(n_clust) for n_clust in hdbscan_labels]

print "Clusters found with HDBSCAN:", len(set(hdbscan_labels))
print [label for label in set(hdbscan_labels)]

Clusters found with HDBSCAN: 16
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1]


# HDBSCAN PLOT

In [None]:
# Same 2-D coordinates and long-URL labels as the other plots; only the
# per-sample colours change (HDBSCAN cluster membership).
hdbscan_data = scatter_plot(two_dim_wordvecs, word_labels, hdbscan_colors)
# Sends the figure to plotly under this file name and displays it inline.
py.iplot(hdbscan_data, filename='Word Vectors - Scatter plot HDBSCAN')

# K-MEANS CLUSTERING

In [9]:
kmeans_clusterer = KMeans(n_clusters=15)
kmeans_clusters = kmeans_clusterer.fit(hundred_dim_wordvecs)

kmeans_colors = [get_color(n_clust) for n_clust in kmeans_clusters.labels_]

print "Clusters found with K-MEANS:", len(set(kmeans_clusters.labels_))
print [label for label in set(kmeans_clusters.labels_)]

Clusters found with K-MEANS: 15
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


# K-MEANS PLOT

In [None]:
# Same 2-D coordinates and long-URL labels as the other plots; only the
# per-sample colours change (K-Means cluster membership).
kmeans_data = scatter_plot(two_dim_wordvecs, word_labels, kmeans_colors)
# Sends the figure to plotly under this file name and displays it inline.
py.iplot(kmeans_data, filename='Word Vectors - Scatter plot K-MEANS')

# MANUAL CLUSTERING

In [10]:
# Hand-labelled variant of the URL map, stored next to the other dataset files.
clusterized_map_path = path + "sequencesMapUrl-manually-clusterized.txt"
# Each entry is indexed below as [0] long URL, [1] URL code, [2] cluster id;
# fields are stripped of surrounding whitespace before use.
# NOTE(review): the exact tuple layout comes from get_sequence_tuple_list - confirm there.
seq_tuple_list = get_sequence_tuple_list(clusterized_map_path)

# dict{url_code: cluster_membership} - manually assigned cluster id (as int) per URL code
real_cluster_membership = {tup[1].strip(): int(tup[2].strip()) for tup in seq_tuple_list}

# dict{longurl: cluster_membership} - same ids keyed by long URL - never used in the cells shown here
real_cluster_longurl_membership = {tup[0].strip(): int(tup[2].strip()) for tup in seq_tuple_list}

# One colour per sample, in seq_map iteration order so it lines up with the word-vector rows.
real_cluster_colors = [get_color(real_cluster_membership[key]) for key in seq_map]

# NOTE(review): the count below includes the label -1; if -1 means "unassigned"
# in the manual labelling (as it does for DBSCAN), it is one too high - confirm.
print "Clusters found manually:", len(set(real_cluster_membership.values()))
print [label for label in set(real_cluster_membership.values())]

Clusters found manually: 17
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1]


# MANUAL CLUSTERING - PLOT

In [None]:
# Same 2-D coordinates and long-URL labels as the other plots; only the
# per-sample colours change (manually assigned cluster membership).
real_cluster_data = scatter_plot(two_dim_wordvecs, word_labels, real_cluster_colors)
# Sends the figure to plotly under this file name and displays it inline.
py.iplot(real_cluster_data, filename='Word Vectors - Scatter plot MANUALLY')

In [11]:
# using seq_map to keep the same order, dunno if it's right
real_membership_list = [real_cluster_membership[key] for key in seq_map]
    
real_membership_list = np.array(real_membership_list, dtype="int32")

print "precision: ", sklearn.metrics.precision_score(real_membership_list, kmeans_clusters.labels_)
print "recall:    ", sklearn.metrics.recall_score(real_membership_list, kmeans_clusters.labels_)

precision:  0.00962826550891
recall:     0.0286343612335



The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.


The default `weighted` averaging is deprecated, and from version 0.18, use of precision, recall or F-score with multiclass or multilabel data or pos_label=None will result in an exception. Please set an explicit value for `average`, one of (None, 'micro', 'macro', 'weighted', 'samples'). In cross validation use, for instance, scoring="f1_weighted" instead of scoring="f1".



In [12]:
def get_confusion_table(real_membership_list, clusters_found_labels):
    """Cross-tabulate reference cluster labels against found cluster labels.

    Entry [r][c] of the result is the number of samples whose reference label
    is the r-th row label and whose found label is the c-th column label.

    Labels are mapped to row/column positions instead of being used directly
    as array indices, so they do not have to be the contiguous range 0..k-1.
    On each axis the distinct labels are ordered with the non-negative ones
    first (ascending) followed by the negative ones (ascending). For the usual
    "0..k-1 plus -1 for noise" labelling this puts -1 in the last row/column,
    which is where plain negative indexing used to put it.

    Parameters:
        real_membership_list: sequence (list or 1-D array) of reference
            labels, one per sample.
        clusters_found_labels: sequence (list or 1-D array) of labels found
            by a clustering algorithm, aligned with real_membership_list.

    Returns:
        int32 numpy array of shape
        (number of distinct reference labels, number of distinct found labels).

    Raises:
        ValueError: if the two sequences do not have the same length.
    """
    # tolist() turns numpy scalars into plain Python numbers (hashable, sortable).
    real_labels = np.asarray(real_membership_list).tolist()
    found_labels = np.asarray(clusters_found_labels).tolist()
    if len(real_labels) != len(found_labels):
        raise ValueError(
            "label sequences differ in length: %d vs %d"
            % (len(real_labels), len(found_labels))
        )

    def label_positions(labels):
        # Non-negative labels first, then negative ones, each group ascending.
        ordered = sorted(set(labels), key=lambda label: (label < 0, label))
        return dict((label, position) for position, label in enumerate(ordered))

    row_of = label_positions(real_labels)
    col_of = label_positions(found_labels)

    # matrix(num_of real_clusters x clusters_found)
    conf_table = np.zeros((len(row_of), len(col_of)), dtype="int32")

    # Single pass over the samples instead of one full scan per reference cluster.
    for real_label, found_label in zip(real_labels, found_labels):
        conf_table[row_of[real_label], col_of[found_label]] += 1
    return conf_table

# K-Means label of every sample, read from the fitted estimator.
C = kmeans_clusterer.labels_

# Rows: manually assigned clusters; columns: clusters found by K-Means.
confusion_table = get_confusion_table(real_membership_list, C)

# Show the two label sets first so the table's dimensions can be checked against them.
print set(real_membership_list), set(C)
print confusion_table

set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, -1]) set([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
[[ 13   0   3   0   0   0   0   0   0   0   0   0   0   0   0]
 [  4   0   6   0   0   0   0   0   0   0   0   0   0   0   0]
 [  1   0  12   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 22   0   0   0  75   0   0   0   0   0   0  25   1   0   0]
 [  1   0   0   0   0   0   0  19   0   0   0   0   0   0   9]
 [  1   0   0   0   0   0   0   0   0   0   0   0  10   0   0]
 [  0   0   0   0   0   0   0   0   0  64   0   0   0   0   0]
 [  0   6   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 17   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [ 56   0   1   0   2   0  51   0   0   1   0  32   0   1   0]
 [  0  47   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  5   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [224   0   0   0   0   0   5   0   0   0  27   4   0  12   0]
 [  8   0   0  52   0   0   0   0   0   0   0   0   0   0   0