In [1]:
from __future__ import print_function

import os
#import csv
import sys

import numpy as np

sys.path.append(os.path.abspath(".."))

from gensim.models import Word2Vec
from urlembed.util.seqmanager import *
from urlembed.util.plotter import *
from urlembed.util.metrics import *

# import sklearn
import sklearn.metrics
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
import hdbscan

# import plotly
import plotly.plotly as py
# import plotly.graph_objs as go
from plotly.graph_objs import *
from plotly.tools import FigureFactory as FF

# WORD2VEC MODEL

In [2]:
# Dataset directory, resolved relative to the current working directory.
# The value keeps its trailing "/" so file names can be appended directly.
nocostraint_path = "".join((os.getcwd(), "/../dataset/cs.illinois.edu_NoConstraint.words1000.depth10/"))
# Inputs read later by get_urlmap() and get_sequences() respectively.
nocostraint_urlmap_path = "{}{}".format(nocostraint_path, "urlsMap.txt")
nocostraint_seq_path = "{}{}".format(nocostraint_path, "sequenceIDs.txt")

In [4]:
# Untrained model; hyper-parameters are exactly the original ones.
w2v_model = Word2Vec(negative=5, window=5, min_count=1)

# Pass 1 over the sequence file: vocabulary construction.
vocab_sequences = get_sequences(nocostraint_seq_path)
w2v_model.build_vocab(vocab_sequences)

# Pass 2: training. get_sequences() is called a second time because, per the
# original author's note, it yields a generator -- presumably exhausted by pass 1.
train_sequences = get_sequences(nocostraint_seq_path)
w2v_model.train(train_sequences)

994570

# TSNE 2-DIM

In [10]:
nocostraint_urlmap = get_urlmap(nocostraint_urlmap_path)

# 100-dim vecs: one embedding per url-map key, in url-map iteration order.
# `urls` below is built with the same iteration, so wordvecs_nc[i] and urls[i]
# refer to the same entry.
wordvecs_nc = np.array([w2v_model[key] for key in nocostraint_urlmap], dtype="float64")

# URL labels
urls = [nocostraint_urlmap[key] for key in nocostraint_urlmap]

# 2-dim vecs, used by the scatter-plot cells only (clustering runs on wordvecs_nc).
# t-SNE is stochastic: random_state is pinned so the projection -- and therefore
# every plot built on it -- is the same on each Restart & Run All.
tsne = TSNE(n_components=2, random_state=0)
twodim_wordvecs_nc = tsne.fit_transform(wordvecs_nc)

# DBSCAN CLUSTERING

In [18]:
# DBSCAN runs on the full-dimensional vectors, not on the t-SNE projection.
dbscan_clusterer = DBSCAN(min_samples=4, eps=0.9)
dbscan_clusterer.fit(wordvecs_nc)

# Distinct labels assigned by DBSCAN. The set (and hence the count printed below)
# includes -1 whenever it occurs; scikit-learn uses -1 for noise samples.
dbscan_label_set = set(dbscan_clusterer.labels_)

# One colour per sample, derived from its cluster label.
dbscan_colors_nc = list(map(get_color, dbscan_clusterer.labels_))

print("Clusters found with DBSCAN:", len(dbscan_label_set))
print(list(dbscan_label_set))
print()
print()

Clusters found with DBSCAN: 19
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, -1]




# DBSCAN PLOT

In [15]:
# Figure data for the DBSCAN result. Arguments are passed positionally:
# presumably (2-D positions, per-point labels, per-point colours) -- confirm
# against scatter_plot in urlembed.util.plotter.
dbscan_data = scatter_plot(
    twodim_wordvecs_nc,
    urls,
    dbscan_colors_nc
)
# Displayed through plotly.plotly (imported as `py`) under the given filename.
py.iplot(
    dbscan_data,
    filename='Word Vectors - Scatter plot DBSCAN'
)

<div>
    <a href="https://plot.ly/~chrispolo/0" 
        target="_blank" title="y" 
        style="display: block; text-align: center;">
            <img src="../dataset/img/wordvectors_scatter_plot_DBSCAN.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:0"  src="https://plot.ly/embed.js" async></script>
</div>

# HDBSCAN CLUSTERING

In [17]:
# HDBSCAN runs on the full-dimensional vectors; fit_predict fits the model and
# returns the label of every row in one call.
hdbscan_clusterer = hdbscan.HDBSCAN(min_cluster_size=6)
hdbscan_labels = hdbscan_clusterer.fit_predict(wordvecs_nc)

# Distinct labels; -1 is counted too when present (see the printed list).
hdbscan_label_set = set(hdbscan_labels)

# One colour per sample, derived from its cluster label.
hdbscan_colors_nc = list(map(get_color, hdbscan_labels))

print("Clusters found with HDBSCAN:", len(hdbscan_label_set))
print(list(hdbscan_label_set))
print()
print()

Clusters found with HDBSCAN: 13
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1]




# HDBSCAN PLOT

In [20]:
# Figure data for the HDBSCAN result. Arguments are passed positionally:
# presumably (2-D positions, per-point labels, per-point colours) -- confirm
# against scatter_plot in urlembed.util.plotter.
hdbscan_data = scatter_plot(
    twodim_wordvecs_nc,
    urls,
    hdbscan_colors_nc
)
# Displayed through plotly.plotly (imported as `py`) under the given filename.
py.iplot(
    hdbscan_data,
    filename='Word Vectors - Scatter plot HDBSCAN'
)

<div>
    <a href="https://plot.ly/~chrispolo/2" 
        target="_blank" title="y" 
        style="display: block; text-align: center;">
            <img src="../dataset/img/wordvectors_scatter_plot_HDBSCAN.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:2"  src="https://plot.ly/embed.js" async></script>
</div>

# K-MEANS CLUSTERING

In [23]:
# K-Means on the full-dimensional vectors with a hand-picked number of clusters.
# Centroid initialisation is random: random_state is pinned so the labels (and
# the colours derived from them) are identical on every Restart & Run All.
kmeans = KMeans(n_clusters=15, random_state=0)
kmeans.fit(wordvecs_nc)

# One colour per sample, derived from its cluster label.
kmeans_colors_nc = [get_color(clust) for clust in kmeans.labels_]

print("Clusters found with K-MEANS:", len(set(kmeans.labels_)))
print([label for label in set(kmeans.labels_)])

Clusters found with K-MEANS: 15
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


# K-MEANS PLOT

In [26]:
# Figure data for the K-Means result. Arguments are passed positionally:
# presumably (2-D positions, per-point labels, per-point colours) -- confirm
# against scatter_plot in urlembed.util.plotter.
kmeans_data = scatter_plot(
    twodim_wordvecs_nc,
    urls,
    kmeans_colors_nc
)
# Displayed through plotly.plotly (imported as `py`) under the given filename.
py.iplot(
    kmeans_data,
    filename='Word Vectors - Scatter plot K-MEANS'
)

<div>
    <a href="https://plot.ly/~chrispolo/4" 
        target="_blank" title="y" 
        style="display: block; text-align: center;">
            <img src="../dataset/img/wordvectors_scatter_plot_KMEANS.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:4"  src="https://plot.ly/embed.js" async></script>
</div>

# MANUAL CLUSTERING

In [31]:
# Manually assigned cluster membership, looked up per URL from the ground-truth file.
gt = GroundTruth(os.getcwd() + "/../dataset/ground_truth/urlToMembership.txt")

# Built by iterating nocostraint_urlmap, the same iteration used for wordvecs_nc
# and urls, so ground_truth[i] describes the same entry as wordvecs_nc[i] / urls[i].
ground_truth = [int(gt.get_groundtruth(nocostraint_urlmap[key])) for key in nocostraint_urlmap]

# One colour per sample, derived from its ground-truth label.
real_colors = [get_color(n) for n in ground_truth]

print("Clusters found manually:", len(set(ground_truth)))
print([label for label in set(ground_truth)])

Clusters found manually: 14
[0, 1, 2, 3, 4, 6, 8, 10, 11, 12, 13, 14, 15, -1]


# MANUAL CLUSTERING - PLOT

In [33]:
# Figure data for the ground-truth labelling. Arguments are passed positionally:
# presumably (2-D positions, per-point labels, per-point colours) -- confirm
# against scatter_plot in urlembed.util.plotter.
groundtruth_data = scatter_plot(
    twodim_wordvecs_nc,
    urls,
    real_colors
)
# Displayed through plotly.plotly (imported as `py`) under the given filename.
py.iplot(
    groundtruth_data,
    filename='Word Vectors - Scatter plot Ground Truth'
)

<div>
    <a href="https://plot.ly/~chrispolo/56" 
        target="_blank" title="y" 
        style="display: block; text-align: center;">
            <img src="../dataset/img/wordvectors_scatter_plot_GROUNDTRUTH.png" 
                alt="y" style="max-width: 100%;width: 1121px;"  
                width="100%" onerror="this.onerror=null;this.src='https://plot.ly/404';" />
    </a>
    <script data-plotly="chrispolo:56"  src="https://plot.ly/embed.js" async></script>
</div>

## TODO

In [None]:
# Ground-truth memberships as an int32 array. `ground_truth` was built by iterating
# nocostraint_urlmap -- the same iteration that produced wordvecs_nc -- so entry i
# lines up with kmeans.labels_[i].
real_membership_list = np.array(ground_truth, dtype="int32")

# K-Means cluster ids are arbitrary, so comparing them id-by-id with ground-truth
# ids (precision_score / recall_score) is not meaningful; in current scikit-learn
# those functions also raise for multiclass targets under the default
# average="binary". Homogeneity and completeness do not depend on how the
# clusters happen to be numbered.
print("homogeneity:  ", sklearn.metrics.homogeneity_score(real_membership_list, kmeans.labels_))
print("completeness: ", sklearn.metrics.completeness_score(real_membership_list, kmeans.labels_))

In [None]:
def get_confusion_table(real_membership_list, clusters_found_labels):
    """Count how the samples of each real cluster are spread over the found clusters.

    Labels are mapped to row/column positions instead of being used as raw
    indices, so non-contiguous labels (e.g. 0, 1, 4, 8) no longer index past the
    end of the table.

    Row/column order: non-negative labels in ascending order, followed by
    negative labels in ascending order (so a ``-1`` noise label is last). For
    contiguous labels ``0..k-1`` with an optional ``-1`` this is the layout the
    previous raw-index implementation produced.

    Args:
        real_membership_list: sequence with the real cluster label of each sample.
        clusters_found_labels: sequence with the label assigned to each sample by
            a clustering algorithm; must be aligned with ``real_membership_list``.

    Returns:
        ``numpy`` int32 array of shape
        ``(number of distinct real labels, number of distinct found labels)``
        where entry ``[r, c]`` is the number of samples whose real label maps to
        row ``r`` and whose found label maps to column ``c``.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    if len(real_membership_list) != len(clusters_found_labels):
        raise ValueError(
            "real_membership_list and clusters_found_labels must have the same length"
        )

    def _positions(labels):
        # Non-negative labels first (ascending), then negative ones (ascending).
        ordered = sorted(set(labels), key=lambda label: (label < 0, label))
        return {label: position for position, label in enumerate(ordered)}

    row_of = _positions(real_membership_list)
    col_of = _positions(clusters_found_labels)

    conf_table = np.zeros((len(row_of), len(col_of)), dtype="int32")

    # Single pass over the aligned label pairs.
    for real_label, found_label in zip(real_membership_list, clusters_found_labels):
        conf_table[row_of[real_label], col_of[found_label]] += 1
    return conf_table

# Cluster id assigned by K-Means to each sample (the fitted estimator is `kmeans`).
C = kmeans.labels_

# `ground_truth` holds the manual membership of every sample, in the same
# nocostraint_urlmap iteration order as the vectors K-Means was fitted on.
confusion_table = get_confusion_table(ground_truth, C)

print(set(ground_truth), set(C))
print(confusion_table)