In [71]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))

import plotly.plotly as py
from plotly.graph_objs import *

from urlembed.util.plotter import *
from urlembed.util.seqmanager import *

from sklearn import metrics
from hdbscan import HDBSCAN
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

from __future__ import print_function
from nltk.stem.snowball import SnowballStemmer

In [2]:
nocostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_NoConstraint.words1000.depth10/"
vertex_nc_path   = nocostraint_path + "vertex.txt"
map_nc_path      = nocostraint_path + "urlsMap.txt"

codecontent_map_nc = get_content_map(vertex_nc_path)
urlmap_nc = get_urlmap(map_nc_path)

# document list
documents_nc = [codecontent_map_nc[key] for key in codecontent_map_nc]
codes_nc     = [key for key in codecontent_map_nc]
urls_nc      = [urlmap_nc[key] for key in codecontent_map_nc]

## Terms Documents Matrix

![Alt text](http://www.codeproject.com/KB/WPF/NNMFSearchResultClusterin/table.jpg "Very nice")

In [3]:
""" 
max_df: 
    this is the maximum frequency within the documents a given feature
    can have to be used in the tfi-idf matrix.
min_idf:
    this could be an integer (e.g. 0.2) and the term would have to be in
    at least 20% of the documents to be considered.
ngram_range:
    (e.g. 1,3) this just means I'll look at unigrams, bigrams and trigrams. 
"""
tfidf_vectorizer = TfidfVectorizer(
    max_df = 0.8,
    max_features = 200000,
    min_df = 0.1,
    stop_words = 'english',
    use_idf = True,
    tokenizer = tokenize_and_stem,
    ngram_range = (1,3)
)

### DTM

In [4]:
tfidf_matrix_nc = tfidf_vectorizer.fit_transform(documents_nc)
# can be used to generate a measure of similarity between each document and the other documents in the corpus
dist_nc = 1 - cosine_similarity(tfidf_matrix_nc)

print(pd.DataFrame({"documents":tfidf_matrix_nc.shape[0], "terms":tfidf_matrix_nc.shape[1]}, index=[""]).T)
print("\n\n")

              
documents  728
terms      433





## KMEANS Clustering - No Costraint

In [5]:
kmeans = KMeans(n_clusters=15)
kmeans_labels_nc = kmeans.fit_predict(tfidf_matrix_nc)

In [6]:
docs_nc = { 
    'code': codes_nc,
    'document': documents_nc
}
frame_nc = pd.DataFrame(docs_nc, index=[kmeans_labels_nc] , columns=['document', 'code'])

frame_nc[:5]

Unnamed: 0,document,code
0,john deere scholarship in computer science dep...,344
0,jump trading scholars department of computer s...,345
0,rockwell collins scholarship department of com...,346
0,spot trading scholarship department of compute...,347
0,illinois cyber security scholars program icssp...,340


In [9]:
# map -> {code: token_list}
tokens_nc_map = to_tokens_map(codecontent_map_nc)
# map -> {code: stem_list}
stems_nc_map = to_stems_map(codecontent_map_nc)

# total vocabulary, list of tokens
totalvocab_nc_stemmed = [stem for key in codecontent_map_nc for stem in stems_nc_map[key]]
# total vocabulary, list of stems
totalvocab_nc_tokenized = [stem for key in codecontent_map_nc for stem in tokens_nc_map[key]]

vocab_nc_frame = pd.DataFrame({'words': totalvocab_nc_tokenized}, index = totalvocab_nc_stemmed)
terms_nc = tfidf_vectorizer.get_feature_names()

# sort cluster centers by proximity to centroid
order_centroids = kmeans.cluster_centers_.argsort()[:,::-1]

for i in range(len(set(kmeans_labels_nc))):
    print("Cluster %d - top words:" % i, end=' -> ')
    
    for ind in order_centroids[i,: 6]: # replace 6 with n words per cluster
        print(' %s' % vocab_nc_frame.ix[terms_nc[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()

Cluster 0 - top words: ->  undergraduate, undergraduate, scholarship, programs, students, advising,
Cluster 1 - top words: ->  office, office, contact, contact, directory, staff,
Cluster 2 - top words: ->  security, information, s, work, using, professor,
Cluster 3 - top words: ->  networks, real, time, s, work, mobile,
Cluster 4 - top words: ->  applications, graduate, programs, fellowship, ms, deadlines,
Cluster 5 - top words: ->  awards, distinguished, chair, member, s, siebel,
Cluster 6 - top words: ->  curriculum, research, news, news, offered, class,
Cluster 7 - top words: ->  data, information, s, projects, using, said,
Cluster 8 - top words: ->  s, moone, media, said, professor, work,
Cluster 9 - top words: ->  papers, best, best, papers, work, awards,
Cluster 10 - top words: ->  siebel, siebel, center, office, resources, history,
Cluster 11 - top words: ->  win, awards, scholars, home, directory, directory,
Cluster 12 - top words: ->  thesis, ph, ph, requirement, d, ms,
Cluste

# K-Means Plot - TSNE No Costraint

In [10]:
tsne = TSNE(n_components=2, random_state=1)

tfidf_matrix_dense_nc = tfidf_matrix_nc.todense()
docs_vecs_nc = np.array([tfidf_matrix_dense_nc[i].A1 for i in range(len(tfidf_matrix_dense_nc))])
clusters_colors_nc = [ get_color(i) for i in kmeans_labels_nc]

twodim_docs_nc = tsne.fit_transform(dist_nc)

k_tsne_data = scatter_plot(twodim_docs_nc, word_labels=urls_nc, colors=clusters_colors_nc)
py.iplot(k_tsne_data, filename="K-Means t-SNE nocostraint - Doc Clustering")

## K-Means Clustering - List Costraint

In [11]:
listcostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_ListConstraint.words1000.depth10/"
vertex_lc_path     = listcostraint_path + "vertex.txt"
map_lc_path        = listcostraint_path + "urlsMap.txt"

codecontent_map_lc = get_content_map(vertex_lc_path)
urlmap_lc = get_urlmap(map_lc_path)

# document list
documents_lc = [codecontent_map_lc[key] for key in codecontent_map_lc]
codes_lc     = [key for key in codecontent_map_lc]
urls_lc      = [urlmap_lc[key] for key in codecontent_map_lc]

In [12]:
tfidf_matrix_lc = tfidf_vectorizer.fit_transform(documents_lc) 

# can be used to generate a measure of similarity between each document and the other documents in the corpus
dist_lc = 1 - cosine_similarity(tfidf_matrix_lc)

print(pd.DataFrame({"documents":tfidf_matrix_lc.shape[0], "terms":tfidf_matrix_lc.shape[1]}, index=[""]).T)
print("\n\n")

               
documents  1022
terms       370





In [13]:
kmeans = KMeans(n_clusters=15)
kmeans_labels_lc = kmeans.fit_predict(tfidf_matrix_lc)

In [14]:
docs_lc = {
    'code': codes_lc,
    'document': documents_lc
}

frame_lc = pd.DataFrame(docs_lc, index = [kmeans_labels_lc] , columns = ['document', 'code'])
frame_lc[:5]

Unnamed: 0,document,code
3,engineering at illinois my cs illinois educomp...,344
3,engineering at illinois my cs illinois educomp...,345
0,engineering at illinois my cs illinois educomp...,346
13,engineering at illinois my cs illinois educomp...,347
7,engineering at illinois my cs illinois educomp...,340


In [15]:
# map -> {code: token_list}
tokens_lc_map = to_tokens_map(codecontent_map_lc)
# map -> {code: stem_list}
stems_lc_map = to_stems_map(codecontent_map_lc)

# total vocabulary, list of tokens
totalvocab_lc_stemmed = [stem for key in codecontent_map_lc for stem in stems_lc_map[key]]
# total vocabulary, list of stems
totalvocab_lc_tokenized = [stem for key in codecontent_map_lc for stem in tokens_lc_map[key]]

vocab_lc_frame = pd.DataFrame({'words': totalvocab_lc_tokenized}, index = totalvocab_lc_stemmed)
terms_lc = tfidf_vectorizer.get_feature_names()

# sort cluster centers by proximity to centroid
order_centroids_lc = kmeans.cluster_centers_.argsort()[:, ::-1]

for i in range(len(set(kmeans_labels_lc))):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids_lc[i,:6]: # replace 6 with n words per cluster
        print(' %s' % vocab_lc_frame.ix[terms_lc[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()

Cluster 0 words: primary, primary, primary, research, professor, offices,
Cluster 1 words: security, s, networking, information, programming, data,
Cluster 2 words: undergraduate, undergraduate, programming, illinois, edu, students,
Cluster 3 words: honors, publications, honors, contacts, contacts, offices,
Cluster 4 words: description, topic, section, courses, curriculum, instructor,
Cluster 5 words: ph, ph, required, ms, graduate, form,
Cluster 6 words: b, c, sci, r, comp, comp,
Cluster 7 words: edu, edu, illinois, center, comp, center,
Cluster 8 words: said, s, data, used, working, development,
Cluster 9 words: applications, programming, graduate, ms, fellowship, degree,
Cluster 10 words: media, moone, s, working, awarded, networking,
Cluster 11 words: parallel, performance, parallel, programming, modeling, high,
Cluster 12 words: siebel, siebel, center, resources, edu, illinois,
Cluster 13 words: store, courses, store, curriculum, cs, courses,
Cluster 14 words: awarded, members, s,

# KMEANS Plot

In [16]:
tfidf_matrix_lc_dense = tfidf_matrix_lc.todense()
docs_vecs_lc = np.array([tfidf_matrix_lc_dense[i].A1 for i in range(len(tfidf_matrix_lc_dense))])

clusters_colors_lc = [ get_color(i) for i in kmeans_labels_lc]

twodim_docs_lc = tsne.fit_transform(dist_lc)

k_tsne_data_lc = scatter_plot(twodim_docs_lc, word_labels=urls_lc, colors=clusters_colors_lc)
py.iplot(k_tsne_data_lc, filename="K-Means listcostraint - Doc Clustering")

---

# EVALUATION

### GROUND TRUTH

In [47]:
gt = GroundTruth(os.getcwd() + "/../dataset/ground_truth/urlToMembership.txt")
ground_truth_lc = [int(gt.get_groundtruth(urlmap_lc[key])) for key in codecontent_map_lc]

gt = GroundTruth(os.getcwd() + "/../dataset/ground_truth/urlToMembership.txt")
ground_truth_nc = [int(gt.get_groundtruth(urlmap_nc[key])) for key in codecontent_map_nc]

print("Clusters found manually:", len(set(ground_truth_nc)))
print([label for label in set(ground_truth_nc)])
print()
print("Clusters found manually:", len(set(ground_truth_lc)))
print([label for label in set(ground_truth_lc)])
print("\n\n")

Clusters found manually: 14
[0, 1, 2, 3, 4, 6, 8, 10, 11, 12, 13, 14, 15, -1]

Clusters found manually: 13
[0, 1, 2, 3, 4, 6, 8, 10, 12, 13, 14, 15, -1]





## DBSCAN and HDBSCAN nocostraint

In [48]:
dbscan = DBSCAN(eps=0.9, min_samples=4)
dbscan_labels_nc = dbscan.fit_predict(tfidf_matrix_nc)

print("Clusters found with DBSCAN:", len(set(dbscan_labels_nc)))
print ([label for label in set(dbscan_labels_nc)])
print("\n")

hdbscan = HDBSCAN(min_cluster_size=4)
hdbscan_labels_nc = hdbscan.fit_predict(tfidf_matrix_nc)

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_nc)))
print([label for label in set(hdbscan_labels_nc)])
print("\n\n")

Clusters found with DBSCAN: 16
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1]


Clusters found with HDBSCAN: 15
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1]





## DBSCAN and HDBSCAN listcostraint

In [49]:
dbscan = DBSCAN(eps=0.7, min_samples=4)
dbscan_labels_lc = dbscan.fit_predict(tfidf_matrix_lc)

print("Clusters found with DBSCAN:", len(set(dbscan_labels_lc)))
print ([label for label in set(dbscan_labels_lc)])
print("\n")

hdbscan = HDBSCAN(min_cluster_size=7)
hdbscan_labels_lc = hdbscan.fit_predict(tfidf_matrix_lc)

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_lc)))
print([label for label in set(hdbscan_labels_lc)])
print("\n\n")

Clusters found with DBSCAN: 14
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1]


Clusters found with HDBSCAN: 13
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1]





## Metrics

In [72]:
metrics_df = pd.DataFrame([
        [
            # dbscan nocostraint
            metrics.homogeneity_score(ground_truth_nc, dbscan_labels_nc),
            metrics.completeness_score(ground_truth_nc, dbscan_labels_nc),
            metrics.v_measure_score(ground_truth_nc, dbscan_labels_nc),
            metrics.adjusted_rand_score(ground_truth_nc, dbscan_labels_nc),
            metrics.adjusted_mutual_info_score(ground_truth_nc, dbscan_labels_nc)
        ],
        [
            # hdbscan nocostraint
            metrics.homogeneity_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.completeness_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.v_measure_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.adjusted_rand_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.adjusted_mutual_info_score(ground_truth_nc, hdbscan_labels_nc)
        ],
        [
            # kmeans nocostraint
            metrics.homogeneity_score(ground_truth_nc, kmeans_labels_nc),
            metrics.completeness_score(ground_truth_nc, kmeans_labels_nc),
            metrics.v_measure_score(ground_truth_nc, kmeans_labels_nc),
            metrics.adjusted_rand_score(ground_truth_nc, kmeans_labels_nc),
            metrics.adjusted_mutual_info_score(ground_truth_nc, kmeans_labels_nc)
        ],
        [
            # dbscan listcostraint
            metrics.homogeneity_score(ground_truth_lc, dbscan_labels_lc),
            metrics.completeness_score(ground_truth_lc, dbscan_labels_lc),
            metrics.v_measure_score(ground_truth_lc, dbscan_labels_lc),
            metrics.adjusted_rand_score(ground_truth_lc, dbscan_labels_lc),
            metrics.adjusted_mutual_info_score(ground_truth_lc, dbscan_labels_lc)
        ],
        [
            # hdbscan listcostraint
            metrics.homogeneity_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.completeness_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.v_measure_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.adjusted_rand_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.adjusted_mutual_info_score(ground_truth_lc, hdbscan_labels_lc)
        ],
        [
            # kmeans listcostraint
            metrics.homogeneity_score(ground_truth_lc, kmeans_labels_lc),
            metrics.completeness_score(ground_truth_lc, kmeans_labels_lc),
            metrics.v_measure_score(ground_truth_lc, kmeans_labels_lc),
            metrics.adjusted_rand_score(ground_truth_lc, kmeans_labels_lc),
            metrics.adjusted_mutual_info_score(ground_truth_lc, kmeans_labels_lc)
        ]],
        index=["NoCostraint - DBSCAN", "NoCostraint - HDBSCAN", "NoCostraint - K-MEANS", "ListCostraint - DBSCAN", "ListCostraint - HDBSCAN", "ListCostraint - K-MEANS"],
        columns=["Homogeneity", "Completeness", "V-Measure core", "Adjusted Rand index", "Mutual Information"])

metrics_df

Unnamed: 0,Homogeneity,Completeness,V-Measure core,Adjusted Rand index,Mutual Information
NoCostraint - DBSCAN,0.560068,0.596247,0.577592,0.407829,0.534608
NoCostraint - HDBSCAN,0.515236,0.602887,0.555626,0.386213,0.48578
NoCostraint - K-MEANS,0.754334,0.572492,0.650952,0.311199,0.548965
ListCostraint - DBSCAN,0.570969,0.704237,0.630639,0.51967,0.556604
ListCostraint - HDBSCAN,0.450059,0.510352,0.478313,0.193805,0.429703
ListCostraint - K-MEANS,0.844222,0.606859,0.706127,0.440011,0.593167


# w2v - tfidf

In [None]:
from url_sequences.sequence_handler import *
sequences_lc = listcostraint_path + "sequenceIDs.txt"

# because of generator
vocab_sequences = get_seq(sequences_lc, 1)
train_sequences = get_seq(sequences_lc, 1)

In [None]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(min_count=1, negative=5, size=48)
w2v_model.build_vocab(vocab_sequences)
w2v_model.train(train_sequences)

In [None]:
w2v_vecs_lc = np.array([w2v_model[key] for key in content_lc_map]) 
docs_vecs_lc = docs_vecs_lc

In [None]:
type(docs_vecs_lc)

In [None]:
tsne50 =  TSNE(n_components=50)
docs_vecs_lc_reduced = tsne50.fit_transform(docs_vecs_lc)
# vecs = [ np.concatenate((w2v_vecs_lc[i], docs_vecs_lc_reduced[i]), axis=0) for i in range(3)]
