# Document Clustering

In [2]:
import os
import numpy as np
import pandas as pd
import sys
sys.path.append(os.path.abspath(".."))

import plotly.plotly as py
from plotly.graph_objs import *
from urlembed.util.plotter import *
from urlembed.util.seqmanager import *

from sklearn import metrics
from hdbscan import HDBSCAN
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from __future__ import print_function
from nltk.stem.snowball import SnowballStemmer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

---

# TF-IDF matrix

Here is defined **term frequency - inverse document frequency** (tf-idf) vectorizer parameters and then convert the documents (web pages) list into a tf-idf matrix.

To get a Tf-idf matrix, first count word occurrences by document. This is transformed into a **document-term matrix** (dtm). This is also just called a term frequency matrix.
Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a document but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the document.

A couple things to note about the parameters defined below:

**max_df**: this is the maximum frequency within the documents a given feature can have to be used in the tfi-idf matrix. If the term is in greater than 80% of the documents it probably cares little meanining

**min_idf**: this could be an integer (e.g. 5) and the term would have to be in at least 5 of the documents to be considered. Here I pass 0.1; the term must be in at least 10% of the document.

**ngram_range**: this just means I'll look at unigrams, bigrams and trigrams.

#### Document-Term Matrix

![Alt text](http://www.codeproject.com/KB/WPF/NNMFSearchResultClusterin/table.jpg "Very nice")

In [3]:
tfidf_vectorizer = TfidfVectorizer(
    max_df = 0.8,
    max_features = 200000,
    min_df = 0.1,
    stop_words = 'english',
    use_idf = True,
    tokenizer = tokenize_and_stem,
    ngram_range = (1,3)
)

The crawling proccess has been done in two different ways:

- **No costraint**: the crawler follows a random outlink from all of the outlinks in a given page
- **List costraint**: the crawler follows a random outlink but only from the outlinks in "lists"

# No-costraint documents

In [4]:
nocostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_NoConstraint.words1000.depth10/"
vertex_nc_path   = nocostraint_path + "vertex.txt"
map_nc_path      = nocostraint_path + "urlsMap.txt"

codecontent_map_nc = get_content_map(vertex_nc_path)
urlmap_nc          = get_urlmap(map_nc_path)

documents_nc = [codecontent_map_nc[key] for key in codecontent_map_nc]
codes_nc     = [key for key in codecontent_map_nc]
urls_nc      = [urlmap_nc[key] for key in codecontent_map_nc]

#### Cosine-Similarity
Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each document and the other documents in the corpus.

Subtracting it from 1 provides cosine distance which I will use for plotting on a euclidean (2-dimensional) plane.

In [5]:
tfidf_matrix_nc = tfidf_vectorizer.fit_transform(documents_nc)
dist_nc = 1 - cosine_similarity(tfidf_matrix_nc)

print(pd.DataFrame({"documents":tfidf_matrix_nc.shape[0], "terms":tfidf_matrix_nc.shape[1]}, index=[""]).T)
print("\n\n")

              
documents  728
terms      433





## Clustering on No-costraint documents
#### K-Means

In [7]:
kmeans = KMeans(n_clusters=15)
kmeans_labels_nc = kmeans.fit_predict(tfidf_matrix_nc)

docs_nc = { 
    'code': codes_nc,
    'document': documents_nc
}
frame_nc = pd.DataFrame(docs_nc, index=[kmeans_labels_nc] , columns=['document', 'code'])

frame_nc[:5]

Unnamed: 0,document,code
5,john deere scholarship in computer science dep...,344
5,jump trading scholars department of computer s...,345
5,rockwell collins scholarship department of com...,346
5,spot trading scholarship department of compute...,347
5,illinois cyber security scholars program icssp...,340


#### Topic modeling
Some fancy indexing and sorting on each cluster to identify which are the top n words that are nearest to the cluster centroid. This gives an idea of the main topic of each the cluster.

In [42]:
# map -> {code: token_list}
tokens_nc_map = to_tokens_map(codecontent_map_nc)
# map -> {code: stem_list}
stems_nc_map = to_stems_map(codecontent_map_nc)

# total vocabulary, list of tokens
totalvocab_nc_stemmed = [stem for key in codecontent_map_nc for stem in stems_nc_map[key]]
# total vocabulary, list of stems
totalvocab_nc_tokenized = [stem for key in codecontent_map_nc for stem in tokens_nc_map[key]]

vocab_nc_frame = pd.DataFrame({'words': totalvocab_nc_tokenized}, index = totalvocab_nc_stemmed)
terms_nc = tfidf_vectorizer.get_feature_names()

# sort cluster centers by proximity to centroid
order_centroids_nc = kmeans.cluster_centers_.argsort()[:,::-1]

num_clusters_nc = len(set(kmeans_labels_nc))
words_matrix_nc = [None] * num_clusters_nc
top_n = 7

for i in range(num_clusters_nc):
    cluster_chart = [vocab_nc_frame.ix[terms_nc[ind].split(' ')].values.tolist()[0][0] for ind in order_centroids_nc[i,:top_n]]
    words_matrix_nc[i] = cluster_chart
    
pd.DataFrame(
    words_matrix_nc, 
    index = ["Cluster " + str(i) + " - Top Words" for i in range(num_clusters_nc)],
    columns = list(range(1, top_n+1))
)

Unnamed: 0,1,2,3,4,5,6,7
Cluster 0 - Top Words,ph,ph,requirement,ms,graduate,form,d
Cluster 1 - Top Words,primary,primary,primary,research,professor,research,public
Cluster 2 - Top Words,said,s,work,using,developed,team,moone
Cluster 3 - Top Words,applications,programs,graduate,ms,fellowship,degrees,campus
Cluster 4 - Top Words,b,c,sci,r,comp,center,center
Cluster 5 - Top Words,description,topics,sections,courses,curriculum,instructors,day
Cluster 6 - Top Words,undergraduate,undergraduate,programs,illinois,edu,students,advising
Cluster 7 - Top Words,honors,public,honors,contact,contact,office,research
Cluster 8 - Top Words,store,courses,store,curriculum,cs,courses,home
Cluster 9 - Top Words,security,data,networks,information,s,work,new


### K-Means Plot

In [35]:
tsne = TSNE(n_components=2, random_state=1)
twodim_docs_nc = tsne.fit_transform(dist_nc)
# tfidf_matrix_dense_nc = tfidf_matrix_nc.todense()
# docs_vecs_nc = np.array([tfidf_matrix_dense_nc[i].A1 for i in range(len(tfidf_matrix_dense_nc))])
clusters_colors_nc = [ get_color(i) for i in kmeans_labels_nc]

kmeans_data = scatter_plot(twodim_docs_nc, word_labels=urls_nc, colors=clusters_colors_nc)
py.iplot(kmeans_data, filename="K-Means t-SNE nocostraint - Doc Clustering")

---

# List-costraint documents

In [36]:
listcostraint_path = os.getcwd() + "/../dataset/cs.illinois.edu_ListConstraint.words1000.depth10/"
vertex_lc_path     = listcostraint_path + "vertex.txt"
map_lc_path        = listcostraint_path + "urlsMap.txt"

codecontent_map_lc = get_content_map(vertex_lc_path)
urlmap_lc = get_urlmap(map_lc_path)

# document list
documents_lc = [codecontent_map_lc[key] for key in codecontent_map_lc]
codes_lc     = [key for key in codecontent_map_lc]
urls_lc      = [urlmap_lc[key] for key in codecontent_map_lc]

#### Cosine-Similarity
Cosine similarity is measured against the tf-idf matrix and can be used to generate a measure of similarity between each document and the other documents in the corpus.

Subtracting it from 1 provides cosine distance which I will use for plotting on a euclidean (2-dimensional) plane.

In [38]:
tfidf_matrix_lc = tfidf_vectorizer.fit_transform(documents_lc)

dist_lc = 1 - cosine_similarity(tfidf_matrix_lc)

print(pd.DataFrame({"documents":tfidf_matrix_lc.shape[0], "terms":tfidf_matrix_lc.shape[1]}, index=[""]).T)
print("\n\n")

               
documents  1022
terms       370





## Clustering on List-costraint documents
#### K-Means

In [39]:
kmeans = KMeans(n_clusters=15)
kmeans_labels_lc = kmeans.fit_predict(tfidf_matrix_lc)
docs_lc = {
    'code': codes_lc,
    'document': documents_lc
}

frame_lc = pd.DataFrame(docs_lc, index = [kmeans_labels_lc] , columns = ['document', 'code'])
frame_lc[:5]

Unnamed: 0,document,code
7,engineering at illinois my cs illinois educomp...,344
7,engineering at illinois my cs illinois educomp...,345
1,engineering at illinois my cs illinois educomp...,346
8,engineering at illinois my cs illinois educomp...,347
11,engineering at illinois my cs illinois educomp...,340


#### Topic modeling
Some fancy indexing and sorting on each cluster to identify which are the top n words that are nearest to the cluster centroid. This gives an idea of the main topic of each the cluster.

In [43]:
# map -> {code: token_list}
tokens_lc_map = to_tokens_map(codecontent_map_lc)
# map -> {code: stem_list}
stems_lc_map = to_stems_map(codecontent_map_lc)

# total vocabulary, list of tokens
totalvocab_lc_stemmed = [stem for key in codecontent_map_lc for stem in stems_lc_map[key]]
# total vocabulary, list of stems
totalvocab_lc_tokenized = [stem for key in codecontent_map_lc for stem in tokens_lc_map[key]]

vocab_lc_frame = pd.DataFrame({'words': totalvocab_lc_tokenized}, index = totalvocab_lc_stemmed)
terms_lc = tfidf_vectorizer.get_feature_names()

# sort cluster centers by proximity to centroid
order_centroids_lc = kmeans.cluster_centers_.argsort()[:, ::-1]

num_clusters_lc = len(set(kmeans_labels_lc))
words_matrix_lc = [None] * num_clusters_lc
top_n = 7

for i in range(num_clusters_lc):
    cluster_chart = [vocab_lc_frame.ix[terms_lc[ind].split(' ')].values.tolist()[0][0] for ind in order_centroids_lc[i,:top_n]]
    words_matrix_lc[i] = cluster_chart
    
pd.DataFrame(
    words_matrix_lc, 
    index = ["Cluster " + str(i) + " - Top Words" for i in range(num_clusters_lc)],
    columns = list(range(1, top_n+1))
)

Unnamed: 0,1,2,3,4,5,6,7
Cluster 0 - Top Words,ph,ph,required,ms,graduate,form,d
Cluster 1 - Top Words,primary,primary,primary,research,professor,research,publications
Cluster 2 - Top Words,said,s,working,used,development,team,moone
Cluster 3 - Top Words,applications,programming,graduate,ms,fellowship,degree,campus
Cluster 4 - Top Words,b,c,sci,r,comp,center,center
Cluster 5 - Top Words,description,topic,section,courses,curriculum,instructor,day
Cluster 6 - Top Words,undergraduate,undergraduate,programming,illinois,edu,students,advising
Cluster 7 - Top Words,honors,publications,honors,contacts,contacts,offices,research
Cluster 8 - Top Words,store,courses,store,curriculum,cs,courses,home
Cluster 9 - Top Words,security,data,networking,information,s,working,new


### K-Means Plot

In [44]:
tfidf_matrix_lc_dense = tfidf_matrix_lc.todense()
docs_vecs_lc = np.array([tfidf_matrix_lc_dense[i].A1 for i in range(len(tfidf_matrix_lc_dense))])

clusters_colors_lc = [ get_color(i) for i in kmeans_labels_lc]

twodim_docs_lc = tsne.fit_transform(dist_lc)

k_tsne_data_lc = scatter_plot(twodim_docs_lc, word_labels=urls_lc, colors=clusters_colors_lc)
py.iplot(k_tsne_data_lc, filename="K-Means listcostraint - Doc Clustering")

---

# EVALUATION

### GROUND TRUTH

In [47]:
gt = GroundTruth(os.getcwd() + "/../dataset/ground_truth/urlToMembership.txt")
ground_truth_lc = [int(gt.get_groundtruth(urlmap_lc[key])) for key in codecontent_map_lc]

gt = GroundTruth(os.getcwd() + "/../dataset/ground_truth/urlToMembership.txt")
ground_truth_nc = [int(gt.get_groundtruth(urlmap_nc[key])) for key in codecontent_map_nc]

print("Clusters found manually:", len(set(ground_truth_nc)))
print([label for label in set(ground_truth_nc)])
print()
print("Clusters found manually:", len(set(ground_truth_lc)))
print([label for label in set(ground_truth_lc)])
print("\n\n")

Clusters found manually: 14
[0, 1, 2, 3, 4, 6, 8, 10, 11, 12, 13, 14, 15, -1]

Clusters found manually: 13
[0, 1, 2, 3, 4, 6, 8, 10, 12, 13, 14, 15, -1]





## DBSCAN and HDBSCAN nocostraint

In [48]:
dbscan = DBSCAN(eps=0.9, min_samples=4)
dbscan_labels_nc = dbscan.fit_predict(tfidf_matrix_nc)

print("Clusters found with DBSCAN:", len(set(dbscan_labels_nc)))
print ([label for label in set(dbscan_labels_nc)])
print("\n")

hdbscan = HDBSCAN(min_cluster_size=4)
hdbscan_labels_nc = hdbscan.fit_predict(tfidf_matrix_nc)

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_nc)))
print([label for label in set(hdbscan_labels_nc)])
print("\n\n")

Clusters found with DBSCAN: 16
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -1]


Clusters found with HDBSCAN: 15
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, -1]





## DBSCAN and HDBSCAN listcostraint

In [49]:
dbscan = DBSCAN(eps=0.7, min_samples=4)
dbscan_labels_lc = dbscan.fit_predict(tfidf_matrix_lc)

print("Clusters found with DBSCAN:", len(set(dbscan_labels_lc)))
print ([label for label in set(dbscan_labels_lc)])
print("\n")

hdbscan = HDBSCAN(min_cluster_size=7)
hdbscan_labels_lc = hdbscan.fit_predict(tfidf_matrix_lc)

print("Clusters found with HDBSCAN:", len(set(hdbscan_labels_lc)))
print([label for label in set(hdbscan_labels_lc)])
print("\n\n")

Clusters found with DBSCAN: 14
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, -1]


Clusters found with HDBSCAN: 13
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, -1]





## Metrics

In [72]:
metrics_df = pd.DataFrame([
        [
            # dbscan nocostraint
            metrics.homogeneity_score(ground_truth_nc, dbscan_labels_nc),
            metrics.completeness_score(ground_truth_nc, dbscan_labels_nc),
            metrics.v_measure_score(ground_truth_nc, dbscan_labels_nc),
            metrics.adjusted_rand_score(ground_truth_nc, dbscan_labels_nc),
            metrics.adjusted_mutual_info_score(ground_truth_nc, dbscan_labels_nc)
        ],
        [
            # hdbscan nocostraint
            metrics.homogeneity_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.completeness_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.v_measure_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.adjusted_rand_score(ground_truth_nc, hdbscan_labels_nc),
            metrics.adjusted_mutual_info_score(ground_truth_nc, hdbscan_labels_nc)
        ],
        [
            # kmeans nocostraint
            metrics.homogeneity_score(ground_truth_nc, kmeans_labels_nc),
            metrics.completeness_score(ground_truth_nc, kmeans_labels_nc),
            metrics.v_measure_score(ground_truth_nc, kmeans_labels_nc),
            metrics.adjusted_rand_score(ground_truth_nc, kmeans_labels_nc),
            metrics.adjusted_mutual_info_score(ground_truth_nc, kmeans_labels_nc)
        ],
        [
            # dbscan listcostraint
            metrics.homogeneity_score(ground_truth_lc, dbscan_labels_lc),
            metrics.completeness_score(ground_truth_lc, dbscan_labels_lc),
            metrics.v_measure_score(ground_truth_lc, dbscan_labels_lc),
            metrics.adjusted_rand_score(ground_truth_lc, dbscan_labels_lc),
            metrics.adjusted_mutual_info_score(ground_truth_lc, dbscan_labels_lc)
        ],
        [
            # hdbscan listcostraint
            metrics.homogeneity_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.completeness_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.v_measure_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.adjusted_rand_score(ground_truth_lc, hdbscan_labels_lc),
            metrics.adjusted_mutual_info_score(ground_truth_lc, hdbscan_labels_lc)
        ],
        [
            # kmeans listcostraint
            metrics.homogeneity_score(ground_truth_lc, kmeans_labels_lc),
            metrics.completeness_score(ground_truth_lc, kmeans_labels_lc),
            metrics.v_measure_score(ground_truth_lc, kmeans_labels_lc),
            metrics.adjusted_rand_score(ground_truth_lc, kmeans_labels_lc),
            metrics.adjusted_mutual_info_score(ground_truth_lc, kmeans_labels_lc)
        ]],
        index=["NoCostraint - DBSCAN", "NoCostraint - HDBSCAN", "NoCostraint - K-MEANS", "ListCostraint - DBSCAN", "ListCostraint - HDBSCAN", "ListCostraint - K-MEANS"],
        columns=["Homogeneity", "Completeness", "V-Measure core", "Adjusted Rand index", "Mutual Information"])

metrics_df

Unnamed: 0,Homogeneity,Completeness,V-Measure core,Adjusted Rand index,Mutual Information
NoCostraint - DBSCAN,0.560068,0.596247,0.577592,0.407829,0.534608
NoCostraint - HDBSCAN,0.515236,0.602887,0.555626,0.386213,0.48578
NoCostraint - K-MEANS,0.754334,0.572492,0.650952,0.311199,0.548965
ListCostraint - DBSCAN,0.570969,0.704237,0.630639,0.51967,0.556604
ListCostraint - HDBSCAN,0.450059,0.510352,0.478313,0.193805,0.429703
ListCostraint - K-MEANS,0.844222,0.606859,0.706127,0.440011,0.593167


# w2v - tfidf

In [None]:
from url_sequences.sequence_handler import *
sequences_lc = listcostraint_path + "sequenceIDs.txt"

# because of generator
vocab_sequences = get_seq(sequences_lc, 1)
train_sequences = get_seq(sequences_lc, 1)

In [None]:
from gensim.models import Word2Vec

w2v_model = Word2Vec(min_count=1, negative=5, size=48)
w2v_model.build_vocab(vocab_sequences)
w2v_model.train(train_sequences)

In [None]:
w2v_vecs_lc = np.array([w2v_model[key] for key in content_lc_map]) 
docs_vecs_lc = docs_vecs_lc

In [None]:
type(docs_vecs_lc)

In [None]:
tsne50 =  TSNE(n_components=50)
docs_vecs_lc_reduced = tsne50.fit_transform(docs_vecs_lc)
# vecs = [ np.concatenate((w2v_vecs_lc[i], docs_vecs_lc_reduced[i]), axis=0) for i in range(3)]
