In [1]:
import os
import numpy as np
import pandas as pd

from url_sequences.sequence_manager import *
from url_sequences.sequence_plotter import *
from url_sequences.sequence_handler import *
from url_sequences.clustering_metrics import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

from sklearn.cluster import KMeans
from hdbscan import HDBSCAN

import plotly.plotly as py
from plotly.graph_objs import *
import plotly.graph_objs as go

from gensim.models import Word2Vec

### Loading the no-constraint & list-constraint Datasets
- **content_nc_map** -> dict {code: content_string}
- **url_nc_map** -> dict {code: longurl}
- **pages_content_nc** -> list [content strings]
- **codes_nc** -> list [codes]
- **longurls_nc** -> list [longurls]


In [2]:
"""Real membership List"""
rm = RealMembership()

# Both crawls are stored under the same root directory and share one layout:
# vertex.txt is fed to get_content_map, urlsMap.txt to get_urlmap.
dataset_root = os.getcwd() + "/dataset/new/"


# ---- No-Constraint crawl ----
nocostraint_path = dataset_root + "cs.illinois.eduNoConstraint.words1000.depth.10/"
vertex_nc_path = nocostraint_path + "vertex.txt"
map_nc_path = nocostraint_path + "urlsMap.txt"

content_nc_map = get_content_map(vertex_nc_path)  # dict: code -> content string
url_nc_map = get_urlmap(map_nc_path)              # dict: code -> long URL

# Fix the page order once (iteration order of content_nc_map) and derive every
# parallel list from it, so index i refers to the same page in all of them.
codes_nc = list(content_nc_map)
pages_content_nc = [content_nc_map[code] for code in codes_nc]
longurls_nc = [url_nc_map[code] for code in codes_nc]
# Ground-truth membership of each page, looked up by its long URL.
real_membership_nc = [rm.get_membership(url) for url in longurls_nc]


# ---- List-Constraint crawl ----
listcostraint_path = dataset_root + "cs.illinois.edu.ListConstraint.words1000.depth10/"
vertex_lc_path = listcostraint_path + "vertex.txt"
map_lc_path = listcostraint_path + "urlsMap.txt"

content_lc_map = get_content_map(vertex_lc_path)  # dict: code -> content string
url_lc_map = get_urlmap(map_lc_path)              # dict: code -> long URL

# Same construction as above: one fixed page order, shared by all lists.
codes_lc = list(content_lc_map)
pages_content_lc = [content_lc_map[code] for code in codes_lc]
longurls_lc = [url_lc_map[code] for code in codes_lc]
# Ground-truth membership of each page, looked up by its long URL.
real_membership_lc = [rm.get_membership(url) for url in longurls_lc]

['http://cs.illinois.edu/', '-1\n']
['http://cs.illinois.edu/', '-1\n']
['http://cs.illinois.edu/45.55.227.170/', '-1\n']
['http://cs.illinois.edu/about-us/', '-1\n']
['http://cs.illinois.edu/about-us/about-siebel-center/', '-1\n']
['http://cs.illinois.edu/about-us/awards/', '-1\n']
['http://cs.illinois.edu/about-us/awards/best-paper-awards/', '-1\n']
['http://cs.illinois.edu/about-us/awards/distinguished-alumni/', '-1\n']
['http://cs.illinois.edu/about-us/awards/distinguished-alumni/2011-distinguished-achievement-award/', '0\n']
['http://cs.illinois.edu/about-us/awards/distinguished-alumni/2011-distinguished-educator-award/', '0\n']
['http://cs.illinois.edu/about-us/awards/distinguished-alumni/2011-distinguished-service-award/', '0\n']
['http://cs.illinois.edu/about-us/awards/distinguished-alumni/2011-memorial-achievement-award/', '0\n']
['http://cs.illinois.edu/about-us/awards/distinguished-alumni/2012-distinguished-achievement-award/', '0\n']
['http://cs.illinois.edu/about-us/awards

### TFIDF Matrix
Term frequency - inverse document frequency (tf-idf) vectorizer parameters.
To get a Tf-idf matrix, first count word occurrences by document. This is transformed into a document-term matrix (dtm). This is also just called a term frequency matrix.

![Alt text](http://www.codeproject.com/KB/WPF/NNMFSearchResultClusterin/table.jpg "DTM") 

Then apply the term frequency-inverse document frequency weighting: words that occur frequently within a document but not frequently within the corpus receive a higher weighting as these words are assumed to contain more meaning in relation to the document.

Parameters defined below:

**max_df**: the maximum document frequency a feature can have and still be used in the tf-idf matrix. If a term appears in more than 80% of the documents it probably carries little meaning (here, in the context of the crawled web pages).

**min_df**: this could be an integer (e.g. 5), in which case the term would have to appear in at least 5 of the documents to be considered. Here I pass 0.1, so the term must appear in at least 10% of the documents. I found that if I allowed a lower min_df I ended up basing clustering on names--for example "Michael" or "Tom" are names found in several of the movies and the synopses use these names frequently, but the names carry no real meaning.

**ngram_range**: this just means I'll look at unigrams, bigrams and trigrams. See n-grams


In [None]:
# One TF-IDF configuration shared by both datasets.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize_and_stem,   # project tokenizer passed through unchanged
    stop_words='english',
    ngram_range=(1, 3),            # unigrams, bigrams and trigrams
    min_df=0.1,                    # drop terms found in fewer than 10% of the pages
    max_df=0.8,                    # drop terms found in more than 80% of the pages
    max_features=200000,
    use_idf=True,
)

# fit_transform learns the vocabulary and IDF weights from the corpus it is
# given, so each call below refits the vectorizer: the two matrices have
# independent feature spaces, and afterwards the vectorizer holds the
# List-Constraint vocabulary only.
tfidf_matrix_nc = tfidf_vectorizer.fit_transform(pages_content_nc)  # No-Constraint
tfidf_matrix_lc = tfidf_vectorizer.fit_transform(pages_content_lc)  # List-Constraint

### Cosine Similarity
The distance matrices `dist_nc` and `dist_lc` are defined as 1 minus the pairwise cosine similarity between documents. Cosine similarity is computed on the rows of the tf-idf matrix and measures how similar each document is to every other document in the corpus.

In [None]:
"""No-Costraint"""
# Pairwise cosine distance between the TF-IDF rows of the No-Constraint pages:
# distance = 1 - cosine similarity.
dist_nc = 1.0 - cosine_similarity(tfidf_matrix_nc)

# Same computation for the List-Constraint pages.
dist_lc = 1.0 - cosine_similarity(tfidf_matrix_lc)

### Word2Vec
Applying word2vec (skip-gram with negative sampling) to sequences generated by crawling the web site. Vectors generated for each web-page are stored in **w2v_vecs_nc** and **w2v_vecs_lc** 

In [None]:
"""No-Costraint"""
sequences_nc = nocostraint_path + "sequenceIDs.txt"

# Two independent streams over the same file: one for building the vocabulary
# and one for training. The original note says this is "because of generator",
# i.e. presumably get_seq yields a one-shot iterator that cannot be replayed.
vocab_sequences_nc, train_sequences_nc = get_seq(sequences_nc, 1), get_seq(sequences_nc, 1)

# ---- List-Constraint ----
sequences_lc = listcostraint_path + "sequenceIDs.txt"

# Same pair of independent streams for the List-Constraint crawl.
vocab_sequences_lc, train_sequences_lc = get_seq(sequences_lc, 1), get_seq(sequences_lc, 1)

In [None]:
"""No-Costraint"""
# 48-dimensional vectors, 5 negative samples; min_count=1 keeps every page
# code in the vocabulary, even one that occurs a single time.
w2v_model_nc = Word2Vec(size=48, negative=5, min_count=1)
w2v_model_nc.build_vocab(vocab_sequences_nc)
w2v_model_nc.train(train_sequences_nc)

# One vector per page, in content_nc_map iteration order, so the rows line up
# with the other per-page lists built from that dict.
page_vectors_nc = [w2v_model_nc[page_code] for page_code in content_nc_map]
w2v_vecs_nc = np.array(page_vectors_nc)

# ---- List-Constraint ----
# Separate model with the same hyper-parameters.
w2v_model_lc = Word2Vec(size=48, negative=5, min_count=1)
w2v_model_lc.build_vocab(vocab_sequences_lc)
w2v_model_lc.train(train_sequences_lc)

# One vector per page, in content_lc_map iteration order.
page_vectors_lc = [w2v_model_lc[page_code] for page_code in content_lc_map]
w2v_vecs_lc = np.array(page_vectors_lc)

### Latent Semantic Analysis (LSA)
Dimensionality reduction using **truncated SVD** (aka LSA).
This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). 

It is very similar to PCA, but operates on sample vectors directly, instead of on a covariance matrix.

In particular, truncated SVD works on term count/tf-idf matrices as returned by the vectorizers. In that context, it is known as latent semantic analysis (LSA).

In [None]:
# LSA: project each TF-IDF matrix onto 50 components. A single estimator is
# reused; every fit_transform call refits it on the matrix it is given.
svd = TruncatedSVD(
    n_components=50,
    algorithm="arpack",
    random_state=1,
)

pages_tfidf_vecs_nc = svd.fit_transform(tfidf_matrix_nc)  # No-Constraint
pages_tfidf_vecs_lc = svd.fit_transform(tfidf_matrix_lc)  # List-Constraint

### Combining Vectors from word2vec and TFIDF
For each page, the LSA-reduced TF-IDF vector is appended to the end of the corresponding word2vec vector.

In [None]:
"""No-Costraint"""
# Per page: word2vec vector followed by the LSA-reduced TF-IDF vector.
# Row i of both inputs is taken to be the same page.
combined_vecs_nc = [
    np.append(w2v_vecs_nc[row], tfidf_vec)
    for row, tfidf_vec in enumerate(pages_tfidf_vecs_nc)
]

# ---- List-Constraint ----
combined_vecs_lc = [
    np.append(w2v_vecs_lc[row], tfidf_vec)
    for row, tfidf_vec in enumerate(pages_tfidf_vecs_lc)
]

### T-SNE
Reducing vectors dimensionality to 2 for plot purposes

In [None]:
# 2-D embedding of the combined vectors, used only for plotting.
# The seed is pinned so the layout is repeatable between runs.
tsne = TSNE(
    n_components=2,
    random_state=1,
)

twodim_nc = tsne.fit_transform(combined_vecs_nc)  # No-Constraint
twodim_lc = tsne.fit_transform(combined_vecs_lc)  # List-Constraint

### K-Means Clustering

In [None]:
# K-Means on the TF-IDF matrices (15 clusters per dataset).
# random_state is pinned to the same value the SVD and t-SNE cells use:
# K-Means starts from random centroids, so without a seed the cluster ids,
# and therefore the plot colours, change on every run.
kmeans = KMeans(n_clusters=15, random_state=1)

# ---- No-Constraint ----
# fit() returns the estimator; labels_ holds one cluster id per page.
kmeans_clusters_nc = kmeans.fit(tfidf_matrix_nc).labels_
kmeans_colors_nc = [get_color(cluster_id) for cluster_id in kmeans_clusters_nc]

# ---- List-Constraint ----
# Refitting the same estimator replaces labels_ with a new array, so the
# No-Constraint labels captured above are not affected.
kmeans_clusters_lc = kmeans.fit(tfidf_matrix_lc).labels_
kmeans_colors_lc = [get_color(cluster_id) for cluster_id in kmeans_clusters_lc]

### K-Means No-constraint plot

In [None]:
# Plot the 2-D t-SNE embedding of the No-Constraint pages, coloured by the
# K-Means cluster ids; the long URLs are passed as point labels.
# Note: the clusters come from the TF-IDF matrix alone, while the embedding
# comes from the combined word2vec + TF-IDF vectors.
kmeans_data_nc = scatter_plot(
    twodim_nc,
    word_labels=longurls_nc,
    colors=kmeans_colors_nc,
)
py.iplot(kmeans_data_nc, filename="K-Means TFIDF-W2V Clustering - No-Costraint")

### K-Means List-constraint plot

In [None]:
# Plot the 2-D t-SNE embedding of the List-Constraint pages, coloured by the
# K-Means cluster ids; the long URLs are passed as point labels.
# Note: the clusters come from the TF-IDF matrix alone, while the embedding
# comes from the combined word2vec + TF-IDF vectors.
kmeans_data_lc = scatter_plot(
    twodim_lc,
    word_labels=longurls_lc,
    colors=kmeans_colors_lc,
)
py.iplot(kmeans_data_lc, filename="K-Means TFIDF-W2V Clustering - List-Costraint")

### HDBSCAN Clustering

In [None]:
# Density-based clustering of the TF-IDF matrices. One estimator is reused;
# each fit_predict call refits it on the matrix it is given.
hdbscan = HDBSCAN(min_cluster_size=7)

# ---- No-Constraint ----
hdbscan_labels_nc = hdbscan.fit_predict(tfidf_matrix_nc)
hdbscan_colors_nc = [get_color(n_clust) for n_clust in hdbscan_labels_nc]

# Distinct labels found. HDBSCAN labels noise points -1, so that label is
# part of the count whenever noise is present.
distinct_labels_nc = list(set(hdbscan_labels_nc))
# Single-argument print(...) produces the same text on Python 2 and 3.
print("Clusters found with HDBSCAN in No-costraint Dataset: %d" % len(distinct_labels_nc))
print("%s \n" % distinct_labels_nc)

# ---- List-Constraint ----
hdbscan_labels_lc = hdbscan.fit_predict(tfidf_matrix_lc)
hdbscan_colors_lc = [get_color(n_clust) for n_clust in hdbscan_labels_lc]

# Fix: this summary used to read hdbscan_labels_nc (copy-paste slip), so it
# repeated the No-Constraint numbers under the List-Constraint heading.
distinct_labels_lc = list(set(hdbscan_labels_lc))
print("Clusters found with HDBSCAN in List-costraint Dataset: %d" % len(distinct_labels_lc))
print("%s" % distinct_labels_lc)

### HDBSCAN No-constraint plot

In [None]:
# Plot the 2-D t-SNE embedding of the No-Constraint pages, coloured by the
# HDBSCAN labels; the long URLs are passed as point labels.
hdbscan_data_nc = scatter_plot(
    twodim_nc,
    word_labels=longurls_nc,
    colors=hdbscan_colors_nc,
)
py.iplot(hdbscan_data_nc, filename="HDBSCAN TFIDF-W2V Clustering - No-Costraint")

### HDBSCAN List-constraint plot

In [None]:
# Plot the 2-D t-SNE embedding of the List-Constraint pages, coloured by the
# HDBSCAN labels; the long URLs are passed as point labels.
hdbscan_data_lc = scatter_plot(
    twodim_lc,
    word_labels=longurls_lc,
    colors=hdbscan_colors_lc,
)
py.iplot(hdbscan_data_lc, filename="HDBSCAN TFIDF-W2V Clustering - List-Costraint")

In [None]:
# Sanity check: number of labels HDBSCAN returned for the List-Constraint
# matrix (one label per row passed to fit_predict).
print(len(hdbscan_labels_lc))

In [None]:
# Presence table between ground-truth groups (rows) and HDBSCAN clusters
# (columns) for the List-Constraint dataset: conf_table[r, c] is 1 when at
# least one page of real group r was assigned to found cluster c, else 0.

# Normalise both label sequences to plain Python ints up front. This replaces
# the former isinstance(..., int) asserts, which rejected numpy integer labels
# on some platforms/Python versions and required a later cell to have already
# converted real_membership_lc. int() raises ValueError on a non-integer label.
real_membership_list = [int(label) for label in real_membership_lc]
clusters_found_labels = [int(label) for label in hdbscan_labels_lc]

assert len(real_membership_list) == len(clusters_found_labels), \
    "Real membership and cluster label lists differ in length"

real_clusters_set = set(real_membership_list)
found_clusters_set = set(clusters_found_labels)

# Explicit label -> position maps. Indexing the table with the raw labels let
# -1 (noise / unassigned) wrap around to the last row or column and raised
# IndexError for non-contiguous labels. Rows and columns now follow the
# ascending label order stored in these two lists.
real_labels_sorted = sorted(real_clusters_set)
found_labels_sorted = sorted(found_clusters_set)
row_of = dict((label, pos) for pos, label in enumerate(real_labels_sorted))
col_of = dict((label, pos) for pos, label in enumerate(found_labels_sorted))

# int8 is sufficient: cells only ever hold 0 or 1 (presence, not counts).
conf_table = np.zeros((len(real_labels_sorted), len(found_labels_sorted)), dtype="int8")

# Single pass over the pages instead of one scan of all pages per real group.
for real_label, found_label in zip(real_membership_list, clusters_found_labels):
    conf_table[row_of[real_label], col_of[found_label]] = 1

In [None]:
# Coerce the List-Constraint ground-truth labels to int. Re-running the cell
# is harmless because int() of an int is the same int.
real_membership_lc = list(map(int, real_membership_lc))
# Show the distinct ground-truth groups.
print(set(real_membership_lc))

In [None]:
# Pass the List-Constraint ground-truth labels and the HDBSCAN labels to the
# project helper; as the last expression of the cell, its result is displayed.
get_confusion_table(real_membership_lc, hdbscan_labels_lc)

In [None]:
"""from sklearn.manifold import TSNE
import numpy as np
X = np.random.rand(100,100)
t = TSNE(n_components=20)
t.fit(X)"""