In [2]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer
from url_sequences.sequence_manager import *
from url_sequences.sequence_plotter import *
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# Locate the crawl dump relative to the current working directory.
# The trailing "" keeps the trailing separator that the old string form had.
path = os.path.join(
    os.getcwd(),
    "dataset",
    "new",
    "cs.illinois.eduNoConstraint.words1000.depth.10",
    "",
)
filepath = os.path.join(path, "vertex.txt")
mappath = os.path.join(path, "urlsMap.txt")

# Project helpers from url_sequences; the comments below follow the
# original author's description of their return values.
content_map = get_content_map(filepath)
url_map = get_urlmap(mappath)

# English stopwords. If the corpus is missing, download it once with
# nltk.download('stopwords').
# NOTE(review): `stopwords` is not referenced by the cells visible here.
stopwords = nltk.corpus.stopwords.words('english')

# map -> {code: token_list}
tokens_map = to_tokens_map(content_map)
# map -> {code: stem_list}
stems_map = to_stems_map(content_map)

# total vocabulary, list of stems
totalvocab_stemmed = get_total_vocab(stems_map)
# total vocabulary, list of tokens
totalvocab_tokenized = get_total_vocab(tokens_map)

# Parallel lists: position i of each list describes the same page.
# Iterate the keys once so the three lists are aligned by construction.
codes = list(content_map)
documents = [content_map[code] for code in codes]
longurls = [url_map[code] for code in codes]

In [5]:
# Lookup table: the index holds the stems, the 'words' column the
# corresponding unstemmed tokens (both lists must have the same length).
vocab_frame = pd.DataFrame(
    dict(words=totalvocab_tokenized),
    index=totalvocab_stemmed,
)
vocab_frame.head(5)

Unnamed: 0,words
depart,department
of,of
comput,computer
scienc,science
at,at


## Term-Document Matrix
![Alt text](http://www.jiem.org/index.php/jiem/article/viewFile/293/252/2402 "Very nice")

In [6]:
# TfidfVectorizer settings used below:
#   max_df:
#       ignore terms that occur in more than this share of the documents
#       (0.8 -> terms present in over 80% of the pages are dropped).
#   min_df:
#       ignore terms that occur in fewer documents than this threshold.
#       A float is a proportion of the corpus (0.2 -> the term must be in
#       at least 20% of the pages); an int would be an absolute count.
#   max_features:
#       upper bound on the vocabulary size, keeping the most frequent terms.
#   stop_words:
#       'english' selects scikit-learn's built-in English stop-word list.
#   tokenizer:
#       project helper that tokenizes and stems a document.
#   ngram_range:
#       (1, 3) means unigrams, bigrams and trigrams are all used as terms.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=200000,
    min_df=0.2,
    stop_words='english',
    use_idf=True,
    tokenizer=tokenize_and_stem,
    ngram_range=(1, 3),
)

# Fit the vectorizer on the page texts; rows are documents, columns are terms.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print(tfidf_matrix.shape, len(documents))

((728, 156), 728)


In [7]:
# Vocabulary kept by the vectorizer (no stop words, no duplicates);
# position j is the term behind column j of tfidf_matrix.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(), so use whichever this installation provides.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

# Pairwise cosine distance (1 - cosine similarity) between every pair of
# documents; used as the dissimilarity matrix for the 2-D projections.
dist = 1 - cosine_similarity(tfidf_matrix)

## K-Means Clustering

In [8]:
from sklearn.cluster import KMeans

# k-means starts from randomly chosen centroids, so without a fixed seed the
# cluster ids (and the plots/tables built from them) change on every run.
# The seed matches the random_state used for MDS in this notebook.
km = KMeans(n_clusters=15, random_state=1)
km.fit(tfidf_matrix)

# One cluster id per document, in the same order as `documents`.
clusters = km.labels_.tolist()

In [9]:
# One row per document: its position, text, page code and assigned cluster.
docs = {
    'code': codes,
    'rank': range(len(documents)),
    'document': documents,
    'cluster': clusters,
}

# Index the rows by cluster id. The label list is passed directly: wrapping
# it in another list makes pandas build a one-level MultiIndex instead of a
# plain index.
frame = pd.DataFrame(
    docs,
    index=clusters,
    columns=['rank', 'document', 'code', 'cluster'],
)

frame[:10]

Unnamed: 0,rank,document,code,cluster
0,0,john deere scholarship in computer science dep...,344,0
0,1,jump trading scholars department of computer s...,345,0
0,2,rockwell collins scholarship department of com...,346,0
0,3,spot trading scholarship department of compute...,347,0
0,4,illinois cyber security scholars program icssp...,340,0
0,5,senior honorary department of computer science...,341,0
0,6,cisco systems wcs undergraduate scholarship de...,342,0
0,7,crowe horwath llp outstanding computer science...,343,0
0,8,state farm computer science scholarship depart...,348,0
0,9,afcea cyber studies scholarship department of ...,349,0


In [11]:
from __future__ import print_function

# For every centroid, the term (column) indices ordered from the highest
# to the lowest centroid weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Iterate over the centroids themselves so every cluster is listed, even if
# one of them ended up with no documents assigned.
for i in range(len(order_centroids)):
    print("Cluster %d words:" % i, end='')

    for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
        # Map the term's stems back to a readable token. `.loc` replaces
        # `.ix`, which was removed in pandas 1.0. For an n-gram only the
        # first returned row (the first stem's token) is shown.
        word = vocab_frame.loc[terms[ind].split(' ')].values.tolist()[0][0]
        # Python 2 unicode values are encoded as before; native `str` values
        # are printed as they are, so Python 3 does not show b'...'.
        if not isinstance(word, str):
            word = word.encode('utf-8', 'ignore')
        print(' %s' % word, end=',')
    print()  # end the line for this cluster

Cluster 0 words: undergraduate, programs, students, academic, awards, graduate,
Cluster 1 words: high, s, information, nation, programs, advancing,
Cluster 2 words: offices, research, news, news, related, receives,
Cluster 3 words: requires, d, ms, graduate, graduate, programs,
Cluster 4 words: data, information, s, using, work, said,
Cluster 5 words: awards, receives, cs, siebel, offices, research,
Cluster 6 words: said, s, work, projects, development, team,
Cluster 7 words: research, news, news, offices, click, years,
Cluster 8 words: graduate, programs, applications, ms, campus, degrees,
Cluster 9 words: awards, professor, s, performance, moone, work,
Cluster 10 words: siebel, siebel, center, offices, academic, undergraduate,
Cluster 11 words: awards, s, member, siebel, siebel, cs,
Cluster 12 words: media, moone, s, professor, director, programs,
Cluster 13 words: applications, programs, graduate, requires, ms, degrees,
Cluster 14 words: networks, data, information, awards, s, work,

In [13]:
from sklearn.manifold import MDS

# Embed the documents in 2-D from the precomputed cosine-distance matrix.
# random_state pins the layout so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
two_dim_vec = mds.fit_transform(dist)

# Dense copies of the TF-IDF matrix: a numpy matrix and a plain 2-D ndarray
# with one row per document. np.array copies the matrix in one call instead
# of flattening it row by row in a Python loop.
tfidf_matrix_dense = tfidf_matrix.todense()
docs_vecs = np.array(tfidf_matrix_dense)

In [23]:
from sklearn.manifold import TSNE

# t-SNE is stochastic; fix the seed (same value as the MDS cell) so the
# embedding and the plot built from it are reproducible.
tsne = TSNE(n_components=2, random_state=1)

# NOTE(review): `dist` is passed as an ordinary feature matrix here, because
# TSNE's default metric is euclidean rather than "precomputed". Confirm
# whether the intent was metric="precomputed" or embedding the TF-IDF
# document vectors directly.
two_dim_docs_vec = tsne.fit_transform(dist)

In [15]:
# One plot colour per document, looked up from its cluster id.
clusters_colors = list(map(get_color, clusters))

In [16]:
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.graph_objs as go

# Scatter of the MDS projection: one point per page, labelled with its URL
# and coloured by k-means cluster.
k_data = scatter_plot(
    two_dim_vec,
    word_labels=longurls,
    colors=clusters_colors,
)
py.iplot(
    k_data,
    filename="K-Means mds-Doc Clustering",
)

In [22]:
# Same scatter as for MDS, but using the t-SNE projection of the documents.
k_tfidf_data = scatter_plot(
    two_dim_docs_vec,
    word_labels=longurls,
    colors=clusters_colors,
)
py.iplot(
    k_tfidf_data,
    filename="K-Means tsne-Doc Clustering",
)