In [1]:
import os
import numpy as np
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer

from url_sequences.sequence_manager import *
from url_sequences.sequence_plotter import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from sklearn.manifold import TSNE
from sklearn.manifold import MDS

from __future__ import print_function
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.graph_objs as go

In [2]:
# One-time setup: run the line below to download the English stopwords corpus
# nltk.download()
# English stopword list provided by NLTK
stopwords = nltk.corpus.stopwords.words('english')
# English stemmer (currently disabled)
# stemmer = SnowballStemmer("english")

In [3]:
# Dataset directory of the "NoConstraint" crawl, resolved against the current
# working directory. The trailing "" component keeps the trailing path
# separator that the previous plain string concatenation produced.
nocostraint_path = os.path.join(
    os.getcwd(), "dataset", "new", "cs.illinois.eduNoConstraint.words1000.depth.10", ""
)
vertex_nc_path = os.path.join(nocostraint_path, "vertex.txt")
map_nc_path = os.path.join(nocostraint_path, "urlsMap.txt")

# Mappings keyed by page code; the loaders presumably come from the
# url_sequences star imports (verify against that package).
content_nc_map = get_content_map(vertex_nc_path)
url_nc_map = get_urlmap(map_nc_path)

# map -> {code: token_list}
tokens_nc_map = to_tokens_map(content_nc_map)
# map -> {code: stem_list}
stems_nc_map = to_stems_map(content_nc_map)

# total vocabulary, list of stems
totalvocab_nc_stemmed = get_total_vocab(stems_nc_map)
# total vocabulary, list of tokens
totalvocab_nc_tokenized = get_total_vocab(tokens_nc_map)

# Parallel lists: position i of each list refers to the same page. The key
# list is materialised once so the alignment is explicit.
codes_nc = list(content_nc_map)
documents_nc = [content_nc_map[key] for key in codes_nc]
longurls_nc = [url_nc_map[key] for key in codes_nc]

In [4]:
# Dataset directory of the "ListConstraint" crawl, resolved against the current
# working directory. The trailing "" component keeps the trailing path
# separator, which later cells rely on when they append a file name.
listcostraint_path = os.path.join(
    os.getcwd(), "dataset", "new", "cs.illinois.edu.ListConstraint.words1000.depth10", ""
)
vertex_lc_path = os.path.join(listcostraint_path, "vertex.txt")
map_lc_path = os.path.join(listcostraint_path, "urlsMap.txt")

# Mappings keyed by page code; the loaders presumably come from the
# url_sequences star imports (verify against that package).
content_lc_map = get_content_map(vertex_lc_path)
url_lc_map = get_urlmap(map_lc_path)

# map -> {code: token_list}
tokens_lc_map = to_tokens_map(content_lc_map)
# map -> {code: stem_list}
stems_lc_map = to_stems_map(content_lc_map)

# total vocabulary, list of stems
totalvocab_lc_stemmed = get_total_vocab(stems_lc_map)
# total vocabulary, list of tokens
totalvocab_lc_tokenized = get_total_vocab(tokens_lc_map)

# Parallel lists: position i of each list refers to the same page. The key
# list is materialised once so the alignment is explicit.
codes_lc = list(content_lc_map)
documents_lc = [content_lc_map[key] for key in codes_lc]
longurls_lc = [url_lc_map[key] for key in codes_lc]

In [5]:
# Lookup table for the no-constraint corpus: the index holds the stems and the
# 'words' column the tokens (assumes both vocab lists are aligned position by
# position — verify against get_total_vocab).
vocab_nc_frame = pd.DataFrame(
    {'words': totalvocab_nc_tokenized},
    index=totalvocab_nc_stemmed,
)
vocab_nc_frame.head()

Unnamed: 0,words
depart,department
of,of
comput,computer
scienc,science
at,at


In [6]:
# Lookup table for the list-constraint corpus: the index holds the stems and
# the 'words' column the tokens (assumes both vocab lists are aligned position
# by position — verify against get_total_vocab).
vocab_lc_frame = pd.DataFrame(
    {'words': totalvocab_lc_tokenized},
    index=totalvocab_lc_stemmed,
)
vocab_lc_frame.head()

Unnamed: 0,words
engin,engineering
at,at
illinoi,illinois
my,my
cs,cs


## Term-Document Matrix
![Term-document matrix illustration](http://www.jiem.org/index.php/jiem/article/viewFile/293/252/2402 "Term-document matrix")

In [7]:
# TF-IDF vectorizer; the same instance is re-fit on each corpus further down.
#
# max_df:
#     upper bound on document frequency. As a float it is a proportion:
#     0.8 drops terms that appear in more than 80% of the documents.
# min_df:
#     lower bound on document frequency. As a float it is a proportion:
#     0.1 drops terms that appear in fewer than 10% of the documents
#     (an integer would instead be an absolute document count).
# max_features:
#     cap on the vocabulary size.
# ngram_range:
#     (1, 3) means unigrams, bigrams and trigrams are all considered.
# tokenizer:
#     tokenize_and_stem is not defined in this notebook; presumably it comes
#     from the url_sequences star imports (verify).
tfidf_vectorizer = TfidfVectorizer(
    max_df = 0.8,
    max_features = 200000,
    min_df = 0.1,
    stop_words = 'english',
    use_idf = True,
    tokenizer = tokenize_and_stem,
    ngram_range = (1,3)
)

## K-Means Clustering - No Constraint

In [8]:
# Fit the shared vectorizer on the no-constraint documents and build their
# TF-IDF matrix (one row per document).
tfidf_matrix_nc = tfidf_vectorizer.fit_transform(documents_nc) 

# Terms kept by the vectorizer. This must be read right after fitting, because
# the same vectorizer is re-fit on the other corpus later in the notebook.
terms_nc = tfidf_vectorizer.get_feature_names()

# Pairwise cosine distance (1 - cosine similarity) between every pair of documents
dist_nc = 1 - cosine_similarity(tfidf_matrix_nc)

# Sanity check: the matrix row count should equal the number of documents
print(tfidf_matrix_nc.shape, len(documents_nc))

(728, 433) 728


In [9]:
# K-Means with 15 clusters over the no-constraint TF-IDF matrix.
# random_state pins the centroid initialisation so the cluster ids (and every
# table/plot derived from them) are reproducible across kernel restarts; the
# seed value matches the one used for the MDS/TSNE estimators in this notebook.
km = KMeans(n_clusters=15, random_state=1)
km.fit(tfidf_matrix_nc)
# Cluster id of each document, in the same order as the rows of the matrix
clusters_nc = km.labels_.tolist()

In [10]:
# Per-document table for the no-constraint corpus: position in the corpus,
# document text, page code and assigned K-Means cluster.
docs_nc = {
    'code': codes_nc,
    'rank': range(len(documents_nc)),
    'document': documents_nc,
    'cluster': clusters_nc,
}
frame_nc = pd.DataFrame(
    docs_nc,
    index=[clusters_nc],
    columns=['rank', 'document', 'code', 'cluster'],
)

frame_nc.head()

Unnamed: 0,rank,document,code,cluster
5,0,john deere scholarship in computer science dep...,344,5
5,1,jump trading scholars department of computer s...,345,5
5,2,rockwell collins scholarship department of com...,346,5
5,3,spot trading scholarship department of compute...,347,5
5,4,illinois cyber security scholars program icssp...,340,5


In [11]:
# sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

for i in range(len(set(clusters_nc))):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :6]: # replace 6 with n words per cluster
        print(' %s' % vocab_nc_frame.ix[terms_nc[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()

Cluster 0 words: applications, requires, programs, graduate, ms, degrees,
Cluster 1 words: said, s, data, using, team, development,
Cluster 2 words: ieee, awards, society, s, real, time,
Cluster 3 words: win, awards, directory, home, directory, stories,
Cluster 4 words: curriculum, research, news, news, offered, class,
Cluster 5 words: undergraduate, undergraduate, scholarship, programs, students, advising,
Cluster 6 words: school, opportunities, high, s, advancing, cs,
Cluster 7 words: media, moone, s, work, networks, professor,
Cluster 8 words: security, information, networks, data, s, work,
Cluster 9 words: ph, ph, thesis, d, form, requires,
Cluster 10 words: graduate, programs, applications, fellowships, ms, deadline,
Cluster 11 words: parallel, performance, parallel, software, professor, high,
Cluster 12 words: offices, directory, offices, contacts, contacts, faculty,
Cluster 13 words: awards, distinguished, chair, s, member, siebel,
Cluster 14 words: siebel, siebel, center, offic

In [12]:
# 2-D embedding estimators, reused for both corpora in the plotting cells below.
# mds is configured for a precomputed dissimilarity matrix.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
# NOTE(review): tsne is later fit on the same distance matrices as mds but,
# unlike mds, is not configured for precomputed distances here — confirm that
# treating each row of the distance matrix as a feature vector is intended.
tsne = TSNE(n_components=2, random_state=1)

# K-Means Plot - MDS No Constraint

In [13]:
# Project the precomputed cosine distances of the no-constraint corpus to 2-D
two_dim_vec_nc = mds.fit_transform(dist_nc)

# Dense versions of the TF-IDF matrix: a numpy matrix and a plain 2-D ndarray
# with one row per document. toarray() builds the ndarray directly instead of
# flattening the dense matrix row by row in a Python loop.
tfidf_matrix_nc_dense = tfidf_matrix_nc.todense()
docs_vecs_nc = tfidf_matrix_nc.toarray()

# One colour per document, selected by its K-Means cluster id
clusters_colors_nc = [ get_color(i) for i in clusters_nc]

# Scatter plot of the MDS coordinates, labelled with the page URLs
k_data_nc = scatter_plot(two_dim_vec_nc, word_labels=longurls_nc, colors=clusters_colors_nc)
py.iplot(k_data_nc, filename="K-Means mds-Doc Clustering")

# K-Means Plot - TSNE No Constraint

In [14]:
# 2-D t-SNE embedding of the no-constraint corpus, fit on the cosine distance matrix.
# NOTE(review): the tsne estimator is not set up for precomputed distances, so
# the rows of dist_nc are presumably treated as feature vectors — confirm.
two_dim_tsne_vec_nc = tsne.fit_transform(dist_nc)

# Same labels and cluster colours as the MDS plot, so the two figures are comparable
k_tsne_data = scatter_plot(two_dim_tsne_vec_nc, word_labels=longurls_nc, colors=clusters_colors_nc)
py.iplot(k_tsne_data, filename="K-Means tsne-Doc Clustering")

## K-Means Clustering - List Constraint

In [15]:
# Re-fit the shared vectorizer on the list-constraint documents and build
# their TF-IDF matrix (one row per document). After this call the vectorizer
# no longer reflects the no-constraint corpus.
tfidf_matrix_lc = tfidf_vectorizer.fit_transform(documents_lc) 

# Terms kept by the vectorizer for this corpus
terms_lc = tfidf_vectorizer.get_feature_names()

# Pairwise cosine distance (1 - cosine similarity) between every pair of documents
dist_lc = 1 - cosine_similarity(tfidf_matrix_lc)

# Sanity check: the matrix row count should equal the number of documents
print(tfidf_matrix_lc.shape, len(documents_lc))

(1022, 370) 1022


In [16]:
# K-Means with 15 clusters over the list-constraint TF-IDF matrix.
# random_state pins the centroid initialisation so the cluster ids (and every
# table/plot derived from them) are reproducible across kernel restarts; the
# seed value matches the one used for the MDS/TSNE estimators in this notebook.
km_lc = KMeans(n_clusters=15, random_state=1)
km_lc.fit(tfidf_matrix_lc)
# Cluster id of each document, in the same order as the rows of the matrix
clusters_lc = km_lc.labels_.tolist()

In [17]:
# Per-document table for the list-constraint corpus: position in the corpus,
# document text, page code and assigned K-Means cluster.
docs_lc = {
    'code': codes_lc,
    'rank': range(len(documents_lc)),
    'document': documents_lc,
    'cluster': clusters_lc,
}
frame_lc = pd.DataFrame(
    docs_lc,
    index=[clusters_lc],
    columns=['rank', 'document', 'code', 'cluster'],
)

frame_lc.head(3)

Unnamed: 0,rank,document,code,cluster
1,0,engineering at illinois my cs illinois educomp...,344,1
1,1,engineering at illinois my cs illinois educomp...,345,1
4,2,engineering at illinois my cs illinois educomp...,346,4


In [18]:
# sort cluster centers by proximity to centroid
order_centroids_lc = km_lc.cluster_centers_.argsort()[:, ::-1]

for i in range(len(set(clusters_lc))):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids_lc[i, :6]: # replace 6 with n words per cluster
        print(' %s' % vocab_lc_frame.ix[terms_lc[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()

Cluster 0 words: networking, s, information, communications, awards, work,
Cluster 1 words: honors, publications, honors, contact, contact, office,
Cluster 2 words: undergraduate, undergraduate, programs, edu, illinois, students,
Cluster 3 words: parallel, algorithm, programs, languages, methods, software,
Cluster 4 words: primary, primary, primary, research, professor, office,
Cluster 5 words: applications, graduate, programs, ms, required, ph,
Cluster 6 words: moone, media, parallel, awards, professor, s,
Cluster 7 words: description, topic, section, courses, curriculum, instructors,
Cluster 8 words: s, said, work, data, development, using,
Cluster 9 words: awards, members, s, siebel, center, service,
Cluster 10 words: edu, illinois, edu, siebel, center, center,
Cluster 11 words: store, courses, store, curriculum, cs, courses,
Cluster 12 words: team, projects, s, said, using, design,
Cluster 13 words: b, c, r, t, design, hours,
Cluster 14 words: comp, siebel, center, siebel, comp, ce

# K-Means Plot - MDS List Constraint

In [19]:
# Project the precomputed cosine distances of the list-constraint corpus to 2-D
two_dim_vec_lc = mds.fit_transform(dist_lc)

# Dense versions of the TF-IDF matrix: a numpy matrix and a plain 2-D ndarray
# with one row per document. toarray() builds the ndarray directly instead of
# flattening the dense matrix row by row in a Python loop.
tfidf_matrix_lc_dense = tfidf_matrix_lc.todense()
docs_vecs_lc = tfidf_matrix_lc.toarray()

# One colour per document, selected by its K-Means cluster id
clusters_colors_lc = [ get_color(i) for i in clusters_lc]

# Scatter plot of the MDS coordinates, labelled with the page URLs
k_data_lc = scatter_plot(two_dim_vec_lc, word_labels=longurls_lc, colors=clusters_colors_lc)
py.iplot(k_data_lc, filename="K-Means mds-Doc Clustering-List Costraint")

# K-Means Plot - TSNE List Constraint

In [20]:
# 2-D t-SNE embedding of the list-constraint corpus, fit on the cosine distance matrix.
# NOTE(review): the tsne estimator is not set up for precomputed distances, so
# the rows of dist_lc are presumably treated as feature vectors — confirm.
two_dim_tsne_vec_lc = tsne.fit_transform(dist_lc)

# Same labels and cluster colours as the MDS plot, so the two figures are comparable
k_tsne_data_lc = scatter_plot(two_dim_tsne_vec_lc, word_labels=longurls_lc, colors=clusters_colors_lc)
py.iplot(k_tsne_data_lc, filename="K-Means tsne-Doc Clustering-List Costraint")

# w2v - tfidf

In [21]:
from url_sequences.sequence_handler import *
# Sequence file that lives in the list-constraint dataset directory
sequences_lc = listcostraint_path + "sequenceIDs.txt"

# get_seq is called twice on purpose: it presumably returns a one-shot
# generator, so the vocabulary pass and the training pass each need their own
# instance (TODO confirm against url_sequences.sequence_handler).
vocab_sequences = get_seq(sequences_lc, 1)
train_sequences = get_seq(sequences_lc, 1)

In [22]:
from gensim.models import Word2Vec

# Word2Vec model over the sequences read from sequenceIDs.txt (presumably
# sequences of page codes): min_count=1 keeps every item, negative=5 and
# size=48 are passed through to gensim unchanged.
w2v_model = Word2Vec(min_count=1, negative=5, size=48)
# The vocabulary pass and the training pass each consume their own sequence iterable
w2v_model.build_vocab(vocab_sequences)
w2v_model.train(train_sequences)

1000000

In [23]:
# One Word2Vec vector per page code, in the iteration order of content_lc_map
# (the same order used to build the list-constraint document lists).
# The former `docs_vecs_lc = docs_vecs_lc` self-assignment was a no-op and has
# been dropped; docs_vecs_lc keeps the value computed in the MDS cell.
w2v_vecs_lc = np.array([w2v_model[key] for key in content_lc_map])

In [None]:
# Sanity check: show the container type of the TF-IDF document vectors
type(docs_vecs_lc)

numpy.ndarray

In [None]:
# Reduce the TF-IDF document vectors to 50 dimensions with t-SNE.
# method="exact" is required here: the default Barnes-Hut solver only supports
# n_components < 4 and raises a ValueError when asked for 50 components.
# random_state uses the same seed as the other embeddings for reproducibility.
tsne50 = TSNE(n_components=50, method="exact", random_state=1)
docs_vecs_lc_reduced = tsne50.fit_transform(docs_vecs_lc)
# vecs = [ np.concatenate((w2v_vecs_lc[i], docs_vecs_lc_reduced[i]), axis=0) for i in range(3)]
