In [1]:
import os
import numpy as np
import pandas as pd

from url_sequences.sequence_manager import *
from url_sequences.sequence_plotter import *

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

from sklearn.manifold import TSNE
from sklearn.manifold import MDS

from __future__ import print_function
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.graph_objs as go

In [2]:
# Dataset directory (resolved against the current working directory) and the
# two input files read from it.
nocostraint_path = os.getcwd() + "/dataset/new/cs.illinois.eduNoConstraint.words1000.depth.10/"
vertex_nc_path = nocostraint_path + "vertex.txt"
map_nc_path = nocostraint_path + "urlsMap.txt"

# Page content and URL lookup tables, both keyed by page code.
content_nc_map = get_content_map(vertex_nc_path)
url_nc_map = get_urlmap(map_nc_path)

# {code: token_list}
tokens_nc_map = to_tokens_map(content_nc_map)
# {code: stem_list}
stems_nc_map = to_stems_map(content_nc_map)

# Total vocabulary as a flat list of stems ...
totalvocab_nc_stemmed = get_total_vocab(stems_nc_map)
# ... and the same vocabulary as a flat list of unstemmed tokens.
totalvocab_nc_tokenized = get_total_vocab(tokens_nc_map)

# Parallel lists: position i of each list refers to the same page.
codes_nc = list(content_nc_map)
documents_nc = [content_nc_map[key] for key in codes_nc]
longurls_nc = [url_nc_map[key] for key in codes_nc]

In [3]:
# Stem -> token lookup table: the index holds the stems and the 'words' column
# the corresponding tokens (assumes both vocabulary lists are aligned
# position by position -- TODO confirm against get_total_vocab).
vocab_nc_frame = pd.DataFrame(
    data={'words': totalvocab_nc_tokenized},
    index=totalvocab_nc_stemmed,
)
vocab_nc_frame.head()

Unnamed: 0,words
depart,department
of,of
comput,computer
scienc,science
at,at


In [4]:
""" 
max_df:
    this is the maximum document frequency a given feature can have
    and still be used in the tf-idf matrix (as a float it is a
    proportion of the documents, e.g. 0.8 means at most 80%).
min_df:
    this could be an integer (an absolute document count) or a float
    (e.g. 0.2), in which case the term would have to be in at least
    20% of the documents to be considered.
ngram_range:
    (e.g. 1,3) this just means I'll look at unigrams, bigrams and trigrams.
"""
# TF-IDF vectorizer shared by the cells below.
tfidf_vectorizer = TfidfVectorizer(
    # Tokenisation: project tokenizer, n-grams of length 1 to 3.
    tokenizer=tokenize_and_stem,
    ngram_range=(1, 3),
    stop_words='english',
    # Vocabulary pruning by document frequency and by vocabulary size.
    min_df=0.1,
    max_df=0.8,
    max_features=200000,
    # Weighting.
    use_idf=True
)

In [5]:
# Two-dimensional reducers, both seeded so repeated runs give the same layout.
# MDS is configured to receive a precomputed dissimilarity matrix rather than
# raw feature vectors.
mds = MDS(
    n_components=2,
    dissimilarity="precomputed",
    random_state=1,
)
tsne = TSNE(
    n_components=2,
    random_state=1,
)

In [6]:
# Fit the vectorizer on the documents; the result is a sparse matrix with one
# row per document and one column per retained term.
tfidf_matrix_nc = tfidf_vectorizer.fit_transform(documents_nc)

# Terms kept by the vectorizer (stop words and duplicates removed).
terms_nc = tfidf_vectorizer.get_feature_names()

# Pairwise cosine distance between documents (1 - cosine similarity).
dist_nc = 1 - cosine_similarity(tfidf_matrix_nc)

# Sanity check: the matrix row count should equal the number of documents.
print(tfidf_matrix_nc.shape, len(documents_nc))

# 2-D embedding of the documents computed from the precomputed distances.
two_dim_vec_nc = mds.fit_transform(dist_nc)

# Dense copies of the TF-IDF matrix: `tfidf_matrix_nc_dense` is a numpy
# matrix, `docs_vecs_nc` a plain 2-D ndarray with one row per document.
# toarray() produces the same values as converting the matrix row by row in
# a Python loop, without the per-row overhead.
tfidf_matrix_nc_dense = tfidf_matrix_nc.todense()
docs_vecs_nc = tfidf_matrix_nc.toarray()

(728, 433) 728


In [7]:
from url_sequences.sequence_handler import *
# File holding the sequences of page codes for this dataset.
sequences_nc = nocostraint_path + "sequenceIDs.txt"

# Two independent reads of the same file: one is consumed while building the
# vocabulary, the other while training (presumably get_seq returns a
# single-use generator, per the original "because of generator" note).
vocab_sequences, train_sequences = get_seq(sequences_nc, 1), get_seq(sequences_nc, 1)

In [8]:
from gensim.models import Word2Vec

# Word2Vec over the page-code sequences: 48-dimensional vectors, negative
# sampling with 5 noise samples, and no minimum-frequency cut-off.
w2v_model = Word2Vec(
    size=48,
    negative=5,
    min_count=1,
)

# The vocabulary pass and the training pass each consume their own sequence
# iterator.
w2v_model.build_vocab(vocab_sequences)

# Left as the last expression so the value returned by train() is displayed.
w2v_model.train(train_sequences)

994570

In [9]:
# Word2Vec vector of every page, in the iteration order of content_nc_map
# (the same order used to build documents_nc, hence docs_vecs_nc).
w2v_vecs_nc = np.array([w2v_model[key] for key in content_nc_map])

# The original cell re-assigned docs_vecs_nc to itself, which does nothing.
# Check instead that both embeddings have one row per page, which is what any
# later row-wise combination of the two relies on.
assert len(w2v_vecs_nc) == len(docs_vecs_nc), "w2v and TF-IDF vectors are misaligned"

In [10]:
# Reduce the TF-IDF document vectors to 50 dimensions with t-SNE.
# - method="exact": scikit-learn's default Barnes-Hut solver only supports
#   embeddings with fewer than 4 components, so a 50-dimensional target needs
#   the exact solver.
# - random_state=1: same seed as the other reducers in this notebook, so the
#   embedding is reproducible after a kernel restart.
tsne50 = TSNE(n_components=50, method="exact", random_state=1)
docs_vecs_nc_reduced = tsne50.fit_transform(docs_vecs_nc)

# Sketch of a follow-up step, kept disabled: concatenate each Word2Vec vector
# with its reduced document vector. NOTE(review): it refers to *_lc variables
# that are not defined in the cells above.
# vecs = [ np.concatenate((w2v_vecs_lc[i], docs_vecs_lc_reduced[i]), axis=0) for i in range(3)]
