## Offline part

check to see if the compressed tdm is available, then jump over the calculation
otherwise read from the URL with the zip file and build it

Load the data

In [3]:
#from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.metrics.pairwise import linear_kernel

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np


In [4]:
import numpy as np

## Read files from URL
# from urllib2 import urlopen
# urlopen()



# Read files from zip
arc_path = "/Users/dima/Google Drive/CUNY_MSDA/Data_602_Python/Final_Project/data/bbc-fulltext.zip"

def read_corpus_from_zip(archive_path):
    '''
    WRITE DOCSTRING
    '''
    
    import zipfile
    import os
    
    with zipfile.ZipFile(archive_path) as z:

        docs = []
        terms = []
        # need to read only .txt files but not those hidden ones created by the MacOS

        for filename in [f for f in z.namelist() if (not f.startswith('_'))& f.endswith('.txt')]:
            with z.open(filename) as f:
                docs.append(filename)
                words = f.read().replace('\n', '')
                terms.append(words)

    print 'Created a corpus with {} documents'.format(len(docs))
    return {'docs' : docs,'terms' : terms}

corpus = read_corpus_from_zip(arc_path)

Created a corpus with 2225 documents


Clean and stem the strings before passing to the tokenizer that will count term occurrences per document.

In [5]:
# Process terms: Remove punctuation & numbers, lowercase, remove articles and auxillary verbs

def clean_strings(string_):
    '''
    WRITE DOCSTRING
    '''
    
    import string
    import re
    from nltk import stem
    
    snowball = stem.snowball.EnglishStemmer() # function to stem English words
    
    string_ = string_.lower()  # lowercase
    string_ = re.sub(r'[^\x00-\x7f]',r' ',string_) # remove non-unicode characters
    string_ = re.sub('\d','',string_) # retain only letters
    string_ = re.sub(r' *\b[a-zA-Z]{1,2}\b',r'',string_) # remove all 1- or 2-word terms
    #string_ = [snowball.stem(i) for i in string_.split()] # stem
    #string_ = ' '.join(string_)
    return string_


corpus['terms'] = [clean_strings(item) for item in corpus['terms']]


In [6]:
# Print the first 300 symbols of the first document
corpus['terms'][0][:300]

' sales boost time warner profitquarterly profits media giant timewarner jumped % $. () for the three months december, from $ year-earlier.the firm, which now one the biggest investors google, benefited from sales high-speed internet connections and higher advert sales. timewarner said fourth quarter'

Apply scikit-learn vectorizer

Reference for this part:  

http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py  
Peter Prettenhofer  
Lars Buitinck  

In [7]:
t0 = time()

vectorizer = TfidfVectorizer(max_df=0.7,min_df=2, stop_words='english',use_idf=True)
# max_df=0.7 means..
# min_df=2 means..

X = vectorizer.fit_transform(corpus['terms'])

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

done in 0.795325s
n_samples: 2225, n_features: 16723


In [8]:
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.

svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
# n_components = 100 means...
# n_iter = 7 means..

normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X.transpose()) # use transpose to generate a matrix for terms(features) instead of documents

print("done in %fs" % (time() - t0))
print("n_terms: %d, n_components: %d" % X.shape)

done in 1.663015s
n_terms: 16723, n_components: 100


In [9]:
X.shape

(16723, 100)

In [10]:
X

array([[ 0.11322606, -0.08858838, -0.11481925, ...,  0.02483692,
         0.0308393 ,  0.12903484],
       [ 0.11673417, -0.10178572, -0.12229693, ..., -0.1090511 ,
         0.06981183, -0.03725003],
       [ 0.11532662, -0.10773052, -0.02530448, ...,  0.0298613 ,
        -0.01788788,  0.03954373],
       ..., 
       [ 0.24039674, -0.20552223, -0.23199312, ...,  0.01500166,
         0.05132643, -0.03396719],
       [ 0.13739699, -0.15670377, -0.19861743, ...,  0.01157532,
         0.18498143, -0.20329173],
       [ 0.09546642, -0.124781  , -0.11831286, ..., -0.00733596,
        -0.03006833, -0.11851548]])

Now we have a matrix with M = 16.723 terms described by N = 100 components representing "semantic dimensions".  
We can:
  
1) Visualize the scores of each term across these dimensions in a MxN heatmap  
2) Calculate pairwise similarity between two term vectors (1xN each) using cosine distance. Then an MxM similarity matrix can be constructed holding these similarity values. The higher the value is to 1 the closer the terms are to each other in the semantic dimensions.  
  
So for any term we can provide a set of e.g. top 10 synonyms using the top values from the correspondent row in the distance matrix. 

In [11]:
# The array of term names
terms = vectorizer.get_feature_names()
len(terms)

16723

In [12]:
# Find top terms per semantic dimension

import pandas as pd

terms_comps = pd.DataFrame(X) # semantic dimensions = components, are the columns, one row per each term

In [13]:
terms_comps.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.113226,-0.088588,-0.114819,-0.066678,-0.017645,0.028401,-0.02078,-0.293838,-0.176486,0.032807,...,0.020546,-0.064412,0.018931,0.109256,-0.06318,0.108095,0.077723,0.024837,0.030839,0.129035
1,0.116734,-0.101786,-0.122297,-0.047617,-0.036581,0.017029,-0.023478,-0.291808,-0.1395,0.071898,...,0.013701,0.239071,-0.020271,-0.023135,-0.017255,-0.037946,-0.05027,-0.109051,0.069812,-0.03725


In [14]:
max_positions = [terms_comps[i].idxmax() for i in terms_comps.columns] # find max value for each dimension
top_100 = terms_comps.ix[max_positions,:]
top_100_names = [terms[i] for i in top_100.index]

In [15]:
top_100

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5492,0.806829,-0.097683,-0.003997,-0.123638,0.084433,0.044903,-0.000669,0.038489,0.045670,-0.082156,...,0.002712,0.069437,0.036159,0.020646,0.089023,-0.016153,-0.049998,-0.026585,0.073529,-0.015199
8394,0.418680,0.689105,-0.201592,0.090200,0.096754,-0.160188,0.099288,0.014597,-0.037578,0.150110,...,0.042380,0.023922,0.054399,-0.062813,-0.014279,-0.040795,-0.022744,-0.026782,0.056776,-0.033096
558,0.444116,-0.160972,0.554381,-0.113607,-0.173565,-0.003042,0.163770,0.044734,0.001305,0.070224,...,0.012206,0.048663,-0.090366,-0.062196,0.007909,-0.109807,-0.032486,-0.011030,0.041084,0.087756
5675,0.251263,-0.263190,-0.186943,0.672115,-0.211130,-0.001158,-0.077258,0.201702,0.026863,-0.030201,...,0.031930,0.013662,0.025530,-0.036441,-0.000505,-0.040764,0.002074,0.000028,-0.004056,-0.000044
15891,0.365892,-0.211860,0.350625,0.116441,0.485303,-0.047829,-0.018561,0.046708,-0.108111,0.071142,...,0.012699,-0.076144,0.029061,-0.057052,0.017620,0.033803,-0.052981,-0.187587,0.004735,-0.038516
8791,0.174686,-0.002969,0.148498,-0.048341,-0.118541,0.667577,0.266283,0.228232,-0.206034,0.181492,...,0.011666,0.047426,0.057165,-0.022985,0.032421,-0.050431,0.009493,-0.026822,0.127825,0.097674
9338,0.253812,0.076878,0.055408,-0.017347,-0.058138,0.468640,0.474223,0.207539,-0.191608,0.265190,...,-0.004595,0.001456,0.015703,0.049484,-0.000808,0.061697,0.070390,-0.052903,0.025276,0.110443
8836,0.240455,-0.230018,-0.367275,-0.328763,0.033651,-0.093207,-0.099776,0.481866,-0.295191,-0.230157,...,-0.055778,-0.018384,0.041363,0.009563,0.011498,-0.032569,0.004309,-0.045224,0.040430,-0.127821
8535,0.224685,-0.188437,-0.290613,-0.265609,0.052348,0.028558,0.052135,-0.007339,0.535788,0.062323,...,-0.091471,0.032437,-0.017093,-0.082046,-0.035775,0.020990,-0.035913,-0.027586,0.010435,-0.006341
12931,0.125599,-0.127253,-0.135445,-0.138656,-0.073527,0.183608,0.168640,-0.215676,-0.276819,0.411004,...,-0.065782,0.025006,-0.083554,-0.004572,0.043698,-0.076219,-0.078145,-0.026719,0.030568,-0.040605


In [16]:
# Load graphics

from bokeh.io import output_notebook, show
from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, HoverTool
output_notebook()

In [17]:
#from bokeh.charts import HeatMap

#p = figure(x_range=list(top_100.columns), y_range=list(top_100_names))

# must give a vector of image data for image parameter
#p.image(image=top_100.as_matrix(), x=0, y=0, dw=10, dh=10, palette="Spectral11")


#hm = HeatMap(data=top_100, x=list(top_100.columns), y=list(top_100_names), values=top_100,
#             title='Top100', stat=None)

#hm = Heatmap(z=top_100.as_matrix(),
#                   x=top_100.columns,
 #                  y=top_100_names)

#show(p)

In [18]:
#top_100_names
#list(terms_comps.columns)

In [19]:
# Function to filter top 10 words per semantic dimension

def get_top_x_term_df(inp_matrix, term_names, top_x):
    '''WRITE DOCSTRING AND ERROR'''
        
    # Convert into long pandas table and use built-in top-n function, then use the index of the top scoring 
    # terms to filter the melted term_component df
    tc_matrix = inp_matrix.copy()
    val_cols = tc_matrix.columns
    tc_matrix['term'] = term_names
    tc_melt = pd.melt(tc_matrix, id_vars=['term'], value_vars=list(val_cols),var_name='sem_dim', value_name='weight')
    top_weights = tc_melt.groupby(['sem_dim'])['weight'].nlargest(top_x)
    top_df = tc_melt.ix[top_weights.index.get_level_values(1),:]
    return top_df


In [20]:
top_terms = get_top_x_term_df(terms_comps,terms, 5)
top_terms.head(20)

Unnamed: 0,term,sem_dim,weight
5492,far,0,0.806829
185,added,0,0.803412
9051,make,0,0.794982
15157,time,0,0.786284
16264,way,0,0.786165
25117,labour,1,0.689105
21559,election,1,0.673864
25825,manifesto,1,0.634419
19869,conservative,1,0.631094
27914,polls,1,0.627402


We can see how the first dimensions show terms related to the same topic, e.g. dimension 1 appears to be related to politics, dimension 2 to "stock markets or business", dimension 3 is about films and actors, however, the dimension 0 is unclear when judged alone.

In [21]:
# Calculate cosine distances from 1 vector to all the other vectors to find close terms
terms_comps.shape

(16723, 100)

In [22]:
# Calculate cosine distances from 1 vector to all the other vectors to find close terms

def top_synonyms(tc_matrix,search_term,search_n):
    '''
    WRITE DOCSTRING AND NOT FOUND ERROR HANDLING FOR WHEN TERM NOT FOUND
    avoid duplicate records like in output: top_synonyms(X, 'hall', 10)
    '''
    search_index = terms.index(search_term)

    term_vec = tc_matrix[search_index,:].reshape(1, -1) # vector for the search item
    
    # find distances to each other term, as the values are normalized can use linear kernel:
    term_dist = linear_kernel(term_vec, tc_matrix) 
    
    vals_max = np.sort(term_dist).take(range(-(search_n+1),0)) # top n+1 closest terms (as the term itself will be closest)
    vals_max = vals_max[::-1] 
    terms_idx_max = [np.where(term_dist[0] == vals_max[idx])[0][0] for idx in range(0,vals_max.shape[0])] 
    terms_max = [terms[i] for i in terms_idx_max] 
    output = zip(terms_max, vals_max)[1:] # exclude the matched term itelf from the output
    return output


In [23]:
syn = top_synonyms(X, 'car', 10)
syn

[(u'motor', 0.95454353841950124),
 (u'marque', 0.93718056654147608),
 (u'cars', 0.9229350141782231),
 (u'marques', 0.89193622549154128),
 (u'diesel', 0.87026015727237005),
 (u'lexus', 0.8531752060932396),
 (u'carmaker', 0.84870587296567201),
 (u'vehicles', 0.84767965294607106),
 (u'saab', 0.84552522738624325),
 (u'suvs', 0.84374154366331178)]

In [24]:
# Draw a bar chart based on the input

def top_syn_bar(search_term,search_n, min_score):
    '''
    WRITE DOCSTRING
    '''
    from bokeh.palettes import PuRd
    from bokeh.charts.attributes import cat, color
    from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label, NumeralTickFormatter
    
    syns = top_synonyms(X, search_term,search_n) # get the data
    
    # Set up the data for the Bokeh bar chart
    chart_data = pd.DataFrame(syns)
    chart_data.columns = ["term","score"]
    chart_data = chart_data.sort_values('score') # sort desc order
    chart_data['x_lp'] = 0 # add a column for x annotation position
    chart_data = chart_data[chart_data['score'] > min_score] # remove non-relevant records
    
    #chart_data['annotation'] = chart_data['score'].round(2)*100
    #chart_data['annotation'] = chart_data['term']+str(': ')+ chart_data['annotation'].astype(int).astype(str) + '%'
    
    y = list(chart_data['term'])
    #x_range = list(np.arange(0, 1, 0.05))
    x = list(chart_data['score'])
    
    f_title = 'Top Related Terms for: '+ search_term
    
    p = figure(title=f_title,y_range=y)

    p.hbar(y=y, height=0.5, left=0,right=x, 
           color="navy")

    p.xaxis.axis_label = "Relative Term Similarity"
    p.xaxis[0].formatter = NumeralTickFormatter(format="0%")
    p.yaxis.visible = False

    
    labels = LabelSet(x='x_lp', y='term', text='term', level='glyph',text_color="white",
              x_offset=5, y_offset=-8, source=ColumnDataSource(chart_data), 
                      render_mode='canvas')
    
    p.add_layout(labels)
    show(p)

In [25]:
top_syn_bar('hero',10,0.6)

In [27]:
from ipywidgets import interact, widgets

def f(term, n_terms,min_score):
    top_syn_bar(term,n_terms,min_score)
    
interact(f, term = widgets.Text(value='car'),n_terms=widgets.IntSlider(min=-10,max=25,step=1,value=10),
        min_score=widgets.FloatSlider(min=0,max=1,step=0.05,value=0.6));