## Offline part

check to see if the compressed tdm is available, then jump over the calculation
otherwise read from the URL with the zip file and build it

Load the data

In [43]:
#from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.metrics.pairwise import linear_kernel

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np


In [2]:
import numpy as np

# Read files from zip
arc_path = "/Users/dima/Google Drive/CUNY_MSDA/Data_602_Python/Final_Project/data/bbc-fulltext.zip"

def read_corpus_from_zip(archive_path):
    '''
    WRITE DOCSTRING
    '''
    
    import zipfile
    import os
    
    with zipfile.ZipFile(archive_path) as z:

        docs = []
        terms = []
        # need to read only .txt files but not those hidden ones created by the MacOS

        for filename in [f for f in z.namelist() if (not f.startswith('_'))& f.endswith('.txt')]:
            with z.open(filename) as f:
                docs.append(filename)
                words = f.read().replace('\n', '')
                terms.append(words)

    print 'Created a corpus with {} documents'.format(len(docs))
    return {'docs' : docs,'terms' : terms}

corpus = read_corpus_from_zip(arc_path)

Created a corpus with 2225 documents


Clean and stem the strings before passing to the tokenizer that will count term occurrences per document.

In [3]:
# Process terms: Remove punctuation & numbers, lowercase, remove articles and auxillary verbs

def clean_strings(string_):
    '''
    WRITE DOCSTRING
    '''
    
    import string
    import re
    from nltk import stem
    
    snowball = stem.snowball.EnglishStemmer() # function to stem English words
    
    string_ = string_.lower()  # lowercase
    string_ = re.sub(r'[^\x00-\x7f]',r' ',string_) # remove non-unicode characters
    string_ = re.sub('\d','',string_) # retain only letters
    string_ = re.sub(r' *\b[a-zA-Z]{1,2}\b',r'',string_) # remove all 1- or 2-word terms
    #string_ = [snowball.stem(i) for i in string_.split()] # stem
    #string_ = ' '.join(string_)
    return string_


corpus['terms'] = [clean_strings(item) for item in corpus['terms']]


In [4]:
# Print the first 300 symbols of the first document
corpus['terms'][0][:300]

' sales boost time warner profitquarterly profits media giant timewarner jumped % $. () for the three months december, from $ year-earlier.the firm, which now one the biggest investors google, benefited from sales high-speed internet connections and higher advert sales. timewarner said fourth quarter'

Apply scikit-learn vectorizer

Reference for this part:  

http://scikit-learn.org/stable/auto_examples/text/document_clustering.html#sphx-glr-auto-examples-text-document-clustering-py  
Peter Prettenhofer  
Lars Buitinck  

In [5]:
t0 = time()

vectorizer = TfidfVectorizer(max_df=0.7,min_df=2, stop_words='english',use_idf=True)
# max_df=0.7 means..
# min_df=2 means..

X = vectorizer.fit_transform(corpus['terms'])

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

done in 0.793958s
n_samples: 2225, n_features: 16723


In [6]:
t0 = time()
# Vectorizer results are normalized, which makes KMeans behave as
# spherical k-means for better results. Since LSA/SVD results are
# not normalized, we have to redo the normalization.

svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
# n_components = 100 means...
# n_iter = 7 means..

normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

X = lsa.fit_transform(X.transpose()) # use transpose to generate a matrix for terms(features) instead of documents

print("done in %fs" % (time() - t0))
print("n_terms: %d, n_components: %d" % X.shape)

done in 1.630621s
n_terms: 16723, n_components: 100


In [7]:
X.shape

(16723, 100)

In [8]:
X

array([[ 0.11322606, -0.08858838, -0.11481925, ...,  0.02483692,
         0.0308393 ,  0.12903484],
       [ 0.11673417, -0.10178572, -0.12229693, ..., -0.1090511 ,
         0.06981183, -0.03725003],
       [ 0.11532662, -0.10773052, -0.02530448, ...,  0.0298613 ,
        -0.01788788,  0.03954373],
       ..., 
       [ 0.24039674, -0.20552223, -0.23199312, ...,  0.01500166,
         0.05132643, -0.03396719],
       [ 0.13739699, -0.15670377, -0.19861743, ...,  0.01157532,
         0.18498143, -0.20329173],
       [ 0.09546642, -0.124781  , -0.11831286, ..., -0.00733596,
        -0.03006833, -0.11851548]])

Now we have a matrix with M = 16.723 terms described by N = 100 components representing "semantic dimensions".  
We can:
  
1) Visualize the scores of each term across these dimensions in a MxN heatmap  
2) Calculate pairwise similarity between two term vectors (1xN each) using cosine distance. Then an MxM similarity matrix can be constructed holding these similarity values. The higher the value is to 1 the closer the terms are to each other in the semantic dimensions.  
  
So for any term we can provide a set of e.g. top 10 synonyms using the top values from the correspondent row in the distance matrix. 

In [9]:
# The array of term names
terms = vectorizer.get_feature_names()
len(terms)

16723

In [10]:
# Find top terms per semantic dimension

import pandas as pd

terms_comps = pd.DataFrame(X) # semantic dimensions = components, are the columns, one row per each term

In [11]:
terms_comps.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.113226,-0.088588,-0.114819,-0.066678,-0.017645,0.028401,-0.02078,-0.293838,-0.176486,0.032807,...,0.020546,-0.064412,0.018931,0.109256,-0.06318,0.108095,0.077723,0.024837,0.030839,0.129035
1,0.116734,-0.101786,-0.122297,-0.047617,-0.036581,0.017029,-0.023478,-0.291808,-0.1395,0.071898,...,0.013701,0.239071,-0.020271,-0.023135,-0.017255,-0.037946,-0.05027,-0.109051,0.069812,-0.03725


In [12]:
max_positions = [terms_comps[i].idxmax() for i in terms_comps.columns] # find max value for each dimension
top_100 = terms_comps.ix[max_positions,:]
top_100_names = [terms[i] for i in top_100.index]

In [48]:
top_100

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
5492,0.806829,-0.097683,-0.003997,-0.123638,0.084433,0.044903,-0.000669,0.038489,0.045670,-0.082156,...,0.002712,0.069437,0.036159,0.020646,0.089023,-0.016153,-0.049998,-0.026585,0.073529,-0.015199
8394,0.418680,0.689105,-0.201592,0.090200,0.096754,-0.160188,0.099288,0.014597,-0.037578,0.150110,...,0.042380,0.023922,0.054399,-0.062813,-0.014279,-0.040795,-0.022744,-0.026782,0.056776,-0.033096
558,0.444116,-0.160972,0.554381,-0.113607,-0.173565,-0.003042,0.163770,0.044734,0.001305,0.070224,...,0.012206,0.048663,-0.090366,-0.062196,0.007909,-0.109807,-0.032486,-0.011030,0.041084,0.087756
5675,0.251263,-0.263190,-0.186943,0.672115,-0.211130,-0.001158,-0.077258,0.201702,0.026863,-0.030201,...,0.031930,0.013662,0.025530,-0.036441,-0.000505,-0.040764,0.002074,0.000028,-0.004056,-0.000044
15891,0.365892,-0.211860,0.350625,0.116441,0.485303,-0.047829,-0.018561,0.046708,-0.108111,0.071142,...,0.012699,-0.076144,0.029061,-0.057052,0.017620,0.033803,-0.052981,-0.187587,0.004735,-0.038516
8791,0.174686,-0.002969,0.148498,-0.048341,-0.118541,0.667577,0.266283,0.228232,-0.206034,0.181492,...,0.011666,0.047426,0.057165,-0.022985,0.032421,-0.050431,0.009493,-0.026822,0.127825,0.097674
9338,0.253812,0.076878,0.055408,-0.017347,-0.058138,0.468640,0.474223,0.207539,-0.191608,0.265190,...,-0.004595,0.001456,0.015703,0.049484,-0.000808,0.061697,0.070390,-0.052903,0.025276,0.110443
8836,0.240455,-0.230018,-0.367275,-0.328763,0.033651,-0.093207,-0.099776,0.481866,-0.295191,-0.230157,...,-0.055778,-0.018384,0.041363,0.009563,0.011498,-0.032569,0.004309,-0.045224,0.040430,-0.127821
8535,0.224685,-0.188437,-0.290613,-0.265609,0.052348,0.028558,0.052135,-0.007339,0.535788,0.062323,...,-0.091471,0.032437,-0.017093,-0.082046,-0.035775,0.020990,-0.035913,-0.027586,0.010435,-0.006341
12931,0.125599,-0.127253,-0.135445,-0.138656,-0.073527,0.183608,0.168640,-0.215676,-0.276819,0.411004,...,-0.065782,0.025006,-0.083554,-0.004572,0.043698,-0.076219,-0.078145,-0.026719,0.030568,-0.040605


In [67]:
# Load graphics

from bokeh.io import output_notebook, show
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource
output_notebook()

In [52]:
from bokeh.charts import HeatMap


hm = HeatMap(data=top_100, x=list(top_100.columns), y=list(top_100_names), values=top_100,
             title='Top100', stat=None)

#hm = Heatmap(z=top_100.as_matrix(),
#                   x=top_100.columns,
 #                  y=top_100_names)

show(hm)

ValueError: expected an element of either Column Name or Column String or List(Column Name or Column String), got              0         1         2         3         4         5         6   \
5492   0.806829 -0.097683 -0.003997 -0.123638  0.084433  0.044903 -0.000669   
8394   0.418680  0.689105 -0.201592  0.090200  0.096754 -0.160188  0.099288   
558    0.444116 -0.160972  0.554381 -0.113607 -0.173565 -0.003042  0.163770   
5675   0.251263 -0.263190 -0.186943  0.672115 -0.211130 -0.001158 -0.077258   
15891  0.365892 -0.211860  0.350625  0.116441  0.485303 -0.047829 -0.018561   
8791   0.174686 -0.002969  0.148498 -0.048341 -0.118541  0.667577  0.266283   
9338   0.253812  0.076878  0.055408 -0.017347 -0.058138  0.468640  0.474223   
8836   0.240455 -0.230018 -0.367275 -0.328763  0.033651 -0.093207 -0.099776   
8535   0.224685 -0.188437 -0.290613 -0.265609  0.052348  0.028558  0.052135   
12931  0.125599 -0.127253 -0.135445 -0.138656 -0.073527  0.183608  0.168640   
2800   0.196065 -0.141005  0.045890  0.137756 -0.033933 -0.062893  0.218668   
9524   0.200128  0.106886  0.001745  0.042363  0.100774 -0.135825  0.164326   
7601   0.229599  0.348629 -0.119701  0.026647  0.013389 -0.087650  0.101675   
3867   0.219610  0.366666 -0.079657  0.041589  0.050635 -0.046533  0.084469   
12733  0.349305 -0.113253  0.154639 -0.051901 -0.062789  0.139943  0.092585   
1678   0.135015 -0.058591  0.173808 -0.051596 -0.099610 -0.101289 -0.191955   
9923   0.233086  0.032016  0.204757 -0.004148  0.086002 -0.083078 -0.096037   
7107   0.309205  0.425624 -0.021569  0.025580 -0.018661 -0.044308  0.028619   
10842  0.138706 -0.176630 -0.160037  0.354604 -0.134642 -0.031920 -0.013531   
8638   0.127601 -0.091849  0.165180  0.054387  0.198499  0.014578  0.151528   
6152   0.148759 -0.206825 -0.249352  0.094310 -0.105640  0.001003  0.024268   
16346  0.079903 -0.000085  0.083506 -0.052490 -0.116406  0.061553  0.037824   
10227  0.110913 -0.006614 -0.022194  0.086132 -0.017476  0.100060 -0.008810   
6738   0.136277 -0.022467  0.092217 -0.037646 -0.207627 -0.074988 -0.025200   
7540   0.070834 -0.039072  0.087673  0.014136  0.129609  0.035179 -0.199394   
10080  0.286457  0.130499 -0.180677  0.003622 -0.094049 -0.114196  0.089584   
13477  0.210558 -0.092941 -0.122894 -0.127388  0.026104  0.088111 -0.039578   
7603   0.091019 -0.010433  0.082134 -0.046654 -0.131508  0.115613  0.025642   
5371   0.169704 -0.103483  0.009678  0.078640  0.026877  0.042510  0.088294   
3596   0.077311 -0.022549  0.008477 -0.056005 -0.047155  0.119596  0.071589   
...         ...       ...       ...       ...       ...       ...       ...   
2484   0.180487 -0.088434 -0.129901 -0.094974  0.056507  0.052803  0.049525   
3055   0.214948 -0.098610 -0.068536 -0.135948  0.133897  0.023691 -0.144313   
1334   0.105949  0.051587 -0.046633  0.038462  0.035045  0.052952 -0.038588   
276    0.155552  0.026211 -0.040170  0.038786  0.035700  0.039973 -0.058361   
10553  0.142993 -0.014351  0.023577 -0.072770 -0.067104  0.016267 -0.056682   
2278   0.134793 -0.080194 -0.023360  0.169073 -0.097940 -0.034849 -0.024579   
1014   0.213282  0.284127 -0.095088  0.030648  0.113072  0.122435 -0.125295   
3883   0.149671  0.007860  0.094306 -0.074507 -0.147026  0.021302 -0.037649   
4950   0.209319  0.142948  0.055551 -0.043496 -0.126521 -0.094749 -0.084721   
2892   0.304742 -0.124910  0.090381  0.173868  0.073434  0.052251 -0.183025   
14163  0.164515 -0.163746 -0.107504  0.140149 -0.014065 -0.007177  0.074729   
14005  0.149720 -0.045710 -0.029062 -0.083389  0.013130  0.057104  0.016446   
1527   0.237467 -0.054768 -0.097785 -0.167052 -0.004794 -0.014686 -0.020223   
9596   0.183681 -0.028443  0.096689 -0.015716 -0.008257  0.241194 -0.010297   
2196   0.082828 -0.084761  0.054967  0.043085  0.090943 -0.104870  0.170974   
4990   0.152287 -0.119950 -0.064571 -0.002927  0.062022 -0.015364  0.085300   
1778   0.191452 -0.042193  0.137993 -0.013075 -0.106158  0.059060  0.045742   
7716   0.171615  0.160517  0.009718  0.021679  0.066269  0.057048 -0.121557   
15365  0.213779  0.062714  0.090418 -0.039114 -0.071348  0.123162 -0.059743   
15275  0.279586 -0.008957  0.127652 -0.075852 -0.086523  0.110719 -0.230919   
1299   0.125955  0.032037 -0.016781 -0.036736  0.021488  0.009943 -0.014622   
8073   0.149215 -0.072389 -0.145385 -0.107567 -0.006115  0.006850 -0.012766   
2272   0.153575  0.018475  0.024896  0.027886  0.018767  0.089999 -0.054517   
3814   0.170853  0.124509 -0.035998  0.033654  0.051390  0.023198 -0.037890   
4033   0.207042 -0.092424 -0.048159 -0.138501  0.111762  0.056731 -0.077297   
676    0.155742 -0.070226 -0.063466  0.218701 -0.036133 -0.040637  0.124310   
13547  0.104828  0.098795 -0.054096  0.027937  0.037711  0.004692  0.006827   
1342   0.131520  0.078035  0.007618 -0.002791  0.059192  0.091799 -0.158000   
11200  0.143559 -0.031851  0.120202  0.010346  0.089534  0.011700  0.032150   
512    0.102770  0.137243 -0.086111  0.023594  0.042728  0.001574  0.028730   

             7         8         9     ...           90        91        92  \
5492   0.038489  0.045670 -0.082156    ...     0.002712  0.069437  0.036159   
8394   0.014597 -0.037578  0.150110    ...     0.042380  0.023922  0.054399   
558    0.044734  0.001305  0.070224    ...     0.012206  0.048663 -0.090366   
5675   0.201702  0.026863 -0.030201    ...     0.031930  0.013662  0.025530   
15891  0.046708 -0.108111  0.071142    ...     0.012699 -0.076144  0.029061   
8791   0.228232 -0.206034  0.181492    ...     0.011666  0.047426  0.057165   
9338   0.207539 -0.191608  0.265190    ...    -0.004595  0.001456  0.015703   
8836   0.481866 -0.295191 -0.230157    ...    -0.055778 -0.018384  0.041363   
8535  -0.007339  0.535788  0.062323    ...    -0.091471  0.032437 -0.017093   
12931 -0.215676 -0.276819  0.411004    ...    -0.065782  0.025006 -0.083554   
2800  -0.170125  0.027688 -0.026463    ...     0.012242  0.012549  0.073468   
9524   0.012454  0.062387  0.218605    ...    -0.095690  0.080314  0.029047   
7601   0.015408  0.025989  0.169669    ...    -0.009930 -0.017253 -0.121120   
3867   0.036588 -0.061122  0.042424    ...    -0.010876 -0.119980 -0.066147   
12733 -0.135805  0.146229  0.064306    ...    -0.033279 -0.028962  0.039669   
1678   0.008114 -0.024126  0.105873    ...     0.070816 -0.104849 -0.151267   
9923   0.008985 -0.037061 -0.097109    ...    -0.046173  0.037758  0.096452   
7107   0.048389 -0.086466  0.195230    ...    -0.025114 -0.042371 -0.107488   
10842  0.051680 -0.015217  0.054487    ...    -0.052037 -0.045856 -0.001759   
8638  -0.022395  0.005004 -0.001620    ...     0.010069  0.081128  0.025608   
6152  -0.216528 -0.210733  0.326844    ...     0.056917 -0.069223  0.141537   
16346 -0.004330  0.166276 -0.050652    ...    -0.053223 -0.000360  0.040389   
10227  0.009567  0.010070 -0.071688    ...     0.044093  0.047088  0.019493   
6738  -0.070677  0.018438 -0.002605    ...    -0.051827 -0.068572 -0.055069   
7540   0.046885  0.003248  0.170889    ...     0.006679  0.014948  0.073383   
10080  0.026074 -0.093179  0.222988    ...    -0.040882 -0.051103  0.055189   
13477  0.073030  0.122531 -0.128595    ...     0.018235 -0.025167 -0.008774   
7603   0.013276  0.011810 -0.033020    ...    -0.057159  0.023576 -0.008540   
5371   0.033116  0.240385 -0.111085    ...     0.016027  0.036007 -0.041728   
3596   0.020709  0.294204 -0.023962    ...     0.040178  0.057420 -0.033707   
...         ...       ...       ...    ...          ...       ...       ...   
2484  -0.020421  0.230892 -0.076881    ...    -0.029556 -0.102462  0.047704   
3055   0.124670 -0.059690 -0.007645    ...     0.030820  0.090289 -0.131773   
1334  -0.026874  0.057031 -0.050813    ...    -0.167033  0.221607 -0.100863   
276   -0.002388  0.053736 -0.049399    ...     0.135405  0.060078  0.036153   
10553 -0.020713  0.025821  0.015015    ...     0.009069 -0.101162  0.187094   
2278   0.043452  0.012931 -0.053055    ...     0.101140  0.032251 -0.061524   
1014  -0.034221  0.051927 -0.169110    ...    -0.023063  0.113697 -0.013047   
3883  -0.003844  0.099288 -0.054691    ...     0.059955 -0.011393 -0.067355   
4950  -0.035273  0.007451 -0.056050    ...     0.063159  0.022084 -0.124704   
2892   0.017761 -0.039655  0.060702    ...     0.095164 -0.102620  0.088757   
14163 -0.021900  0.031768  0.017042    ...    -0.004142 -0.134327 -0.037463   
14005 -0.009522  0.262377 -0.037513    ...     0.069979 -0.179591 -0.035266   
1527  -0.085026  0.271215  0.067813    ...    -0.149691  0.094916 -0.031528   
9596  -0.003863  0.062562 -0.023033    ...    -0.231964 -0.057403 -0.098709   
2196  -0.016460 -0.015542 -0.022885    ...    -0.305449  0.159184  0.052725   
4990  -0.013820  0.079303  0.045478    ...     0.033151 -0.079869  0.130359   
1778  -0.024971  0.083308 -0.046719    ...     0.200195 -0.052465  0.061549   
7716  -0.011177  0.021144 -0.002562    ...     0.043091  0.002068 -0.008296   
15365  0.008654  0.073417 -0.118253    ...    -0.042827 -0.057182 -0.090488   
15275 -0.022598  0.007024  0.003231    ...    -0.179412  0.024886 -0.001103   
1299   0.045707  0.027152 -0.083793    ...     0.450697 -0.045170 -0.071433   
8073  -0.090856 -0.076913  0.010421    ...     0.009604  0.399277 -0.031777   
2272  -0.018252  0.031986 -0.116468    ...     0.094963  0.210248  0.504112   
3814  -0.006007  0.037099 -0.086622    ...    -0.121227 -0.167395 -0.096772   
4033   0.194969 -0.056555 -0.090374    ...     0.201817  0.068919 -0.011937   
676   -0.046034  0.004685  0.017619    ...    -0.152141 -0.072300 -0.155227   
13547 -0.057127  0.003840 -0.030205    ...    -0.040883 -0.218006  0.168325   
1342   0.042113 -0.028461 -0.099300    ...     0.038949  0.026134 -0.174732   
11200  0.027104  0.034354  0.045327    ...    -0.069197  0.161119  0.173064   
512   -0.007443  0.039363 -0.010549    ...    -0.076213 -0.183546 -0.000159   

             93        94        95        96        97        98        99  
5492   0.020646  0.089023 -0.016153 -0.049998 -0.026585  0.073529 -0.015199  
8394  -0.062813 -0.014279 -0.040795 -0.022744 -0.026782  0.056776 -0.033096  
558   -0.062196  0.007909 -0.109807 -0.032486 -0.011030  0.041084  0.087756  
5675  -0.036441 -0.000505 -0.040764  0.002074  0.000028 -0.004056 -0.000044  
15891 -0.057052  0.017620  0.033803 -0.052981 -0.187587  0.004735 -0.038516  
8791  -0.022985  0.032421 -0.050431  0.009493 -0.026822  0.127825  0.097674  
9338   0.049484 -0.000808  0.061697  0.070390 -0.052903  0.025276  0.110443  
8836   0.009563  0.011498 -0.032569  0.004309 -0.045224  0.040430 -0.127821  
8535  -0.082046 -0.035775  0.020990 -0.035913 -0.027586  0.010435 -0.006341  
12931 -0.004572  0.043698 -0.076219 -0.078145 -0.026719  0.030568 -0.040605  
2800  -0.135018 -0.082305  0.004295  0.008955  0.092940 -0.021579 -0.062052  
9524   0.042840  0.007948 -0.029292  0.114596  0.017628  0.074604  0.083853  
7601  -0.046391  0.000119  0.067584  0.036556 -0.066381 -0.016794 -0.052281  
3867   0.157202  0.060781 -0.025315  0.020419 -0.012095  0.030519 -0.000497  
12733 -0.079821  0.074647  0.063943  0.119133  0.055431  0.070851  0.064367  
1678  -0.009533  0.065064 -0.077479  0.031570 -0.003995  0.014893  0.027201  
9923  -0.103776 -0.026121  0.034768 -0.001244 -0.015938 -0.067121 -0.051155  
7107   0.030822 -0.102177 -0.022919 -0.064178 -0.056365  0.006288 -0.037558  
10842  0.033186  0.040280  0.030620  0.104827  0.057306 -0.018596  0.022135  
8638  -0.075685 -0.003473 -0.049687  0.012451 -0.018602  0.020288  0.018570  
6152  -0.109602  0.070215 -0.035793 -0.005074  0.052143  0.142864  0.025593  
16346  0.024139 -0.006704 -0.016309  0.015126 -0.033847 -0.021472 -0.000411  
10227  0.068666 -0.135815 -0.074815 -0.066587  0.019851 -0.076115 -0.093379  
6738   0.023519  0.127075 -0.128536 -0.031030 -0.030709  0.079813  0.060648  
7540   0.004687  0.032733  0.067011  0.024793  0.030740 -0.054312 -0.023529  
10080  0.130000 -0.113840 -0.011143 -0.014286  0.081038 -0.041117  0.012623  
13477  0.110995 -0.082031 -0.058258  0.034092  0.103759  0.075833 -0.111751  
7603   0.061543 -0.006769 -0.030032 -0.045151 -0.026987 -0.036084 -0.055757  
5371  -0.011702  0.136620  0.013353 -0.043977 -0.033892 -0.030739 -0.112633  
3596   0.056612  0.070746  0.030951  0.001905 -0.003650 -0.022590 -0.025726  
...         ...       ...       ...       ...       ...       ...       ...  
2484  -0.066898 -0.022577  0.005939  0.019612 -0.072201 -0.140281 -0.054502  
3055  -0.180242 -0.030735 -0.073084 -0.004083 -0.000294 -0.008856  0.174593  
1334   0.002239 -0.395950  0.013352  0.038584 -0.041742  0.151019  0.135466  
276    0.090843  0.124869  0.132872  0.008240  0.012132 -0.125613  0.020137  
10553  0.058132 -0.097723 -0.087401  0.091494  0.023140 -0.031671  0.032581  
2278   0.000781 -0.096979  0.125641 -0.042772 -0.027970  0.019675  0.026155  
1014   0.023846  0.163187 -0.129921 -0.087615 -0.020608  0.033047  0.016412  
3883   0.110011 -0.034530  0.070481 -0.101134 -0.084647 -0.015842  0.073186  
4950   0.058905  0.007109 -0.026863 -0.083680  0.108664 -0.021019  0.012873  
2892  -0.094597  0.076423 -0.204170  0.037439  0.051821 -0.108896  0.119228  
14163 -0.202175  0.127176  0.200375  0.136492  0.052952 -0.084323 -0.027394  
14005 -0.194438 -0.095704 -0.053729  0.176600 -0.123572 -0.059340 -0.038974  
1527   0.095743  0.059237 -0.011736 -0.013425 -0.158907  0.019337 -0.108176  
9596  -0.251795 -0.091060 -0.160062 -0.038808 -0.012127 -0.001833  0.056781  
2196  -0.119663  0.125713 -0.041029  0.003629  0.118227  0.046715 -0.071156  
4990   0.092542  0.066844  0.116184 -0.077570  0.053992  0.167360  0.063247  
1778  -0.138238 -0.025189  0.033992  0.218553  0.069032  0.037456 -0.155199  
7716  -0.050189  0.061184  0.288832  0.040429  0.086956 -0.093686  0.190511  
15365 -0.160515 -0.022331  0.115099 -0.139735 -0.056743  0.060764  0.035634  
15275  0.005649  0.148398  0.061658  0.147651  0.281723 -0.169877 -0.071085  
1299  -0.006286  0.086648 -0.071151 -0.032603 -0.043369  0.121012 -0.157537  
8073   0.239264 -0.069132 -0.109786 -0.087653  0.120940 -0.077581 -0.026943  
2272  -0.178349  0.120619  0.046214 -0.059744  0.005910  0.081196  0.166837  
3814   0.393824 -0.267311  0.111191 -0.107034 -0.228675  0.082307 -0.052509  
4033  -0.010363  0.377870  0.126100  0.100201 -0.141980  0.213802  0.137378  
676    0.102204  0.203672  0.375432 -0.048845  0.027616  0.061444 -0.062801  
13547 -0.185961 -0.266538 -0.077545  0.381135 -0.069251 -0.188131 -0.081234  
1342  -0.181395  0.112680  0.173600  0.053732  0.445915 -0.069586  0.179680  
11200  0.092245 -0.143117 -0.013218  0.084400  0.083953  0.416661  0.121925  
512    0.321743  0.052565 -0.126421  0.106383  0.102332 -0.138798  0.507701  

[100 rows x 100 columns]

In [None]:
#top_100_names
#list(terms_comps.columns)

In [15]:
# Function to filter top 10 words per semantic dimension

def get_top_x_term_df(inp_matrix, term_names, top_x):
    '''WRITE DOCSTRING AND ERROR'''
        
    # Convert into long pandas table and use built-in top-n function, then use the index of the top scoring 
    # terms to filter the melted term_component df
    tc_matrix = inp_matrix.copy()
    val_cols = tc_matrix.columns
    tc_matrix['term'] = term_names
    tc_melt = pd.melt(tc_matrix, id_vars=['term'], value_vars=list(val_cols),var_name='sem_dim', value_name='weight')
    top_weights = tc_melt.groupby(['sem_dim'])['weight'].nlargest(top_x)
    top_df = tc_melt.ix[top_weights.index.get_level_values(1),:]
    return top_df


In [16]:
top_terms = get_top_x_term_df(terms_comps,terms, 5)
top_terms.head(20)

Unnamed: 0,term,sem_dim,weight
5492,far,0,0.806829
185,added,0,0.803412
9051,make,0,0.794982
15157,time,0,0.786284
16264,way,0,0.786165
25117,labour,1,0.689105
21559,election,1,0.673864
25825,manifesto,1,0.634419
19869,conservative,1,0.631094
27914,polls,1,0.627402


We can see how the first dimensions show terms related to the same topic, e.g. dimension 1 appears to be related to politics, dimension 2 to "stock markets or business", dimension 3 is about films and actors, however, the dimension 0 is unclear when judged alone.

In [17]:
# Calculate cosine distances from 1 vector to all the other vectors to find close terms
terms_comps.shape

(16723, 100)

In [68]:
# Calculate cosine distances from 1 vector to all the other vectors to find close terms

def top_synonyms(tc_matrix,search_term,search_n):
    '''
    WRITE DOCSTRING AND NOT FOUND ERROR HANDLING FOR WHEN TERM NOT FOUND
    avoid duplicate records like in output: top_synonyms(X, 'hall', 10)
    '''
    search_index = terms.index(search_term)

    term_vec = tc_matrix[search_index,:].reshape(1, -1) # vector for the search item
    
    # find distances to each other term, as the values are normalized can use linear kernel:
    term_dist = linear_kernel(term_vec, tc_matrix) 
    
    vals_max = np.sort(term_dist).take(range(-(search_n+1),0)) # top n+1 closest terms (as the term itself will be closest)
    vals_max = vals_max[::-1] 
    terms_idx_max = [np.where(term_dist[0] == vals_max[idx])[0][0] for idx in range(0,vals_max.shape[0])] 
    terms_max = [terms[i] for i in terms_idx_max] 
    output = zip(terms_max, vals_max)[1:] # exclude the matched term itelf from the output
    return output


In [69]:
syn = top_synonyms(X, 'car', 10)
syn

[(u'motor', 0.95454353841950124),
 (u'marque', 0.93718056654147608),
 (u'cars', 0.9229350141782231),
 (u'marques', 0.89193622549154128),
 (u'diesel', 0.87026015727237005),
 (u'lexus', 0.8531752060932396),
 (u'carmaker', 0.84870587296567201),
 (u'vehicles', 0.84767965294607106),
 (u'saab', 0.84552522738624325),
 (u'suvs', 0.84374154366331178)]

In [82]:
# Draw a bar chart based on the input

def top_syn_bar(search_term,search_n):
    '''
    WRITE DOCSTRING
    '''
    from bokeh.palettes import PuRd
    from bokeh.charts.attributes import cat, color
    
    syns = top_synonyms(X, search_term,search_n) # get the data
    
    # Set up the data for the Bokeh bar chart
    chart_data = pd.DataFrame(syns)
    chart_data.columns = ["term","score"]
    chart_data = chart_data.sort_values('score') # sort desc order
    
    p = figure(title="Synonyms",y_range=y)

    p.hbar(y=y, height=0.5, left=0,right=x, 
           color=color(columns=x, palette=['SaddleBrown', 'Silver', 'Goldenrod'],sort=False))

    p.xaxis.axis_label = "Relative Term Similarity"

    show(p)
    


In [None]:
# Draw a bar chart based on the input

def top_syn_bar(search_term,search_n):
    '''
    WRITE DOCSTRING
    '''
    from bokeh.palettes import PuRd
    from bokeh.charts.attributes import cat, color
    
    syns = top_synonyms(X, search_term,search_n) # get the data
    
    # Set up the data for the Bokeh bar chart
    chart_data = pd.DataFrame(syns)
    chart_data.columns = ["term","score"]
    chart_data = chart_data.sort_values('score') # sort desc order
    
    p = figure(title="Synonyms",y_range=y)

    p.hbar(y=y, height=0.5, left=0,right=x, 
           color=color(columns=x, palette=['SaddleBrown', 'Silver', 'Goldenrod'],sort=False))

    p.xaxis.axis_label = "Relative Term Similarity"

    show(p)

In [83]:
# Convert the synonym output into a source for Bokeh

top_syn_bar('car',10)


ValueError: expected an element of either Column Name or Column String or List(Column Name or Column String), got [0.84374154366331178, 0.84552522738624325, 0.84767965294607106, 0.84870587296567201, 0.8531752060932396, 0.87026015727237005, 0.89193622549154128, 0.9229350141782231, 0.93718056654147608, 0.95454353841950124]

In [66]:
from bokeh.plotting import figure, show, output_file
#from bokeh.charts import Bar, output_file, show

y = list(chart_data['term'])
x = list(chart_data['score'])

#LabelSet(x='x', y='y', text='names', level='glyph',
#         x_offset=5, y_offset=5, source=source)


p = figure(title="Synonyms",y_range=y)

p.hbar(y=y, height=0.5, left=0,
       right=x, color="navy")

p.xaxis.axis_label = "Relative Term Similarity"

show(p)

In [27]:
list(chart_data['score'])

[0.70851802010128839,
 0.70683245613681467,
 0.70292997279217961,
 0.68870090222018598,
 0.66049361607650936,
 0.65988390694167343,
 0.6550768203098285,
 0.65116250517222718,
 0.65116250517222718]