In [1]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# change default style figure and font size
plt.rcParams['figure.figsize'] = 8, 6
plt.rcParams['font.size'] = 12

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,sklearn,matplotlib

Ethen 2018-06-27 18:13:54 

CPython 3.6.4
IPython 6.4.0

numpy 1.14.5
pandas 0.23.0
sklearn 0.19.1
matplotlib 2.2.2


In [42]:
# `defaultdict` stays imported in case another cell relies on it being in scope
from collections import Counter, defaultdict

# toy corpus: the first five titles are about human-computer interaction,
# the last four are about graph theory
raw_documents = ["Human machine interface for lab abc computer applications",
                 "A survey of user opinion of computer system response time",
                 "The EPS user interface management system",
                 "System and human system engineering testing of EPS",
                 "Relation of user perceived response time to error measurement",
                 "The generation of random binary unordered trees",
                 "The intersection graph of paths in trees",
                 "Graph minors IV Widths of trees and well quasi ordering",
                 "Graph minors A survey"]

# remove common words and tokenize
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in raw_documents]

# remove words that appear only once across the whole corpus
frequency = Counter(token for text in texts for token in text)

# `documents` holds the cleaned, space-joined text used by the rest of the
# notebook; the untouched titles remain available as `raw_documents`
documents = [' '.join(token for token in text if frequency[token] > 1) for text in texts]
documents

['human interface computer',
 'survey user computer system response time',
 'eps user interface system',
 'system human system eps',
 'user response time',
 'trees',
 'graph trees',
 'graph minors trees',
 'graph minors survey']

In [43]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Latent semantic analysis: tf-idf weighting followed by a truncated SVD that
# keeps 2 latent components.
stoplist = set('for a of the and to in'.split())

# `stop_words` is documented as 'english', a list or None; scikit-learn releases
# that validate constructor arguments do not accept a set, so pass a list
tfidf = TfidfVectorizer(stop_words = sorted(stoplist))

# the default solver is randomized, pin the seed so that re-running the
# notebook reproduces the same projection
svd = TruncatedSVD(n_components = 2, random_state = 1234)
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('svd', svd)
])

# one row per document, one column per latent component
X_lsa = pipeline.fit_transform(documents)
X_lsa

array([[ 0.05667254,  0.47343916],
       [ 0.18010825,  0.79236365],
       [ 0.09075478,  0.75546177],
       [ 0.07471199,  0.64032022],
       [ 0.0925438 ,  0.5943493 ],
       [ 0.70927442, -0.14650451],
       [ 0.8988797 , -0.15386956],
       [ 0.92873325, -0.1359042 ],
       [ 0.61873361,  0.04443583]])

In [48]:
# Project an unseen query into the latent space with the already fitted
# pipeline (`transform` only, nothing is refit here).
docs = ["Human computer interaction".lower()]
# a single query yields a single row, ravel() flattens it into a 1d vector
vec2_lsi = pipeline.transform(docs).ravel()
vec2_lsi

array([0.04746224, 0.36739683])

In [58]:
def similar_items(X_lsa, doc_id=0, N=5):
    """
    Rank documents by their cosine similarity to one document of the corpus.

    Parameters
    ----------
    X_lsa : 2d array-like, one row per document
        Document vectors to compare.

    doc_id : int, default 0
        Row index of the reference document.

    N : int, default 5
        Maximum number of results. Values larger than the number of
        documents return every document, values <= 0 return an empty list.

    Returns
    -------
    list of (row_index, cosine_similarity) tuples, most similar first.
    Documents whose vector is all zeros get a similarity of 0.
    """
    X_lsa = np.asarray(X_lsa)

    # argpartition raises when asked for more elements than there are rows,
    # and N == 0 would slice out every row instead of none
    N = min(N, X_lsa.shape[0])
    if N <= 0:
        return []

    item_norms = np.linalg.norm(X_lsa, axis=-1)
    # an all-zero vector has no direction; dividing by 1 instead of 0 gives it
    # a similarity of 0 rather than nan (nan would be ranked as the largest value)
    safe_norms = np.where(item_norms > 0, item_norms, 1.0)

    scores = X_lsa.dot(X_lsa[doc_id]) / safe_norms
    best = np.argpartition(scores, -N)[-N:]
    return sorted(zip(best, scores[best] / safe_norms[doc_id]), key=lambda x: -x[1])


# documents closest to document 0 (the default `doc_id`), as (index, cosine similarity)
similar_items(X_lsa)

[(0, 1.0),
 (2, 0.9999999111618154),
 (3, 0.9999955503612504),
 (4, 0.9993760074162686),
 (1, 0.9945583674009802)]

In [61]:
def similar_items(X_lsa, vec2_lsi, N=5):
    """
    Rank documents by their cosine similarity to an arbitrary query vector.

    NOTE: this reuses the name of the earlier ``similar_items(X_lsa, doc_id, N)``
    and therefore replaces it in the notebook namespace once this cell has run.

    Parameters
    ----------
    X_lsa : 2d array-like, one row per document
        Document vectors to compare.

    vec2_lsi : 1d array-like
        Query vector living in the same space as the rows of `X_lsa`.

    N : int, default 5
        Maximum number of results. Values larger than the number of
        documents return every document, values <= 0 return an empty list.

    Returns
    -------
    list of (row_index, cosine_similarity) tuples, most similar first.
    A zero-length document or query vector gives a similarity of 0.
    """
    X_lsa = np.asarray(X_lsa)
    vec2_lsi = np.asarray(vec2_lsi)

    # argpartition raises when asked for more elements than there are rows,
    # and N == 0 would slice out every row instead of none
    N = min(N, X_lsa.shape[0])
    if N <= 0:
        return []

    item_norms = np.linalg.norm(X_lsa, axis=1)
    query_norm = np.linalg.norm(vec2_lsi)
    # guard both norms against 0 so the scores never turn into nan / inf
    # (nan would be ranked as the largest value by argpartition)
    safe_item_norms = np.where(item_norms > 0, item_norms, 1.0)
    safe_query_norm = query_norm if query_norm > 0 else 1.0

    scores = X_lsa.dot(vec2_lsi) / safe_item_norms
    best = np.argpartition(scores, -N)[-N:]
    return sorted(zip(best, scores[best] / safe_query_norm), key=lambda x: -x[1])


# documents closest to the projected query vector, as (index, cosine similarity)
similar_items(X_lsa, vec2_lsi)

[(2, 0.9999602608651891),
 (0, 0.9999564142214454),
 (3, 0.9999241125730483),
 (4, 0.9996622252720809),
 (1, 0.9954877002928035)]

In [49]:
tfidf = pipeline.named_steps['tfidf']

# newer scikit-learn exposes the vocabulary through `get_feature_names_out`
# (the older `get_feature_names` is gone there); use whichever the installed
# version provides and keep `vocab` a plain list indexed by column position
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = list(tfidf.get_feature_names_out())
else:
    vocab = tfidf.get_feature_names()
vocab_size = len(vocab)

# document 2 in the latent space ...
vec1_lsi = X_lsa[2, :]
# ... and the same document as a dense tf-idf vector over the vocabulary;
# `.toarray()` is the explicit, version-stable spelling of the `.A` shorthand
vec1_tfidf = tfidf.transform([documents[2]]).toarray().ravel()
vec1_tfidf

array([0.        , 0.53361154, 0.        , 0.        , 0.53361154,
       0.        , 0.        , 0.        , 0.46395983, 0.        ,
       0.        , 0.46395983])

In [47]:
# fitted SVD step of the pipeline; `components_` holds one row per latent
# component and one column per vocabulary word
svd = pipeline.named_steps['svd']
svd.components_.shape

(2, 12)

In [50]:
# Break the cosine similarity between document 2 (`vec1_*`) and the query
# (`vec2_lsi`) down into one contribution per vocabulary word:
# - `svd.components_.T` dotted with `vec2_lsi` maps the query back to one weight per word
# - multiplying by the document's tf-idf weights keeps only the words that
#   actually occur in the document
# - dividing by the product of the two latent vector norms puts the values on
#   the cosine scale
# NOTE(review): assuming the SVD projection is tf-idf times `components_.T`,
# the entries sum up to the cosine similarity of the two vectors -- confirm
norms = np.linalg.norm(vec1_lsi) * np.linalg.norm(vec2_lsi)
word_sims = vec1_tfidf * np.dot(svd.components_.T, vec2_lsi) / norms
word_sims

array([ 0.        ,  0.21558392,  0.        ,  0.        ,  0.21175185,
        0.        ,  0.        ,  0.        ,  0.31136407,  0.        ,
       -0.        ,  0.26126043])

In [51]:
# Pair every contribution with its vocabulary index and order the pairs from
# the largest to the smallest value.
# NOTE: this rebinds `word_sims` from the per-word array to a list of
# (index, value) tuples, so the cell cannot be run twice in a row -- on the
# second run the key function would try to negate a tuple.
word_sims = sorted(enumerate(word_sims), key=lambda item: -item[1])
word_sims

[(8, 0.3113640685065232),
 (11, 0.2612604265486649),
 (1, 0.21558391641981317),
 (4, 0.21175184939018935),
 (0, 0.0),
 (2, 0.0),
 (3, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (9, 0.0),
 (10, -0.0)]

In [52]:
def printWordSims(word_sims, dictionary, topn=10, min_pos=0.1, max_neg=0.01):
    """
    Print a two column table of the words contributing to a similarity score.

    Row ``i`` pairs ``word_sims[i]`` (left column, walking from the front) with
    ``word_sims[-(i + 1)]`` (right column, walking from the back). The function
    does not sort, the caller is expected to pass the pairs ordered by
    decreasing value.

    Parameters
    ----------
    word_sims : list of (word_id, value) tuples
        Per-word contributions.

    dictionary : sequence or mapping
        Gives the word for a word id, looked up as ``dictionary[word_id]``.

    topn : int, default 10
        Maximum number of rows, capped at ``len(word_sims)``.

    min_pos : float, default 0.1
        A left column word is only shown if its value is strictly greater.

    max_neg : float, default 0.01
        A right column word is only shown if its value is strictly smaller.

    Returns
    -------
    None, the table is written to stdout.
    """
    # TODO - words are padded to a fixed width of 15 characters, longer words
    #        push the columns out of alignment; adapt the width to the
    #        longest word in the results

    pos_fmt = '  %15s  +%.3f'
    neg_fmt = '    %15s  %.3f\n'
    # placeholder as wide as a rendered left column cell,
    # e.g.,     '          freedom  +0.440'
    blank = ' ' * len(pos_fmt % ('', 0.0))

    rows = []
    # never index past the available entries when there are fewer than topn
    for i in range(min(topn, len(word_sims))):
        pos_word_id, pos_word_val = word_sims[i]
        neg_word_id, neg_word_val = word_sims[-(i + 1)]

        pos_word = dictionary[pos_word_id]
        neg_word = dictionary[neg_word_id]

        # If neither word passes its threshold, the rest of the table is empty.
        if pos_word_val <= min_pos and neg_word_val >= max_neg:
            break

        # Left column: the word if it passes the threshold, else empty space.
        if pos_word_val > min_pos:
            rows.append(pos_fmt % (pos_word, pos_word_val))
        else:
            rows.append(blank)

        # Right column: the word if it passes the threshold, else just end the line.
        if neg_word_val < max_neg:
            rows.append(neg_fmt % (neg_word, neg_word_val))
        else:
            rows.append('\n')

    print(''.join(rows))
    
    
# `word_sims` must already be the sorted list of (vocabulary index, value)
# pairs; `vocab` turns each index back into its word
printWordSims(word_sims, vocab)

           system  +0.311              trees  -0.000
             user  +0.261               time  0.000
              eps  +0.216             survey  0.000
        interface  +0.212           response  0.000
                                      minors  0.000
                                       human  0.000
                                       graph  0.000
                                    computer  0.000



In [None]:
# FIXME: this cell has no execution count and looks like a leftover from a
# different (gensim-style) workflow: `dictionary`, `lsi`, `sparse2dense` and
# `vec2_tfidf` are not defined in any cell shown in this notebook, so running
# it would presumably fail with a NameError. It would also rebind
# `vocab_size`, `vec1_lsi`, `vec2_lsi` and `vec1_tfidf`, which earlier cells
# already compute through the scikit-learn pipeline.
vocab_size = len(dictionary.token2id)

vec1_lsi = sparse2dense(lsi[vec1_tfidf], lsi.num_topics)
vec2_lsi = sparse2dense(lsi[vec2_tfidf], lsi.num_topics)
vec1_tfidf = sparse2dense(vec1_tfidf, vocab_size)

In [14]:
# mapping of words to latent factors/concepts,
# i.e. each concept is a linear combination of words
tfidf = pipeline.named_steps['tfidf']

# newer scikit-learn exposes the vocabulary through `get_feature_names_out`
# (the older `get_feature_names` is gone there); use whichever the installed
# version provides
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = list(tfidf.get_feature_names_out())
else:
    vocab = tfidf.get_feature_names()

# one label per fitted component, so the table keeps working when the
# number of components changes
concepts = ['concept%d' % (i + 1) for i in range(svd.components_.shape[0])]
pd.DataFrame(svd.components_, index = concepts, columns = vocab)

Unnamed: 0,abc,applications,binary,computer,engineering,eps,error,generation,graph,human,...,random,relation,response,survey,testing,time,trees,unordered,user,widths
concept1,0.053842,0.053842,0.053385,0.181679,0.048573,0.129152,0.092299,0.053385,0.433814,0.086501,...,0.053385,0.092299,0.214161,0.377852,0.048573,0.214161,0.262912,0.053385,0.26283,0.143002
concept2,0.104054,0.104054,-0.066833,0.208323,0.121922,0.271183,0.125538,-0.066833,-0.321092,0.190862,...,-0.066833,0.125538,0.226469,-0.002877,0.121922,0.226469,-0.262954,-0.066833,0.343158,-0.130742


In [15]:
svd = pipeline.named_steps['svd']
# NOTE(review): this sums `explained_variance_` (the per-component variances),
# not `explained_variance_ratio_` -- confirm which one the label is meant to report
print('total variance explained:', np.sum(svd.explained_variance_))

# mapping of document to latent factors/concepts,
# i.e. each document is a linear combination of the concepts
pd.DataFrame(X_lsa, index = documents, columns = ['concept1', 'concept2'])

total variance explained: 0.19366296540275155


Unnamed: 0,concept1,concept2
human machine interface for lab abc computer applications,0.223864,0.391323
a survey of user opinion of computer system response time,0.567122,0.453591
the eps user interface management system,0.301517,0.520548
system and human system engineering testing of eps,0.150874,0.342541
relation of user perceived response time to error measurement,0.378285,0.46538
the generation of random binary unordered trees,0.190848,-0.216111
the intersection graph of paths in trees,0.475839,-0.427388
graph minors iv widths of trees and well quasi ordering,0.577473,-0.477549
graph minors a survey,0.673129,-0.3107


In [9]:
from sklearn.preprocessing import normalize

# Pairwise document similarity in the latent space: rescale every row of
# `X_lsa` to unit length (L2 is `normalize`'s default norm), after which the
# dot product of two rows is their cosine similarity.
X_normed = normalize(X_lsa, axis = 1)
# entry (i, j) compares document i with document j, the diagonal is 1
similarity = X_normed @ X_normed.T
pd.DataFrame(similarity, index = documents, columns = documents)

Unnamed: 0,Human machine interface for lab abc computer applications,A survey of user opinion of computer system response time,The EPS user interface management system,System and human system engineering testing of EPS,Relation of user perceived response time to error measurement,The generation of random binary unordered trees,The intersection graph of paths in trees,Graph minors IV Widths of trees and well quasi ordering,Graph minors A survey
Human machine interface for lab abc computer applications,1.0,0.929942,0.999986,0.99452,0.986761,-0.321932,-0.21059,-0.170502,0.087079
A survey of user opinion of computer system response time,0.929942,1.0,0.931906,0.886403,0.977265,0.048753,0.163624,0.203766,0.447288
The EPS user interface management system,0.999986,0.931906,1.0,0.993943,0.987619,-0.316835,-0.205329,-0.165199,0.092436
System and human system engineering testing of EPS,0.99452,0.886403,0.993943,1.0,0.964398,-0.41915,-0.31164,-0.272585,-0.017549
Relation of user perceived response time to error measurement,0.986761,0.977265,0.987619,0.964398,1.0,-0.164124,-0.049259,-0.008439,0.24749
The generation of random binary unordered trees,-0.321932,0.048753,-0.316835,-0.41915,-0.164124,1.0,0.993327,0.98779,0.915133
The intersection graph of paths in trees,-0.21059,0.163624,-0.205329,-0.31164,-0.049259,0.993327,1.0,0.999166,0.955523
Graph minors IV Widths of trees and well quasi ordering,-0.170502,0.203766,-0.165199,-0.272585,-0.008439,0.98779,0.999166,1.0,0.966767
Graph minors A survey,0.087079,0.447288,0.092436,-0.017549,0.24749,0.915133,0.955523,0.966767,1.0
