In [1]:
import pandas as pd
import matplotlib.pyplot as plt          # plotting
import numpy as np                       # dense matrices
from scipy.sparse import csr_matrix      # sparse matrices
import json
%matplotlib inline

In [2]:
# Load the people-wiki table from CSV and preview it.
wiki = pd.read_csv('./people_wiki.csv')
print(wiki.shape)  # (number of rows, number of columns)
wiki.head()

(59071, 3)


Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [3]:
def load_sparse_csr(filename):
    """
    Load a CSR sparse matrix from a numpy zipped (.npz) file.

    The archive is expected to hold four arrays named 'data', 'indices',
    'indptr' and 'shape', which are handed to

    csr_matrix((data, indices, indptr), [shape=(M, N)])
        the standard CSR representation where the column indices for row i are
        stored in indices[indptr[i]:indptr[i+1]] and their corresponding values
        are stored in data[indptr[i]:indptr[i+1]].

    See here for reference:
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html

    Parameters
    ----------
    filename : str or file-like
        Passed straight to ``np.load``.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix described by the four stored arrays.
    """
    # np.load on an .npz archive returns a lazily-read NpzFile that keeps the
    # underlying file open; the context manager closes it as soon as the four
    # arrays have been read into memory.
    with np.load(filename) as loader:
        data = loader['data']
        indices = loader['indices']
        indptr = loader['indptr']
        # Plain Python ints so the shape is an ordinary (M, N) tuple.
        shape = tuple(int(dim) for dim in loader['shape'])

    return csr_matrix((data, indices, indptr), shape=shape)

In [4]:
# Load the word-count matrix as a scipy CSR matrix.
# NOTE(review): later cells index it with wiki row labels, so it presumably
# has one row per article in `wiki` -- confirm.
word_count = load_sparse_csr('./people_wiki_word_count.npz')
type(word_count)

scipy.sparse.csr.csr_matrix

In [5]:
# The JSON file is loaded as a dict; the comprehension swaps its keys and
# values, so the frame is indexed by the JSON values and the JSON keys end up
# in the 'word' column. sort_index() orders the rows by that index.
with open('./people_wiki_map_index_to_word.json', 'r') as f:
    data = json.load(f)
map_index_to_word = pd.DataFrame({'word': {v: k for k, v in data.items()}}).sort_index()

map_index_to_word.head()

Unnamed: 0,word
0,bioarchaeologist
1,leaguehockey
2,electionruss
3,teramoto
4,trumpeterpercussionist


In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search with Euclidean distance, fitted on the
# word-count matrix.
model = NearestNeighbors(metric='euclidean', algorithm='brute')
model.fit(word_count)  # build the 'map'

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [7]:
# Look up Barack Obama's row; the index label shown in the output (35817) is
# hard-coded as a matrix row number in later cells.
wiki[wiki['name'] == 'Barack Obama']  # example

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [8]:
# Query the 10 nearest rows to row 35817 (the index label displayed for
# Barack Obama above; hard-coded here rather than looked up by name).
distances, indices = model.kneighbors(word_count[35817], n_neighbors=10)

In [9]:
# Index the distances by the returned row numbers, then join on wiki's index
# to attach the article names; keep only name and distance.
neighbors = pd.DataFrame({'distance':distances.flatten()}, index=indices.flatten())
neighbors = neighbors.join(wiki).sort_values(by='distance')[['name','distance']]
neighbors.head(10)

Unnamed: 0,name,distance
35817,Barack Obama,0.0
24478,Joe Biden,33.075671
28447,George W. Bush,34.394767
35357,Lawrence Summers,36.152455
14754,Mitt Romney,36.166283
13229,Francisco Barrio,36.331804
31423,Walter Mondale,36.400549
22745,Wynn Normington Hugh-Jones,36.496575
36364,Don Bonker,36.633318
9210,Andy Anstett,36.959437


In [10]:
def unpack_dict(matrix, map_index_to_word):
    """
    Convert every row of a CSR matrix into a {word: value} dictionary.

    matrix            : CSR matrix; its data / indices / indptr arrays are read directly.
    map_index_to_word : table whose 'word' column, as a list, maps a column
                        number to its word.

    Returns a list with one dictionary per row of `matrix`.
    """
    vocabulary = map_index_to_word['word'].tolist()
    values, columns, row_ptr = matrix.data, matrix.indices, matrix.indptr

    rows = []
    for doc_id in range(matrix.shape[0]):
        # Entries of row doc_id live in the half-open slice [start, stop).
        start, stop = row_ptr[doc_id], row_ptr[doc_id + 1]
        words = [vocabulary[col] for col in columns[start:stop]]
        rows.append(dict(zip(words, values[start:stop].tolist())))
    return rows

# Add a column holding one {word: count} dictionary per row of word_count.
# Note this mutates `wiki` in place.
wiki['word_count'] = unpack_dict(word_count, map_index_to_word)

wiki.head()

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'brisbaneafter': 1, 'edflhe': 1, 'aflfrom': 1..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'maladaptation': 1, 'phasedelay': 1, '25hour'..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'germanyover': 1, 'bluesgospel': 1, 'harpdog'..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'fantasticrottensteiner': 1, 'waidmannsfeld':..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'arhm': 3, 'gangstergenka': 1, 'kuhnja': 1, '..."


In [11]:
def top_words(name):
    """
    Get a table of the most frequent words in the given person's wikipedia page.

    Reads the notebook-level `wiki` DataFrame: the row whose 'name' equals
    `name` must be unique, and its 'word_count' entry must be a
    {word: count} dictionary.

    Parameters
    ----------
    name : str
        Value to match against wiki['name'].

    Returns
    -------
    pandas.DataFrame
        Indexed by 'word', with a single 'count' column sorted in
        descending order.

    Raises
    ------
    ValueError
        If zero or more than one row of `wiki` has this name.
    """
    row = wiki[wiki['name'] == name]
    if row.shape[0] != 1:
        # The check fires for "no match" as well as "several matches", so the
        # message reports how many rows were actually found.
        raise ValueError(
            'Expected exactly one article named {!r}, found {}.'.format(name, row.shape[0]))

    counts = row['word_count'].values[0]
    word_count_table = pd.DataFrame(list(counts.items()), columns=['word', 'count'])

    return word_count_table.set_index('word').sort_values('count', ascending=False)

# Word-count tables for the two articles, each sorted by descending count.
obama_words = top_words('Barack Obama')
barrio_words = top_words('Francisco Barrio')
obama_words.head()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
the,40
in,30
and,21
of,18
to,14


In [12]:
# Inner join on the 'word' index keeps only words present in both tables; the
# suffixes tell the two 'count' columns apart.
combined_words = obama_words.join(barrio_words, how='inner',
                                  lsuffix='_Obama', rsuffix='_Barrio')
combined_words.head()

Unnamed: 0_level_0,count_Obama,count_Barrio
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,40,36
in,30,17
and,21,18
of,18,24
to,14,9


## Quiz Question. 
Among the words that appear in both the Barack Obama and Francisco Barrio articles, take the 5 that appear most frequently in Obama's article. How many of the articles in the Wikipedia dataset contain all of those 5 words?

In [13]:
def has_top_words(word_count_vector, common_words):
    """
    Report whether a document contains every word in `common_words`.

    word_count_vector : dict keyed by word
    common_words      : iterable of words to look for

    Returns True when all of `common_words` are keys of
    `word_count_vector`, False otherwise.
    """
    vocabulary = set(word_count_vector.keys())
    required = set(common_words)
    return required <= vocabulary

# Count the articles whose word_count dict contains all five words; summing
# the boolean results counts the True values.
# NOTE(review): the word list is hard-coded from the combined_words preview
# above rather than derived from combined_words.
wiki['word_count'].apply(has_top_words, args=(['the', 'in', 'and', 'of', 'to'],)).sum()

56066

In [14]:
# Spot-check a single article (positional row 33) against the same five words.
has_top_words(wiki.iloc[33,:]['word_count'], ['the', 'in', 'and', 'of', 'to'])

False

## Quiz Question. 
Measure the pairwise distance between the Wikipedia pages of Barack Obama, George W. Bush, and Joe Biden. Which of the three pairs has the smallest distance?

In [15]:
# Reuse the Obama neighbour table computed earlier: its first rows give the
# Obama-Biden and Obama-Bush distances.
neighbors.head(3)

Unnamed: 0,name,distance
35817,Barack Obama,0.0
24478,Joe Biden,33.075671
28447,George W. Bush,34.394767


In [16]:
# 24478 is the index label shown for Joe Biden in the neighbour table above.
# NOTE(review): n_neighbors=2000 looks like a way to make sure both Bush and
# Obama fall inside the returned neighbourhood -- confirm; a direct pairwise
# distance would avoid the guess.
# This overwrites the `distances` / `indices` variables from the Obama query.
distances, indices = model.kneighbors(word_count[24478], n_neighbors=2000)
df_tmp = pd.DataFrame({'distance':distances.flatten()}, index=indices.flatten()).\
join(wiki).sort_values(by='distance')[['name','distance']]
df_tmp[df_tmp['name'].isin(['George W. Bush', 'Barack Obama'])]

Unnamed: 0,name,distance
28447,George W. Bush,32.756679
35817,Barack Obama,33.075671


## Quiz Question. 
Collect all words that appear in both the Barack Obama and George W. Bush pages. Out of those words, find the 10 words that show up most often in Obama's page.

In [17]:
# Inner join of the Obama and Bush word-count tables on 'word'; show the
# first 10 rows of the joined table.
bush_words = top_words('George W. Bush')
obama_words.join(bush_words, how='inner',
                                  lsuffix='_Obama', rsuffix='_Bush').head(10)

Unnamed: 0_level_0,count_Obama,count_Bush
word,Unnamed: 1_level_1,Unnamed: 2_level_1
the,40,39
in,30,22
and,21,14
of,18,14
to,14,11
his,11,6
act,8,3
he,7,8
a,7,6
law,6,1


## tf-idf

In [18]:
# Load the TF-IDF matrix and add a column holding one {word: weight}
# dictionary per row. Note this mutates `wiki` in place.
tf_idf = load_sparse_csr('people_wiki_tf_idf.npz')
wiki['tf_idf'] = unpack_dict(tf_idf, map_index_to_word)
wiki.head()

Unnamed: 0,URI,name,text,word_count,tf_idf
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'brisbaneafter': 1, 'edflhe': 1, 'aflfrom': 1...","{'brisbaneafter': 10.986495389225194, 'edflhe'..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'maladaptation': 1, 'phasedelay': 1, '25hour'...","{'maladaptation': 10.986495389225194, 'phasede..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'germanyover': 1, 'bluesgospel': 1, 'harpdog'...","{'germanyover': 10.986495389225194, 'bluesgosp..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'fantasticrottensteiner': 1, 'waidmannsfeld':...","{'fantasticrottensteiner': 10.986495389225194,..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'arhm': 3, 'gangstergenka': 1, 'kuhnja': 1, '...","{'arhm': 32.95948616767558, 'gangstergenka': 1..."


In [19]:
# Same brute-force Euclidean nearest-neighbour setup as `model`, fitted on
# the TF-IDF matrix instead of the word counts.
model_tf_idf = NearestNeighbors(metric='euclidean', algorithm='brute')
model_tf_idf.fit(tf_idf)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [31]:
# Nearest rows to row 35817 (Barack Obama's index label) under TF-IDF.
# This overwrites `distances` / `indices` from the earlier queries.
distances, indices = model_tf_idf.kneighbors(tf_idf[35817], n_neighbors=2000)

In [33]:
# Index the distances by the returned row numbers, join on wiki's index to
# attach the article names, and keep only name and distance.
neighbors_tfidf = pd.DataFrame({'distance':distances.flatten()}, index=indices.flatten())
neighbors_tfidf = neighbors_tfidf.join(wiki).sort_values(by='distance')[['name','distance']]
neighbors_tfidf.head(10)

Unnamed: 0,name,distance
35817,Barack Obama,0.0
7914,Phil Schiliro,106.861014
46811,Jeff Sessions,108.871674
44681,Jesse Lee (politician),109.045698
38376,Samantha Power,109.108106
6507,Bob Menendez,109.781867
38714,Eric Stern (politician),109.957788
44825,James A. Guest,110.413889
44368,Roland Grossenbacher,110.470609
33417,Tulsi Gabbard,110.696998


In [34]:
def top_words_tf_idf(name):
    """
    Get a table of the highest TF-IDF weighted words in the given person's wikipedia page.

    Reads the notebook-level `wiki` DataFrame: the row whose 'name' equals
    `name` must be unique, and its 'tf_idf' entry must be a
    {word: weight} dictionary.

    Parameters
    ----------
    name : str
        Value to match against wiki['name'].

    Returns
    -------
    pandas.DataFrame
        Indexed by 'word', with a single 'weight' column sorted in
        descending order.

    Raises
    ------
    ValueError
        If zero or more than one row of `wiki` has this name.
    """
    row = wiki[wiki['name'] == name]
    if row.shape[0] != 1:
        # The check fires for "no match" as well as "several matches", so the
        # message reports how many rows were actually found.
        raise ValueError(
            'Expected exactly one article named {!r}, found {}.'.format(name, row.shape[0]))

    weights = row['tf_idf'].values[0]
    weight_table = pd.DataFrame(list(weights.items()), columns=['word', 'weight'])

    return weight_table.set_index('word').sort_values('weight', ascending=False)

# TF-IDF weight tables for Obama and for Phil Schiliro (the closest TF-IDF
# neighbour in the table above). print() is used so both previews show up in
# one cell, at the cost of plain-text rendering.
obama_tf_idf = top_words_tf_idf('Barack Obama')
print(obama_tf_idf.head())

schiliro_tf_idf = top_words_tf_idf('Phil Schiliro')
print(schiliro_tf_idf.head())

            weight
word              
obama    43.295653
act      27.678223
iraq     17.747379
control  14.887061
law      14.722936
                    weight
word                      
schiliro         21.972991
staff            15.856442
congressional    13.547088
daschleschiliro  10.986495
obama             9.621256


## Quiz Question. 
Among the words that appear in both the Barack Obama and Phil Schiliro articles, take the 5 that have the largest weights in Obama's article. How many of the articles in the Wikipedia dataset contain all of those 5 words?

In [35]:
# Inner join on 'word' keeps only words present in both weight tables.
# NOTE(review): this needs obama_tf_idf to still be the DataFrame returned by
# top_words_tf_idf; a later cell rebinds that name to a sparse matrix row.
obama_tf_idf.join(schiliro_tf_idf, how='inner', lsuffix='_Obama', rsuffix='_Schiliro').head()

Unnamed: 0_level_0,weight_Obama,weight_Schiliro
word,Unnamed: 1_level_1,Unnamed: 2_level_1
obama,43.295653,9.621256
law,14.722936,7.361468
democratic,12.410689,6.205344
senate,10.164288,3.388096
presidential,7.386955,3.693478


In [36]:
# Count the articles containing all five words. Only dictionary keys are
# tested, so the 'word_count' column is used for the membership check.
# NOTE(review): the word list is hard-coded from the join preview above.
wiki['word_count'].apply(
    has_top_words, args=(['obama', 'law', 'democratic', 'senate', 'presidential'],)).sum()

14

## Quiz Question. 
Compute the Euclidean distance between TF-IDF features of Obama and Biden.

In [37]:
# Compute the length of every document as the number of pieces obtained by
# splitting its text on single spaces.
# NOTE(review): split(' ') yields empty strings for consecutive spaces, so
# this can over-count compared with split() -- confirm which is intended.
wiki['length'] = wiki['text'].apply(lambda x: len(x.split(' ')))

wiki.head()

Unnamed: 0,URI,name,text,word_count,tf_idf,length
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'brisbaneafter': 1, 'edflhe': 1, 'aflfrom': 1...","{'brisbaneafter': 10.986495389225194, 'edflhe'...",251
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'maladaptation': 1, 'phasedelay': 1, '25hour'...","{'maladaptation': 10.986495389225194, 'phasede...",223
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'germanyover': 1, 'bluesgospel': 1, 'harpdog'...","{'germanyover': 10.986495389225194, 'bluesgosp...",226
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'fantasticrottensteiner': 1, 'waidmannsfeld':...","{'fantasticrottensteiner': 10.986495389225194,...",377
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'arhm': 3, 'gangstergenka': 1, 'kuhnja': 1, '...","{'arhm': 32.95948616767558, 'gangstergenka': 1...",201


In [52]:
# Rebuild Joe Biden's TF-IDF features as a 1 x vocabulary-size sparse row from
# the {word: weight} dictionary stored in wiki['tf_idf'].
name_idx = wiki[wiki['name'] == 'Joe Biden'].index.values[0]
biden_tf_idf = wiki.loc[name_idx, 'tf_idf']
# Build the word -> column-number lookup once, instead of scanning the whole
# vocabulary table with a boolean mask for every word. Words come from the
# keys of the loaded JSON mapping, so each word maps to exactly one index.
word_to_index = dict(zip(map_index_to_word['word'], map_index_to_word.index))
word_indices = [word_to_index[word] for word in biden_tf_idf.keys()]
# COO-style construction: every entry goes in row 0, at its word's column.
text_tf_idf = csr_matrix(
    (list(biden_tf_idf.values()), ([0] * len(word_indices), word_indices)),
    shape=(1, tf_idf.shape[1]))

In [56]:
# NOTE(review): mid-notebook import, and cosine_distances is not used in this
# cell.
from sklearn.metrics.pairwise import cosine_distances, euclidean_distances
# NOTE(review): this rebinds obama_tf_idf (previously the DataFrame returned
# by top_words_tf_idf) to a sparse row of tf_idf, so earlier cells that join
# on it will fail if re-run after this one.
obama_tf_idf = tf_idf[35817]
# Euclidean distance between the rebuilt Joe Biden vector and Obama's row.
euclidean_distances(text_tf_idf, obama_tf_idf)

array([[ 123.29745601]])

In [54]:
# Inspect the rebuilt vector's shape and number of stored elements.
text_tf_idf[0]

<1x547979 sparse matrix of type '<class 'numpy.float64'>'
	with 219 stored elements in Compressed Sparse Row format>

## Cosine distance

In [27]:
# Brute-force nearest-neighbour search on the TF-IDF matrix using cosine
# distance, then query the 100 nearest rows to row 35817 (Barack Obama's
# index label). This overwrites `distances` / `indices` again.
model2_tf_idf = NearestNeighbors(algorithm='brute', metric='cosine')
model2_tf_idf.fit(tf_idf)
distances, indices = model2_tf_idf.kneighbors(tf_idf[35817], n_neighbors=100)

In [28]:
# Index the cosine distances by the returned row numbers, join on wiki's
# index to attach the article names, and keep only name and distance.
neighbors2_tfidf = pd.DataFrame({'distance':distances.flatten()}, index=indices.flatten())
neighbors2_tfidf = neighbors2_tfidf.join(wiki).sort_values(by='distance')[['name','distance']]
neighbors2_tfidf.head(10)

Unnamed: 0,name,distance
35817,Barack Obama,0.0
24478,Joe Biden,0.703139
38376,Samantha Power,0.742982
57108,Hillary Rodham Clinton,0.758358
38714,Eric Stern (politician),0.770561
46140,Robert Gibbs,0.784678
6796,Eric Holder,0.788039
44681,Jesse Lee (politician),0.790926
18827,Henry Waxman,0.798323
2412,Joe the Plumber,0.799466
