In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load the people dump; later cells read its 'name' and 'text' columns.
people = pd.read_csv('people_wiki.csv')

# Fit TF-IDF on the article texts; X is the resulting document-term matrix
# with one row per row of `people`.
analyer = TfidfVectorizer()
X = analyer.fit_transform(people['text'])

# Reverse lookup of the fitted vocabulary: column index -> word.
idx_2_word_mapping = dict(
    zip(analyer.vocabulary_.values(), analyer.vocabulary_.keys())
)

In [26]:
from collections import Counter

def process(name):
    """Return word counts, TF-IDF weights and the dense TF-IDF row for one article.

    Parameters
    ----------
    name : str
        Compared for exact equality against ``people['name']``; when several
        rows match, the first one is used.

    Returns
    -------
    word_count : pandas.DataFrame
        Indexed by whitespace-separated token of the article text, with a
        single column ``0`` holding the count, sorted ascending.
    word_count_tfidf : pandas.DataFrame
        Indexed by vocabulary word, with a single column ``0`` holding the
        article's TF-IDF weight for that word, sorted ascending.
    text_tfidf_dense : numpy.ndarray
        The article's row of ``X`` as a dense 1-D array.

    Raises
    ------
    IndexError
        If no row of ``people`` has the given name.
    """
    # Filter once and reuse the result for both the text and the row label.
    matches = people[people['name'] == name]
    if matches.empty:
        # Same exception type the bare ``[0]`` lookup used to raise, but with a
        # message that says what actually went wrong.
        raise IndexError('no article named {!r} in people'.format(name))

    ## simple word count
    text = matches['text'].values[0]
    word_count = pd.DataFrame.from_dict(Counter(text.split()), orient='index').sort_values(0)

    # The index label is used as a row position in X.
    # NOTE(review): this assumes `people` keeps its default 0..n-1 index — confirm.
    idx = matches.index.values[0]
    text_tfidf_dense = X[idx].toarray()[0]

    # Label every TF-IDF column with its word and build the frame in one go,
    # instead of filling a dict entry by entry for the whole vocabulary.
    words = [idx_2_word_mapping[i] for i in range(len(text_tfidf_dense))]
    word_count_tfidf = pd.DataFrame(text_tfidf_dense, index=words).sort_values(0)

    return word_count, word_count_tfidf, text_tfidf_dense

In [4]:
# Word-count frame, TF-IDF frame and dense TF-IDF vector for the Elton John article.
ej_wc, ej_wc_tfidf, ej_tfidf = process('Elton John')

In [5]:
# process() sorts ascending, so the last five rows are the five most frequent tokens.
ej_wc[-5:]

Unnamed: 0,0
a,10
of,13
and,15
in,18
the,27


In [6]:
# Same slice on the TF-IDF frame: the five words with the highest TF-IDF weight.
ej_wc_tfidf[-5:]

Unnamed: 0,0
furnish,0.181221
elton,0.184686
john,0.188958
billboard,0.192207
the,0.243684


In [7]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
from scipy.spatial.distance import cosine
# Second article to compare against Elton John.
vb_wc, vb_wc_tfidf, vb_tfidf = process('Victoria Beckham')

In [8]:
# scipy's cosine() is a distance (1 - cosine similarity): lower means more similar.
cosine(ej_tfidf, vb_tfidf)

0.85192118138271944

In [9]:
# Third article, for a second pairwise comparison with Elton John.
pm_wc, pm_wc_tfidf, pm_tfidf = process('Paul McCartney')

In [10]:
# Cosine distance again (lower = closer); compare with the Victoria Beckham value above.
cosine(ej_tfidf, pm_tfidf)

0.69231324786877946

In [7]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
from sklearn.neighbors import NearestNeighbors
# Nearest-neighbour index over the TF-IDF rows of X, using the 'l2' metric.
neighbor = NearestNeighbors(metric='l2')
neighbor.fit(X)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='l2',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [12]:
# Index label of the query article, used below as a row position in X
# (assumes `people` has its default 0..n-1 index — TODO confirm).
idx = people[people['name'] == 'Victoria Beckham'].index.values[0]
# Ask for 2 neighbours; the result is (distances, row indices). In the output
# below the first hit is at distance 0, so the second one is the neighbour of interest.
neighbor.kneighbors(X[idx], 2)

(array([[ 0.        ,  1.04544409]]), array([[50411, 23386]]))

In [13]:
# 23386 is the second row index printed by the kneighbors call above, copied by hand.
people.iloc[23386, :]

URI           <http://dbpedia.org/resource/David_Beckham>
name                                        David Beckham
text    david robert joseph beckham obe bkm born 2 may...
Name: 23386, dtype: object

In [38]:
# Build a raw word-count frame for the first 10 articles: one row per article,
# one column per word of the fitted TF-IDF vocabulary (sorted alphabetically).
vocabulary = sorted(analyer.vocabulary_)
rows = []
for i in range(10):
    # Start from an all-zero count for every vocabulary word.
    tmp_dict = dict.fromkeys(vocabulary, 0)
    # .loc replaces the removed .ix indexer; for integer labels it selects the same row.
    wc = Counter(people.loc[i, 'text'].split())
    # Keep only tokens the vectorizer knows. Plain whitespace splitting also
    # yields tokens outside the vocabulary, which would otherwise show up as
    # extra columns filled with NaN for every other article.
    tmp_dict.update((word, count) for word, count in wc.items() if word in tmp_dict)
    rows.append(tmp_dict)

# Assemble the frame once instead of growing it with DataFrame.append inside
# the loop (quadratic copying, and removed in pandas 2.0). float dtype matches
# what the append-based construction produced.
df_X_raw = pd.DataFrame(rows, columns=vocabulary, dtype=float)

In [35]:
# NOTE(review): scratch cell — reuses the tmp_dict left over from the loop cell and
# does not assign the result, so df_X_raw itself is not changed here.
df_X_raw.append(tmp_dict, ignore_index=True)

Unnamed: 0,00,000,0000,00000,00000van,0001,00014338,0001sec,0002,00026,...,zyx,zyzzyva,zyzzyza,zz,zzap64,zzb,zzebra,zzran,zzt,zzts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [37]:
# Display the raw word-count frame (one column per vocabulary word, so very wide).
df_X_raw

In [15]:
# Second nearest-neighbour index with the same 'l2' metric, fitted on X_raw.
# NOTE(review): X_raw is not assigned in any cell shown here (only df_X_raw is);
# unless it is defined elsewhere, Restart & Run All fails on this line — confirm its origin.
neighbor_raw = NearestNeighbors(metric='l2')
neighbor_raw.fit(X_raw)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='l2',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [16]:
# Rebinds idx to the Elton John row label (it previously held Victoria Beckham's).
idx = people[people['name'] == 'Elton John'].index.values[0]
# Same 2-neighbour query as before, against the X_raw index; the first hit in the
# output below is at distance 0, so the second one is the neighbour of interest.
neighbor_raw.kneighbors(X_raw[idx], 2)

(array([[ 0.        ,  0.25444756]]), array([[19923, 28825]]))

In [13]:
# 28825 is the second row index printed by the neighbor_raw.kneighbors call above, copied by hand.
people.iloc[28825, :]

URI             <http://dbpedia.org/resource/Rod_Stewart>
name                                          Rod Stewart
text    roderick david rod stewart cbe born 10 january...
Name: 28825, dtype: object