In [34]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy import spatial

In [4]:
# Load the people corpus; the preview below shows URI, name and text columns.
people = pd.read_csv(filepath_or_buffer="../../data/people_wiki.csv")
people.head(n=5)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [5]:
# Two encoders over the same corpus: raw term counts and TF-IDF weights.
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()


def _vocabulary_terms(vectorizer):
    """Return the fitted vocabulary of `vectorizer` as a list of terms.

    Position ``i`` in the returned list names column ``i`` of the matrix the
    vectorizer produced, which is what the lookup helpers below rely on.

    ``get_feature_names()`` was deprecated in scikit-learn 1.0 and removed in
    1.2 in favour of ``get_feature_names_out()``; prefer the new method and
    fall back to the old one so the notebook runs on either side of the change.
    """
    names_out = getattr(vectorizer, "get_feature_names_out", None)
    if names_out is not None:
        # The new API returns an ndarray; keep the original list type.
        return list(names_out())
    return vectorizer.get_feature_names()


count_weights = count_vectorizer.fit_transform(people['text'])
count_features = _vocabulary_terms(count_vectorizer)

tfidf_weights = tfidf_vectorizer.fit_transform(people['text'])
tfidf_features = _vocabulary_terms(tfidf_vectorizer)

## Compare the top words ranked by raw word count with those ranked by TF-IDF:

In [24]:
# Keep only the row(s) whose name is exactly 'Elton John'.
elton_john = people.loc[people['name'].eq('Elton John')]

def get_top_weights(rownum, weights, top_k=10):
    """Return the `top_k` largest values in one row of `weights`, descending.

    Parameters
    ----------
    rownum : int
        Positional row index into `weights`.
    weights : scipy sparse matrix
        Document-term matrix; must support row indexing and ``.toarray()``.
    top_k : int, default 10
        Maximum number of values to return.

    Returns
    -------
    numpy.ndarray
        1-D array of at most `top_k` values, largest first.
    """
    # Densify only the requested row: converting the whole matrix just to
    # read one row costs memory proportional to documents x vocabulary.
    row = weights[rownum].toarray().ravel()
    return np.sort(row)[::-1][:top_k]
def get_top_features(rownum, weights, features, top_k=10):
    """Return the names of the `top_k` highest-weighted columns in one row.

    Parameters
    ----------
    rownum : int
        Positional row index into `weights`.
    weights : scipy sparse matrix
        Document-term matrix; must support row indexing and ``.toarray()``.
    features : sequence
        Column names; ``features[i]`` names column ``i`` of `weights`.
    top_k : int, default 10
        Maximum number of names to return.

    Returns
    -------
    list
        At most `top_k` entries of `features`, ordered from the highest
        weight in the row to the lowest.
    """
    # Densify only the requested row rather than the entire matrix.
    weight_vec = weights[rownum].toarray().ravel()
    top_idx = np.argsort(weight_vec)[::-1][:top_k]
    return [features[i] for i in top_idx]

In [14]:
# Most frequent words in the Elton John article by raw count.
# The index label doubles as the matrix row number here.
get_top_features(
    rownum=elton_john.index.values[0],
    weights=count_weights,
    features=count_features,
)

['the', 'in', 'and', 'of', 'has', 'he', 'john', 'on', 'award', 'for']

In [17]:
# The raw counts that go with the top words of the Elton John article.
get_top_weights(
    rownum=elton_john.index.values[0],
    weights=count_weights,
)

[27 18 15 13  9  7  7  6  5  5]


In [33]:
# Same lookup as for the raw counts, but against the TF-IDF matrix.
elton_john_tfidf_features = get_top_features(
    rownum=elton_john.index.values[0],
    weights=tfidf_weights,
    features=tfidf_features,
)
elton_john_tfidf_weights = get_top_weights(
    rownum=elton_john.index.values[0],
    weights=tfidf_weights,
)
# Show each top word next to its TF-IDF weight.
pd.DataFrame(dict(word=elton_john_tfidf_features, tfidf=elton_john_tfidf_weights))

Unnamed: 0,tfidf,word
0,0.243684,the
1,0.192207,billboard
2,0.188958,john
3,0.184686,elton
4,0.181221,furnish
5,0.162596,in
6,0.135467,and
7,0.119493,songwriters
8,0.118768,award
9,0.118181,top


## Measuring distance:

In [32]:
# Select the two articles to compare against Elton John (exact name match).
victoria_beckham = people[people['name'] == "Victoria Beckham"]
paul_mccartney = people[people['name'] == "Paul McCartney"]

# scipy's cosine() is defined on 1-D vectors. `.todense()` returns a 2-D
# np.matrix of shape (1, vocabulary), which recent SciPy releases reject, so
# convert to a plain ndarray and flatten it instead.
elton_john_tdidf = tfidf_vectorizer.transform(elton_john['text']).toarray().ravel()
victoria_beckham_tdidf = tfidf_vectorizer.transform(victoria_beckham['text']).toarray().ravel()
paul_mccartney_tdidf = tfidf_vectorizer.transform(paul_mccartney['text']).toarray().ravel()

# Cosine distance = 1 - cosine similarity; smaller means more similar.
print(spatial.distance.cosine(elton_john_tdidf, victoria_beckham_tdidf))
print(spatial.distance.cosine(elton_john_tdidf, paul_mccartney_tdidf))

0.8519211813827194
0.6923132478687795


## Building nearest neighbors models with different input features and setting the distance metric:

In [35]:
# Attach each document's vector to its row as a single array-valued cell.
# NOTE(review): this densifies both full matrices (documents x vocabulary),
# which can need a great deal of memory on a large corpus.
people['word_count'] = [doc_vec for doc_vec in count_weights.toarray()]
people['tfidf'] = [doc_vec for doc_vec in tfidf_weights.toarray()]

Unnamed: 0,URI,name,text,tfidf
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [None]:
# Fit on the sparse count matrix directly. It holds the same rows, in the
# same order, as people['word_count'], but avoids building a dense
# documents x vocabulary array from a list of per-row arrays.
word_count_knn_model = NearestNeighbors(metric='cosine').fit(count_weights)

In [None]:
elton_john = people[people['name']=='Elton John']
# `.tolist` must be called: passing the bound method itself hands kneighbors
# a function object instead of the query vectors and the call fails.
dist, idx = word_count_knn_model.kneighbors(elton_john['word_count'].tolist(), n_neighbors=5)