In [1]:
import os

import graphlab

# The product key is a license credential, so it is read from the environment
# rather than being committed inside the notebook.
# NOTE(review): when GRAPHLAB_PRODUCT_KEY is unset the call is skipped and GraphLab
# is left with whatever key is already configured on this machine — confirm the
# host has one before running the rest of the notebook.
product_key = os.environ.get('GRAPHLAB_PRODUCT_KEY')
if product_key:
    graphlab.product_key.set_product_key(product_key)

# Limit number of worker processes. This preserves system memory, which prevents hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1526584118.log


This non-commercial license of GraphLab Create for academic use is assigned to chuoguejiofor@gmail.com and will expire on May 13, 2019.


In [4]:
# Load the people dataset (URI, name, text columns) from the SFrame directory on disk.
people_wiki_path = '../data/people_wiki.gl/'
people = graphlab.SFrame(people_wiki_path)

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


In [5]:
# Build both text features first, then attach them to the frame as new columns.
word_counts = graphlab.text_analytics.count_words(people['text'])
tfidf = graphlab.text_analytics.tf_idf(word_counts)

people['word_count'] = word_counts
people['tfidf'] = tfidf

## Compare the top words ranked by word count with those ranked by TF-IDF:

In [16]:
# Pick out the Elton John row, then unpack its word_count column into
# one (word, count) row per distinct word.
is_elton_john = people['name'] == 'Elton John'
elton_john = people[is_elton_john]
elton_word_count_table = elton_john[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)

# Most frequent words first.
elton_word_count_table.sort('count', ascending=False)

word,count
the,27
in,18
and,15
of,13
a,10
has,9
john,7
he,7
on,6
award,5


In [17]:
# Unpack the tfidf column into one (word, tfidf) row per word,
# then show the highest-weighted words first.
elton_tfidf_table = elton_john[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tfidf'],
)
elton_tfidf_table.sort('tfidf', ascending=False)

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
tonightcandle,10.9864953892
overallelton,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


## Measuring distance:

In [19]:
victoria_beckham = people[people['name'] == "Victoria Beckham"]
paul_mccartney = people[people['name'] == "Paul McCartney"]

# Cosine distance between Elton John's TF-IDF vector and each other person's,
# printed in the order Victoria Beckham, Paul McCartney.
elton_tfidf_vector = elton_john['tfidf'][0]
for other_person in (victoria_beckham, paul_mccartney):
    print(graphlab.distances.cosine(elton_tfidf_vector, other_person['tfidf'][0]))

0.956700637666
0.825031002922


## Building nearest neighbors models with different input features and setting the distance metric:

In [21]:
# Both models share the label column and distance metric; only the feature column differs.
knn_shared_options = dict(label='name', distance='cosine')

word_count_knn_model = graphlab.nearest_neighbors.create(
    people, features=['word_count'], **knn_shared_options)
tfidf_knn_model = graphlab.nearest_neighbors.create(
    people, features=['tfidf'], **knn_shared_options)

In [22]:
# Nearest neighbors of Elton John under the word-count model.
elton_word_count_neighbors = word_count_knn_model.query(elton_john)
elton_word_count_neighbors

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [23]:
# Nearest neighbors of Elton John under the TF-IDF model.
elton_tfidf_neighbors = tfidf_knn_model.query(elton_john)
elton_tfidf_neighbors

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [24]:
# Nearest neighbors of Victoria Beckham under the word-count model.
victoria_word_count_neighbors = word_count_knn_model.query(victoria_beckham)
victoria_word_count_neighbors

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


In [25]:
# Nearest neighbors of Victoria Beckham under the TF-IDF model.
victoria_tfidf_neighbors = tfidf_knn_model.query(victoria_beckham)
victoria_tfidf_neighbors

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
