In [1]:
import graphlab as gl

In [2]:
# Load the people dataset into an SFrame.
people_path = 'people_wiki.gl'
people = gl.SFrame(people_path)

This non-commercial license of GraphLab Create for academic use is assigned to eric.leung@alumni.utoronto.ca and will expire on September 03, 2017.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1475301091.log


In [3]:
# Preview the first three rows (URI, name, text).
people.head(n=3)

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...


In [4]:
# Number of rows (articles) in the people SFrame.
len(people)

59071

## Generate word_count column from text column

In [5]:
# Count the words of every article's text and store the per-article
# {word: count} dict in a new 'word_count' column.
article_text = people['text']
people['word_count'] = gl.text_analytics.count_words(article_text)

In [6]:
# Preview the first three rows, now including the word_count column.
people.head(n=3)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'selection': 1, 'carltons': 1, 'being': ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'precise': 1, 'thomas': 1, 'closely': 1, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'just': 1, 'issued': 1, 'mainly': 1, 'nominat ..."


## Generate tf-idf column from word_count column

In [7]:
# Turn the per-article word counts into TF-IDF weights and store them in a
# new 'tfidf' column.
article_word_counts = people['word_count']
people['tfidf'] = gl.text_analytics.tf_idf(article_word_counts)

In [8]:
# Preview the first three rows, now including the tfidf column.
people.head(n=3)

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'selection': 1, 'carltons': 1, 'being': ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'precise': 1, 'thomas': 1, 'closely': 1, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'just': 1, 'issued': 1, 'mainly': 1, 'nominat ..."

tfidf
"{'selection': 3.836578553093086, ..."
"{'precise': 6.44320060695519, ..."
"{'just': 2.7007299687108643, ..."


## Examine Elton John

In [9]:
# Select the row(s) of `people` whose name is exactly 'Elton John'.
elton_john = people[people['name'] == 'Elton John']
# Later cells read elton_john['tfidf'][0]; fail here with a clear message if
# the name matched nothing, instead of erroring further down the notebook.
assert len(elton_john) > 0, "no article named 'Elton John' found in people"

In [10]:
# Display the selected Elton John row, including its word_count and tfidf dicts.
elton_john

URI,name,text,word_count
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...,"{'all': 1, 'least': 1, 'producer': 1, 'heavi ..."

tfidf
"{'all': 1.6431112434912472, ..."


## Turn word_count column (dict) into a table

In [11]:
# Unpack the word_count dict into one (word, count) row per word and order the
# rows by count, largest first.
elton_john_wordcount_table = (
    elton_john[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False)
)

In [12]:
# The five most frequent words in the Elton John article.
elton_john_wordcount_table.head(n=5)

word,count
the,27
in,18
and,15
of,13
a,10


In [13]:
# Unpack the tfidf dict into one (word, tfidf) row per word, then order the
# rows by TF-IDF weight, largest first.
elton_john_tfidf_only = elton_john[['tfidf']]
elton_john_tfidf_stacked = elton_john_tfidf_only.stack(
    'tfidf', new_column_name=['word', 'tfidf'])
elton_john_tfidf_table = elton_john_tfidf_stacked.sort('tfidf', ascending=False)

In [14]:
# The five words with the largest TF-IDF weight in the Elton John article.
elton_john_tfidf_table.head(n=5)

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447


# Compute the distance between Elton John's article and those of two other famous singers

In [15]:
# Select the articles of the two singers that are compared with Elton John.
victoria_beckham = people[people['name'] == 'Victoria Beckham']
paul_mcCartney = people[people['name'] == 'Paul McCartney']
# The distance cells below index row 0 of each selection; fail here with a
# clear message if either name matched nothing.
assert len(victoria_beckham) > 0, "no article named 'Victoria Beckham' found in people"
assert len(paul_mcCartney) > 0, "no article named 'Paul McCartney' found in people"

In [16]:
# Cosine distance between the TF-IDF dicts of the Elton John and
# Victoria Beckham articles.
elton_john_tfidf = elton_john['tfidf'][0]
victoria_beckham_tfidf = victoria_beckham['tfidf'][0]
gl.distances.cosine(elton_john_tfidf, victoria_beckham_tfidf)

0.9567006376655429

In [17]:
# Cosine distance between the TF-IDF dicts of the Elton John and
# Paul McCartney articles.
elton_john_tfidf = elton_john['tfidf'][0]
paul_mccartney_tfidf = paul_mcCartney['tfidf'][0]
gl.distances.cosine(elton_john_tfidf, paul_mccartney_tfidf)

0.8250310029221779

# Build a nearest neighbor model for document retrieval

In [18]:
# Nearest-neighbour model over the TF-IDF features, labelled by article name
# and compared with cosine distance.
knn_model = gl.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

In [19]:
# Nearest-neighbour model over the raw word-count features, labelled by
# article name and compared with cosine distance.
knn_model_wc = gl.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine',
)

## Which article, other than itself, is most similar to the one on ‘Elton John’ using word count features?

In [20]:
# Five nearest articles to Elton John under the word-count model
# (rank 1 is the article itself).
knn_model_wc.query(elton_john, k=5)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [21]:
# Five nearest articles to Elton John under the TF-IDF model
# (rank 1 is the article itself).
knn_model.query(elton_john, k=5)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [22]:
# Five nearest articles to Victoria Beckham under the word-count model
# (rank 1 is the article itself).
knn_model_wc.query(victoria_beckham, k=5)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


In [23]:
# Five nearest articles to Victoria Beckham under the TF-IDF model
# (rank 1 is the article itself).
knn_model.query(victoria_beckham, k=5)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
