In [1]:
# import turicreate
import turicreate as tc

In [2]:
# Load the Wikipedia people dataset from disk and preview it
people_path = 'people_wiki.sframe'
people = tc.SFrame(people_path)
people

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


In [3]:
# get word count

In [4]:
# Build a per-article dictionary of word -> count from the raw text
article_word_counts = tc.text_analytics.count_words(people['text'])
people['word_count'] = article_word_counts

In [5]:
# data exploration: isolate the row whose name is 'Barack Obama'
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
obama

URI,name,text,word_count
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...,"{'normalize': 1.0, 'sought': 1.0, 'combat': ..."


In [6]:
# view term frequency: one row per (word, count), most frequent first
word_count_table = (
    obama['word_count']
    .stack(new_column_name=['word', 'count'])
    .sort('count', ascending=False)
)
word_count_table

word,count
the,40.0
in,30.0
and,21.0
of,18.0
to,14.0
his,11.0
obama,9.0
act,8.0
a,7.0
he,7.0


In [7]:
# compute tf-idf for the whole corpus from the per-article word counts
corpus_tfidf = tc.text_analytics.tf_idf(people['word_count'])
people['tfidf'] = corpus_tfidf
# show only the two columns of interest
people[['name', 'tfidf']]

name,tfidf
Digby Morrell,"{'melbourne': 3.8914310119380633, ..."
Alfred J. Lewy,"{'time': 1.3253342074200498, ..."
Harpdog Brown,"{'society': 2.4448047262085693, ..."
Franz Rottensteiner,"{'kurdlawitzpreis': 10.986495389225194, ..."
G-Enka,"{'curtis': 5.299520032885375, ..."
Sam Henderson,"{'asses': 9.600201028105303, 's ..."
Aaron LaCrate,"{'streamz': 10.986495389225194, ..."
Trevor Ferguson,"{'concordia': 6.250296940830698, ..."
Grant Nelson,"{'heavies': 8.907053847545358, 'n ..."
Cathy Caruth,"{'2002': 1.8753125887822302, ..."


In [8]:
# examine the tf-idf for the obama article
# (re-select the row: the earlier `obama` was taken before 'tfidf' was added)
is_obama = people['name'] == 'Barack Obama'
obama = people[is_obama]
obama

URI,name,text,word_count
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...,"{'normalize': 1.0, 'sought': 1.0, 'combat': ..."

tfidf
"{'normalize': 10.293348208665249, ..."


In [9]:
# one row per (word, tfidf) for the Obama article, highest score first
obama_tfidf = (
    obama['tfidf']
    .stack(new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)
obama_tfidf

word,tfidf
obama,43.2956530720749
act,27.67822262297991
iraq,17.747378587965535
control,14.887060845181308
law,14.722935761763422
ordered,14.533373950913514
military,13.115932778499417
involvement,12.784385241175055
response,12.784385241175055
democratic,12.410688697332166


In [10]:
# Manually compute distances between a few people:
# first pull out the rows to compare against
is_clinton = people['name'] == 'Bill Clinton'
is_beckham = people['name'] == 'David Beckham'
clinton = people[is_clinton]
beckham = people[is_beckham]

In [12]:
# one row per (word, tfidf) for the Clinton article, highest score first
# NOTE(review): the saved output under this cell shows the raw `clinton` row,
# not this table -- re-run the cell to refresh it.
clinton_tfidf = (
    clinton['tfidf']
    .stack(new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)
clinton_tfidf

URI,name,text,word_count
<http://dbpedia.org/resou rce/Bill_Clinton> ...,Bill Clinton,william jefferson bill clinton born william ...,"{'presidents': 1.0, 'polls': 1.0, 'opinion': ..."

tfidf
"{'presidents': 4.520350664987575, ..."


In [14]:
# one row per (word, tfidf) for the Beckham article, highest score first
beckham_tfidf = (
    beckham['tfidf']
    .stack(new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)
beckham_tfidf

word,tfidf
beckham,56.922355026538426
galaxy,17.96784934638324
fifa,14.135200103949764
careerbeckhams,10.986495389225194
bkm,10.986495389225194
2004renowned,10.986495389225194
player,10.672013128740325
league,10.428368921549833
uefa,9.680332263112591
freekicks,9.600201028105303


In [19]:
# is Obama closer to Clinton than to Beckham?
# Each selection holds a single row, so [0] picks that article's tf-idf dict.
obama_vec = obama['tfidf'][0]
clinton_vec = clinton['tfidf'][0]
beckham_vec = beckham['tfidf'][0]
print("obama - clinton", tc.distances.cosine(obama_vec, clinton_vec))
print("obama - beckham", tc.distances.cosine(obama_vec, beckham_vec))

obama - clinton 0.8339854936884277
obama - beckham 0.9791305844747478


In [20]:
# build a nearest neighbor model for document retrieval
# No `distance` is passed, so turicreate chooses the metric itself.
# NOTE(review): the saved query distances differ from the manual cosine
# values (e.g. Obama-Clinton 0.834 by cosine vs 0.814 from this model), so
# the default is presumably not cosine -- confirm against the turicreate docs.
knn_model = tc.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
)


In [21]:
# applying the nearest-neighbours model for retrieval:
# which articles are closest to Obama's?
obama_neighbors = knn_model.query(obama)
obama_neighbors

query_label,reference_label,distance,rank
0,Barack Obama,0.0,1
0,Joe Biden,0.7941176470588236,2
0,Joe Lieberman,0.7946859903381642,3
0,Kelly Ayotte,0.8119891008174387,4
0,Bill Clinton,0.8138528138528138,5


In [22]:
# other examples of document retrieval: Taylor Swift
is_swift = people['name'] == 'Taylor Swift'
swift = people[is_swift]
swift_neighbors = knn_model.query(swift)
swift_neighbors

query_label,reference_label,distance,rank
0,Taylor Swift,0.0,1
0,Carrie Underwood,0.7623188405797101,2
0,Alicia Keys,0.7647058823529411,3
0,Jordin Sparks,0.7696335078534031,4
0,Leona Lewis,0.7761194029850746,5


In [25]:
# document retrieval: Angelina Jolie
is_jolie = people['name'] == 'Angelina Jolie'
jolie = people[is_jolie]
jolie_neighbors = knn_model.query(jolie)
jolie_neighbors

query_label,reference_label,distance,rank
0,Angelina Jolie,0.0,1
0,Brad Pitt,0.7840236686390533,2
0,Julianne Moore,0.7958579881656804,3
0,Billy Bob Thornton,0.80306905370844,4
0,George Clooney,0.8046875,5


In [26]:
# document retrieval: Arnold Schwarzenegger
is_arnold = people['name'] == 'Arnold Schwarzenegger'
arnold = people[is_arnold]
arnold_neighbors = knn_model.query(arnold)
arnold_neighbors

query_label,reference_label,distance,rank
0,Arnold Schwarzenegger,0.0,1
0,Jesse Ventura,0.8189189189189189,2
0,John Kitzhaber,0.8246153846153846,3
0,Lincoln Chafee,0.8338762214983714,4
0,Anthony Foxx,0.8339100346020761,5


# QUIZ

### Q1
Compare top words according to word counts to TF-IDF: In the notebook we covered in the module, we explored two document representations: word counts and TF-IDF. Now, take a particular famous person, 'Elton John'. What are the 3 words in his article with the highest word counts? What are the 3 words in his article with the highest TF-IDF? These results illustrate why TF-IDF is useful for finding important words. Save these results to answer the quiz at the end.

In [32]:
# Select the Elton John article and list its words by tf-idf, highest first.
# The display expression is what produces the word/tfidf table saved under
# this cell; the bare assignment on its own shows no output.
elton = people[people['name'] == 'Elton John']
elton['tfidf'].stack(new_column_name=['word', 'tfidf']).sort('tfidf', ascending=False)


word,tfidf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203
john,13.93931279239831
songwriters,11.25040644703154
overallelton,10.986495389225194
tonightcandle,10.986495389225194
fivedecade,10.293348208665249
19702000,10.293348208665249
aids,10.262846934045534


In [34]:
# top 3 words in the Elton John article by raw count
elton_word_counts = elton['word_count'].stack(new_column_name=['word', 'count'])
elton_word_counts_sorted = elton_word_counts.sort('count', ascending=False)
elton_word_counts_sorted[:3]

word,count
the,27.0
in,18.0
and,15.0


In [33]:
# top 3 words in the Elton John article by tf-idf
elton_tfidf = elton['tfidf'].stack(new_column_name=['word', 'tfidf'])
elton_tfidf_sorted = elton_tfidf.sort('tfidf', ascending=False)
elton_tfidf_sorted[:3]

word,tfidf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203


### Q2
Measuring distance: Elton John is a famous singer; let’s compute the distance between his article and those of two other famous singers. In this assignment, you will use the cosine distance, which is one measure of similarity between vectors, similar to the one discussed in the lectures. You can compute this distance using the turicreate.distances.cosine function. What’s the cosine distance between the articles on ‘Elton John’ and ‘Victoria Beckham’? What’s the cosine distance between the articles on ‘Elton John’ and ‘Paul McCartney’? Which one of the two is closest to Elton John? Does this result make sense to you? Save these results to answer the quiz at the end.

In [36]:
# rows for the two singers Elton John is compared against
is_victoria = people['name'] == 'Victoria Beckham'
is_paul = people['name'] == 'Paul McCartney'
victoria = people[is_victoria]
paul = people[is_paul]

In [40]:
# cosine distance from Elton John to Victoria Beckham, then to Paul McCartney
elton_vec = elton['tfidf'][0]
elton_to_victoria = tc.distances.cosine(elton_vec, victoria['tfidf'][0])
elton_to_paul = tc.distances.cosine(elton_vec, paul['tfidf'][0])
print(elton_to_victoria)
print(elton_to_paul)

0.9567006376655429
0.8250310029221779


### Q3
Building nearest neighbors models with different input features and setting the distance metric:  In the sample notebook, we built a nearest neighbors model for retrieving articles using TF-IDF as features and using the default setting in the construction of the nearest neighbors model.  Now, you will build two nearest neighbors models:

- Using word counts as features

- Using TF-IDF as features

In [41]:
# Two retrieval models that differ only in the feature column they use;
# both label rows by 'name' and use cosine distance.
shared_options = dict(label='name', distance='cosine')
wc_model = tc.nearest_neighbors.create(people, features=['word_count'], **shared_options)
tfidf_model = tc.nearest_neighbors.create(people, features=['tfidf'], **shared_options)

In [43]:
# nearest neighbours of Elton John using word-count features
elton_wc_neighbors = wc_model.query(elton)
elton_wc_neighbors

query_label,reference_label,distance,rank
0,Elton John,2.220446049250313e-16,1
0,Cliff Richard,0.1614241525896703,2
0,Sandro Petrone,0.1682254275104111,3
0,Rod Stewart,0.168327165587061,4
0,Malachi O'Doherty,0.177315545978884,5


In [44]:
# nearest neighbours of Elton John using TF-IDF features
elton_tfidf_neighbors = tfidf_model.query(elton)
elton_tfidf_neighbors

query_label,reference_label,distance,rank
0,Elton John,-2.220446049250313e-16,1
0,Rod Stewart,0.7172196678927374,2
0,George Michael,0.7476009989692848,3
0,Sting (musician),0.7476719544306141,4
0,Phil Collins,0.7511932487904706,5


In [45]:
# nearest neighbours of Victoria Beckham using word-count features
victoria_wc_neighbors = wc_model.query(victoria)
victoria_wc_neighbors

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.220446049250313e-16,1
0,Mary Fitzgerald (artist),0.2073070361150499,2
0,Adrienne Corri,0.2145097827875479,3
0,Beverly Jane Fry,0.2174664687407927,4
0,Raman Mundair,0.2176954749915048,5


In [46]:
# nearest neighbours of Victoria Beckham using TF-IDF features
victoria_tfidf_neighbors = tfidf_model.query(victoria)
victoria_tfidf_neighbors

query_label,reference_label,distance,rank
0,Victoria Beckham,1.1102230246251563e-16,1
0,David Beckham,0.5481696102632145,2
0,Stephen Dow Beckham,0.7849867068283364,3
0,Mel B,0.8095855234085036,4
0,Caroline Rush,0.81982642291868,5
