# Document retrieval from Wikipedia data

In [1]:
import turicreate as tc

## Load some text data from Wikipedia

In [3]:
# Load the Wikipedia people dataset from the data directory into an SFrame.
people = tc.SFrame('../data/people_wiki.sframe')

In [5]:
# Number of rows (one per article) in the dataset.
len(people)

59071

In [8]:
# The `text` column has been reduced to plain words, which keeps the
# text analytics below simple. Show the text of the first article.
people['text'][0]

'digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his early senior football for west perth his 44game senior career for the falcons spanned 19982000 and he was the clubs leading goalkicker in 2000 at the age of 21 morrell was recruited to the australian football league by the kangaroos football club with its third round selection in the 2001 afl rookie draft as a forward he twice kicked five goals during his time with the kangaroos the first was in a losing cause against sydney in 2002 and the other the following season in a drawn game against brisbaneafter the 2003 season morrell was traded along with david teague to the carlton football club in exchange for corey mckernan he played 32 games for the blues before being delisted at the end of 2005 he continued to play victorian football league vfl football with the northern bullants carltons vfla

## Explore data

In [6]:
# Keep only the row(s) whose `name` is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [7]:
# Display the Obama selection (URI, name and text columns).
obama

URI,name,text
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...


In [None]:
# Show the `text` column of the Obama selection.
obama['text']

# Word counts for Obama article

In [10]:
# Build a bag-of-words for the Obama article and store it in a new
# `word_count` column. `count_words` works on unigrams (single words);
# `count_ngrams` is the related function for bigrams and longer n-grams.
obama['word_count'] = tc.text_analytics.count_words(obama['text'])

In [11]:
# Inspect the word -> count dictionary computed for the Obama article.
obama['word_count']

dtype: dict
Rows: 1
[{'normalize': 1.0, 'sought': 1.0, 'combat': 1.0, 'continued': 1.0, 'unconstitutional': 1.0, '8': 1.0, 'californias': 1.0, '1996': 1.0, 'marriage': 1.0, 'defense': 1.0, 'down': 1.0, 'proposition': 1.0, 'court': 1.0, 'supreme': 1.0, 'urged': 1.0, 'which': 1.0, 'briefs': 1.0, 'administration': 1.0, 'while': 1.0, 'americans': 1.0, 'called': 1.0, 'cuba': 1.0, 'gun': 1.0, 'related': 1.0, 'policies': 1.0, 'promoted': 1.0, '2013': 1.0, 'second': 2.0, 'romney': 1.0, 'filed': 1.0, '2012': 1.0, 'reelected': 1.0, 'taxpayer': 1.0, 'budget': 1.0, 'nations': 1.0, 'raise': 1.0, 'spending': 1.0, 'over': 1.0, 'lengthy': 1.0, 'gains': 1.0, 'seats': 1.0, '63': 1.0, 'total': 1.0, 'lost': 1.0, 'regained': 1.0, 'whether': 1.0, 'close': 1.0, 'patient': 1.0, 'by': 1.0, 'sandy': 1.0, 'after': 4.0, 'presidential': 2.0, 'november': 2.0, 'obama': 9.0, 'election': 3.0, 'august': 1.0, 'mccain': 1.0, 'primary': 2.0, 'he': 7.0, 'united': 3.0, 'with': 3.0, 'current': 1.0, 'campaign': 3.0, 'degree':

In [None]:
# Plain-text rendering of the same word-count column.
print(obama['word_count'])

## Find most common words in Obama article

In [19]:
# `stack` reshapes an SFrame: every entry of the `word_count` dictionary
# becomes its own row, with the key in `word` and the value in `count`.
# Double brackets select an SFrame (which has `stack`), not an SArray.
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)
obama_word_count_table

word,count
normalize,1.0
sought,1.0
combat,1.0
continued,1.0
unconstitutional,1.0
8,1.0
californias,1.0
1996,1.0
marriage,1.0
defense,1.0


In [20]:
# Most frequent words first.
obama_word_count_table.sort('count', ascending=False)

word,count
the,40.0
in,30.0
and,21.0
of,18.0
to,14.0
his,11.0
obama,9.0
act,8.0
a,7.0
he,7.0


## Compute TF-IDF for the entire corpus of articles
To compute _tf-idf_ we need to take into account the entire corpus of articles.

In [23]:
# Bag-of-words counts for every article in the corpus, stored in a new
# `word_count` column.
people['word_count'] = tc.text_analytics.count_words(people['text'])

In [None]:
# Display the SFrame, which now includes the `word_count` column.
people

In [30]:
# `tc.text_analytics.tf_idf` is the built-in tf-idf routine: it scores every
# word of every article, and the result is stored as one dictionary per row
# in a new `tfidf` column.
people['tfidf'] = tc.text_analytics.tf_idf(people['text'])

In [31]:
# Show only the `tfidf` column: one dictionary of word -> tf-idf score per person.
people[['tfidf']]

tfidf
"{'melbourne': 3.8914310119380633, ..."
"{'time': 1.3253342074200498, ..."
"{'society': 2.4448047262085693, ..."
"{'kurdlawitzpreis': 10.986495389225194, ..."
"{'curtis': 5.299520032885375, ..."
"{'asses': 9.600201028105303, 's ..."
"{'streamz': 10.986495389225194, ..."
"{'concordia': 6.250296940830698, ..."
"{'heavies': 8.907053847545358, 'n ..."
"{'2002': 1.8753125887822302, ..."


## Examine the TF-IDF for the Obama article

In [32]:
# Re-select the Obama row from `people`, which now has the `tfidf` column.
obama = people[people['name'] == 'Barack Obama']

# One row per word, highest tf-idf score first.
(
    obama[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

word,tfidf
obama,43.2956530720749
act,27.67822262297991
iraq,17.747378587965535
control,14.887060845181308
law,14.722935761763422
ordered,14.533373950913514
military,13.115932778499417
involvement,12.784385241175055
response,12.784385241175055
democratic,12.410688697332166


## Examine the TF-IDF for Clooney

In [22]:
# Select the George Clooney row.
clooney = people[people['name'] == 'George Clooney']

# One row per word, highest tf-idf score first.
(
    clooney[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

word,tfidf
clooney,30.47679823695488
thriller,19.64459743254604
drama,13.544372218899175
comedydrama,12.973371437789858
er,12.782751078181208
actor,11.832160900443771
categoriesclooney,10.986495389225194
producingclooney,10.986495389225194
heslov,10.986495389225194
comedy,10.481205264908446


### Example: Cosine distance between two people documents
Just for effect, let us use Bill Clinton and David Beckham.

In [33]:
# Pull the rows for the two people Obama will be compared against.
clinton = people[people['name'] == 'Bill Clinton']
beckham = people[people['name'] == 'David Beckham']

#### Is Obama closer to Clinton or to Beckham?

In [37]:
# Turi Create exposes its distance functions in the `tc.distances` module
# (euclidean, cosine, manhattan and others).
# Indexing the one-row `tfidf` column with [0] yields the dictionary itself.
tc.distances.cosine(obama['tfidf'][0],clinton['tfidf'][0])
# Cosine distance is 1 minus the cosine similarity, i.e. 1 minus the dot
# product of the two length-normalized vectors.
# The smaller the value, the "closer" the two documents.

0.8339854936884277

In [36]:
# Same cosine distance, this time between the Obama and Beckham tf-idf
# dictionaries; compare it with the Clinton value to answer the question.
tc.distances.cosine(obama['tfidf'][0],beckham['tfidf'][0])

0.9791305844747478

# Nearest neighbors for retrieval of Wikipedia articles

## Build the NN model

In [38]:
# Build a nearest-neighbour model over every article, using the tf-idf
# dictionaries as features and the person's name as the result label.
# NOTE(review): no `distance` argument is passed here, unlike the later
# `bow_model`/`tfidf_model` cells that request 'cosine'. The distances this
# model reports also differ from the cosine values computed above (e.g. Bill
# Clinton 0.8139 vs 0.8340) — confirm the default metric is the intended one.
knn_model = tc.nearest_neighbors.create(people,features=['tfidf'],label='name')

## Use model for retrieval... 
 for example, who is closest to Obama?

In [39]:
# Retrieve the articles nearest to the Obama row.
# NOTE(review): the query SFrame presumably has to carry the same `tfidf`
# feature column the model was built on — confirm.
knn_model.query(obama)

query_label,reference_label,distance,rank
0,Barack Obama,0.0,1
0,Joe Biden,0.7941176470588236,2
0,Joe Lieberman,0.7946859903381642,3
0,Kelly Ayotte,0.8119891008174387,4
0,Bill Clinton,0.8138528138528138,5


## Other examples of retrieval

In [40]:
# Select the Taylor Swift row to use as a query.
swift = people[people['name'] == 'Taylor Swift']

In [41]:
# Nearest articles to Taylor Swift; rank 1 is the query article itself.
knn_model.query(swift)

query_label,reference_label,distance,rank
0,Taylor Swift,0.0,1
0,Carrie Underwood,0.7623188405797101,2
0,Alicia Keys,0.7647058823529411,3
0,Jordin Sparks,0.7696335078534031,4
0,Leona Lewis,0.7761194029850746,5


In [42]:
# Select the Angelina Jolie row to use as a query.
jolie = people[people['name'] == 'Angelina Jolie']

In [43]:
# Nearest articles to Angelina Jolie; rank 1 is the query article itself.
knn_model.query(jolie)

query_label,reference_label,distance,rank
0,Angelina Jolie,0.0,1
0,Brad Pitt,0.7840236686390533,2
0,Julianne Moore,0.7958579881656804,3
0,Billy Bob Thornton,0.80306905370844,4
0,George Clooney,0.8046875,5


In [44]:
# Select the Arnold Schwarzenegger row to use as a query.
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [45]:
# Nearest articles to Arnold Schwarzenegger; rank 1 is the query article itself.
knn_model.query(arnold)

query_label,reference_label,distance,rank
0,Arnold Schwarzenegger,0.0,1
0,Jesse Ventura,0.8189189189189189,2
0,John Kitzhaber,0.8246153846153846,3
0,Lincoln Chafee,0.8338762214983714,4
0,Anthony Foxx,0.8339100346020761,5


## Clustering Assignment

1. Take `Elton John` and compare the top 3 words according to regular word count and according to tf-idf. 

In [46]:
# Select the Elton John row; it carries the `word_count` and `tfidf`
# columns that were added to `people`.
elton = people[people['name'] == 'Elton John']

In [47]:
# Display the Elton John selection.
elton

URI,name,text,word_count,tfidf
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...,"{'movements': 1.0, 'social': 1.0, ...","{'movements': 5.030658019760364, ..."


In [53]:
# Unpack Elton John's word-count dictionary into (word, count) rows,
# then list the most frequent words first.
elton_word_counts = elton[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)
elton_word_counts.sort('count', ascending=False)

word,count
the,27.0
in,18.0
and,15.0
of,13.0
a,10.0
has,9.0
he,7.0
john,7.0
on,6.0
award,5.0


In [52]:
# Unpack Elton John's tf-idf dictionary into (word, tf-idf) rows,
# then list the highest-scoring words first.
elton_tfidf = elton[['tfidf']].stack(
    'tfidf',
    new_column_name=['word', 'tf-idf'],
)
elton_tfidf.sort('tf-idf', ascending=False)

word,tf-idf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203
john,13.93931279239831
songwriters,11.25040644703154
overallelton,10.986495389225194
tonightcandle,10.986495389225194
fivedecade,10.293348208665249
19702000,10.293348208665249
aids,10.262846934045534


The top 3 words according to **word count** (bag of words):
1. the  
2. in  
3. and  

The top 3 words according to **tf-idf**:
1. furnish  
2. elton  
3. billboard

2. Compute the cosine distance between Elton John and Victoria Beckham. What about Elton John and Paul McCartney?

In [60]:
# Rows for the two people Elton John is compared with.
victoria = people[people['name'] == 'Victoria Beckham']
paul = people[people['name'] == 'Paul McCartney']
# Each selection is an SFrame, so the cells below index the `tfidf` column
# with [0] to retrieve the dictionary itself.

In [62]:
# Cosine distance between Elton John's tf-idf dictionary and each of the
# two other articles; the smaller value marks the closer article.
print(
    "Elton and Victoria",
    tc.distances.cosine(elton['tfidf'][0], victoria['tfidf'][0]),
)
print(
    "Elton and Paul",
    tc.distances.cosine(elton['tfidf'][0], paul['tfidf'][0]),
)

Elton and Victoria 0.9567006376655429
Elton and Paul 0.8250310029221779


In [64]:
# Nearest-neighbour model on the raw bag-of-words counts, compared by
# cosine distance.
bow_model = tc.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine',
)

In [69]:
# Nearest-neighbour model on the tf-idf scores, compared by cosine distance.
tfidf_model = tc.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

- What’s the most similar article, other than itself, to the one on ‘Elton John’ using word count features?
- What’s the most similar article, other than itself, to the one on ‘Elton John’ using TF-IDF features?
- What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using word count features?
- What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using TF-IDF features?

In [66]:
# Word-count features: rank 1 is Elton John himself, so the most similar
# other article is the rank-2 entry, Cliff Richard.
bow_model.query(elton)

query_label,reference_label,distance,rank
0,Elton John,2.220446049250313e-16,1
0,Cliff Richard,0.1614241525896703,2
0,Sandro Petrone,0.1682254275104111,3
0,Rod Stewart,0.168327165587061,4
0,Malachi O'Doherty,0.177315545978884,5


In [70]:
# TF-IDF features: the most similar other article (rank 2) is Rod Stewart.
# The rank-1 self-distance of about -2e-16 is presumably floating-point
# rounding around zero.
tfidf_model.query(elton)

query_label,reference_label,distance,rank
0,Elton John,-2.220446049250313e-16,1
0,Rod Stewart,0.7172196678927374,2
0,George Michael,0.7476009989692848,3
0,Sting (musician),0.7476719544306141,4
0,Phil Collins,0.7511932487904706,5


In [71]:
# Word-count features: the most similar other article (rank 2) is
# Mary Fitzgerald (artist).
bow_model.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.220446049250313e-16,1
0,Mary Fitzgerald (artist),0.2073070361150499,2
0,Adrienne Corri,0.2145097827875479,3
0,Beverly Jane Fry,0.2174664687407927,4
0,Raman Mundair,0.2176954749915048,5


In [72]:
# TF-IDF features: the most similar other article (rank 2) is David Beckham.
tfidf_model.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.1102230246251563e-16,1
0,David Beckham,0.5481696102632145,2
0,Stephen Dow Beckham,0.7849867068283364,3
0,Mel B,0.8095855234085036,4
0,Caroline Rush,0.81982642291868,5
