# Document retrieval from Wikipedia data

## Fire up GraphLab Create

In [1]:
import graphlab

# Load some text data: Wikipedia pages about people

In [None]:
# Load the dataset from the relative path 'people_wiki.gl/' into an SFrame
# bound to `people`; later cells in this section read and extend this object.
people = graphlab.SFrame('people_wiki.gl/')

The data contains, for each person: a link to the Wikipedia article, the person's name, and the text of the article.

In [None]:
# Preview the first rows of the dataset (displayed as the cell output).
people.head()

In [None]:
# Number of rows in the dataset.
len(people)

# Explore the dataset and check out the text it contains

## Exploring the entry for president Obama

In [None]:
# Boolean-mask filter: keep only the row(s) whose 'name' equals 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [None]:
# Display the selected row(s) with all columns.
obama

In [None]:
# Display only the article text column of the selection.
obama['text']

## Exploring the entry for actor George Clooney

In [None]:
# Select the row(s) whose 'name' equals 'George Clooney' ...
clooney = people[people['name'] == 'George Clooney']
# ... and display the article text of that selection.
clooney['text']

# Get the word counts for Obama article

In [None]:
# Count the words in the selected article text and store the result as a new
# 'word_count' column on the filtered SFrame `obama` (`people` is not modified here).
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])

In [None]:
# Print the 'word_count' column computed for the Obama selection.
# The parenthesised single-argument form prints the same text under Python 2
# and is also valid Python 3, whereas the bare `print x` statement is a
# SyntaxError on Python 3.
print(obama['word_count'])

## Sort the word counts for the Obama article

### Turning dictionary of word counts into a table

In [None]:
# Keep only the 'word_count' column, then stack it into a long table whose
# two new columns are named 'word' and 'count'.
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count']
)

### Sorting the word counts to show most common words at the top

In [None]:
# Preview the first rows of the stacked word/count table (not yet sorted).
obama_word_count_table.head()

In [None]:
# Show the table ordered by 'count', largest first. The sorted result is only
# displayed; it is not assigned back to `obama_word_count_table`.
obama_word_count_table.sort('count',ascending=False)

Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weigh them by their TF-IDF scores.

In [None]:
# Compute word counts for every article and attach them to `people` as a new
# 'word_count' column (this mutates `people` in place).
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
# Preview the dataset with the new column.
people.head()

In [None]:
# Compute TF-IDF scores from the per-article word counts.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])


def _version_tuple(version):
    """Convert a dotted version string such as '1.10.0' into the tuple (1, 10, 0).

    Every run of digits becomes one integer component, so the resulting tuples
    compare numerically component by component.
    """
    import re  # function-scope import keeps this cell self-contained
    return tuple(int(number) for number in re.findall(r'\d+', str(version)))


# Earlier versions of GraphLab Create returned an SFrame rather than a single SArray
# This notebook was created using Graphlab Create version 1.7.1
# Compare the versions numerically: as plain strings, '1.10.0' <= '1.6.1' is
# True, which would wrongly send a newer release down the legacy branch.
# NOTE(review): assumes graphlab.version is a dotted version string - confirm.
if _version_tuple(graphlab.version) <= _version_tuple('1.6.1'):
    tfidf = tfidf['docs']

# Display the resulting TF-IDF values.
tfidf

In [None]:
# Attach the TF-IDF values computed in the previous cell to `people` as a new
# 'tfidf' column (mutates `people` in place).
people['tfidf'] = tfidf

## Examine the TF-IDF for the Obama article

In [None]:
# Re-select the Obama row(s): `people` has gained the 'tfidf' column since
# `obama` was first created, so the earlier selection does not carry it.
obama = people[people['name'] == 'Barack Obama']

In [None]:
# Stack the 'tfidf' column into a table with 'word' and 'tfidf' columns,
# then show it ordered by 'tfidf', largest first.
(
    obama[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

Words with highest TF-IDF are much more informative.

# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [None]:
# Select the row(s) whose 'name' equals 'Bill Clinton'.
clinton = people[people['name'] == 'Bill Clinton']

In [None]:
# Select the row(s) whose 'name' equals 'David Beckham'.
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by

(1-cosine_similarity) 

and find that the article about president Obama is closer to the one about former president Clinton than to the one about footballer David Beckham.

In [None]:
# Cosine distance between the TF-IDF entries of the first row of each
# selection (Obama vs. Clinton).
graphlab.distances.cosine(
    obama['tfidf'][0],
    clinton['tfidf'][0]
)

In [None]:
# Cosine distance between the TF-IDF entries of the first row of each
# selection (Obama vs. Beckham).
graphlab.distances.cosine(
    obama['tfidf'][0],
    beckham['tfidf'][0]
)

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [None]:
# Build a nearest-neighbors model over all articles, using the 'tfidf' column
# as the only feature and the 'name' column as the label of each row.
# NOTE(review): no `distance=` is passed here (the assignment models below pass
# distance='cosine'), so the library default applies - confirm that is intended.
knn_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name'
)

# ---- Assignment START ----

In [1]:
import graphlab

In [2]:
# Load the dataset from the relative path "people_wiki.gl" and (re)bind `people`.
# This replaces any previously bound `people`, so columns added to it earlier
# ('word_count', 'tfidf') are recomputed in the next cell.
people = graphlab.SFrame("people_wiki.gl")

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\xingm\AppData\Local\Temp\graphlab_server_1473686743.log.0


This non-commercial license of GraphLab Create for academic use is assigned to guomaoxin@foxmail.com and will expire on September 06, 2017.


In [3]:
# Add per-article word counts as a 'word_count' column.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
# Add TF-IDF scores derived from those counts as a 'tfidf' column.
# NOTE(review): unlike the earlier TF-IDF cell there is no GraphLab version
# guard here; the result is assigned directly as a column.
people['tfidf'] = graphlab.text_analytics.tf_idf(people['word_count'])

## Quiz 1

In [4]:
# Select the row(s) whose 'name' equals 'Elton John'.
elton_john = people[people['name'] == 'Elton John']

In [5]:
# Print the plain-text rendering of the Elton John selection.
# The parenthesised single-argument form prints the same text under Python 2
# and is also valid Python 3, whereas the bare `print x` statement is a
# SyntaxError on Python 3.
print(elton_john)

+-------------------------------+------------+-------------------------------+
|              URI              |    name    |              text             |
+-------------------------------+------------+-------------------------------+
| <http://dbpedia.org/resour... | Elton John | sir elton hercules john cb... |
+-------------------------------+------------+-------------------------------+
+-------------------------------+-------------------------------+
|           word_count          |             tfidf             |
+-------------------------------+-------------------------------+
| {'all': 1L, 'six': 1L, 'pr... | {'all': 1.6431112434912472... |
+-------------------------------+-------------------------------+
[? rows x 5 columns]
Note: Only the head of the SFrame is printed. This SFrame is lazily evaluated.
You can use sf.materialize() to force materialization.


In [6]:
# Stack the 'word_count' column into a table with 'word' and 'count' columns,
# then show it ordered by 'count', largest first.
(
    elton_john[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False)
)

word,count
the,27
in,18
and,15
of,13
a,10
has,9
he,7
john,7
on,6
since,5


In [7]:
# Stack the 'tfidf' column into a table with 'word' and 'w' (weight) columns,
# then show it ordered by 'w', largest first.
(
    elton_john[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'w'])
    .sort('w', ascending=False)
)

word,w
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
overallelton,10.9864953892
tonightcandle,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


## Quiz 2

In [8]:
# Select the row(s) whose 'name' equals 'Victoria Beckham'.
victoria_beckham = people[people['name'] == 'Victoria Beckham']

In [9]:
# Cosine distance between the TF-IDF entries of the first row of each
# selection (Elton John vs. Victoria Beckham).
graphlab.distances.cosine(
    elton_john['tfidf'][0],
    victoria_beckham['tfidf'][0]
)

0.9567006376655429

In [10]:
# Select the row(s) whose 'name' equals 'Paul McCartney'.
paul_mccartney = people[people['name'] == 'Paul McCartney']

In [11]:
# Cosine distance between the TF-IDF entries of the first row of each
# selection (Elton John vs. Paul McCartney).
graphlab.distances.cosine(
    elton_john['tfidf'][0],
    paul_mccartney['tfidf'][0]
)

0.8250310029221779

## Quiz 3

In [12]:
# Nearest-neighbors model over raw word counts: 'word_count' is the only
# feature, rows are labelled by 'name', and cosine distance is requested.
counts_model = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine'
)

In [13]:
# Nearest-neighbors model over TF-IDF scores: 'tfidf' is the only feature,
# rows are labelled by 'name', and cosine distance is requested.
tfidf_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine'
)

In [14]:
# Query the word-count model with the Elton John selection; the displayed
# result lists reference articles with their distance and rank.
counts_model.query(elton_john)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [15]:
# Query the TF-IDF model with the Elton John selection; the displayed result
# lists reference articles with their distance and rank.
tfidf_model.query(elton_john)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [16]:
# Query the word-count model with the Victoria Beckham selection.
counts_model.query(victoria_beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


In [17]:
# Query the TF-IDF model with the Victoria Beckham selection.
tfidf_model.query(victoria_beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5


# ---- Assignment END ----

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [None]:
# Retrieve the articles nearest to the Obama selection using `knn_model`
# (the TF-IDF nearest-neighbors model built before the assignment section).
knn_model.query(obama)

As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  

## Other examples of document retrieval

In [None]:
# Select the row(s) whose 'name' equals 'Taylor Swift'.
swift = people[people['name'] == 'Taylor Swift']

In [None]:
# Retrieve the articles nearest to the Taylor Swift selection.
knn_model.query(swift)

In [None]:
# Select the row(s) whose 'name' equals 'Angelina Jolie'.
jolie = people[people['name'] == 'Angelina Jolie']

In [None]:
# Retrieve the articles nearest to the Angelina Jolie selection.
knn_model.query(jolie)

In [None]:
# Select the row(s) whose 'name' equals 'Arnold Schwarzenegger'.
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [None]:
# Retrieve the articles nearest to the Arnold Schwarzenegger selection.
knn_model.query(arnold)