# Retrieving Wikipedia articles

## Fire up GraphLab Create

In [3]:
import graphlab

## Load text data from wikipedia

In [4]:
people = graphlab.SFrame('people_wiki.gl/')

2016-03-29 09:03:20,284 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1459242199.log


This non-commercial license of GraphLab Create is assigned to gdwangh@189.cn and will expire on February 23, 2017. For commercial licensing options, visit https://dato.com/buy/.


In [5]:
people['word_count'] = graphlab.text_analytics.count_words(people['text'])
people['tfidf'] = graphlab.text_analytics.count_words(people['word_count'])

## 1. Compare top words according to word counts to TF-IDF 

take a particular famous person, 'Elton John'. 

In [6]:
Elton = people[people['name'] == 'Elton John']

In [7]:
Elton

URI,name,text,word_count
<http://dbpedia.org/resou rce/Elton_John> ...,Elton John,sir elton hercules john cbe born reginald ken ...,"{'all': 1, 'six': 1, 'producer': 1, 'heavi ..."

tfidf
"{'all': 1, 'least': 1, 'producer': 1, 'heavi ..."


In [8]:
Elton[['word_count']].stack('word_count', new_column_name = ['word', 'count']).sort('count', ascending=False)

word,count
the,27
in,18
and,15
of,13
a,10
has,9
he,7
john,7
on,6
since,5


In [9]:
Elton[['tfidf']].stack('tfidf', new_column_name = ['word', 'tfidf']).sort('tfidf', ascending=False)

word,tfidf
hercules,1
after,1
single,1
samesex,1
solo,1
overallelton,1
him,1
he,1
march,1
look,1


## 2.Measuring distance

calculate the cosine distance between the articles on ‘Elton John’ and ‘Victoria Beckham' & the cosine distance between the articles on ‘Elton John’ and Paul McCartney

In [10]:
Beckham = people[people['name']=='Victoria Beckham']

graphlab.distances.cosine(Elton['tfidf'][0],Beckham['tfidf'][0])

0.7443742231011785

In [11]:
PaulMcCatney = people[people['name'] == 'Paul McCartney']

graphlab.distances.cosine(Elton['tfidf'][0],PaulMcCatney['tfidf'][0])

0.6703544592711868

## 3.Building nearest neighbors models with different input features and setting the distance metric


build two nearest neighbors models:

* Using word counts as features
* Using TF-IDF as features

In [12]:
word_count_model = graphlab.nearest_neighbors.create(people,features=['word_count'],label='name',distance='cosine')

In [None]:
tfidf_model = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name',distance='cosine')

### Who is closest to ‘Elton John’ using word count features?

In [13]:
word_count_model.query(Elton)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


### Who is closest to ‘Elton John’ using TF-IDF features?

In [1]:
tfidf_model.query(Elton)

NameError: name 'tfidf_model' is not defined

### Who is closest to ‘Victoria Beckham’ using word count features?

In [2]:
word_count_model.query(Beckham)

NameError: name 'word_count_model' is not defined

### Who is closest to‘Victoria Beckham’ using TF-IDF features?

In [None]:
tfidf_model.query(Beckham)