In [None]:
import graphlab

## Load Some Test Data

In [None]:
# Load the dataset stored in the local 'people_wiki.gl/' directory into an SFrame.
people = graphlab.SFrame('people_wiki.gl/')

In [None]:
# Preview the first rows to see which columns the dataset provides.
people.head()

In [None]:
# Number of rows in the dataset.
len(people)

## Explore the data set and its contents


In [None]:
# Keep only the row(s) whose 'name' is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [None]:
# Display the selected row(s) with all columns.
obama

In [None]:
# Display only the 'text' column of the selection.
obama['text']

In [None]:
# Keep only the row(s) whose 'name' is exactly 'George Clooney'.
# NOTE(review): the variable name 'cloney' misspells "Clooney"; the next cell
# reads it under this spelling, so any rename must change both cells together.
cloney = people[people['name'] == 'George Clooney']

In [None]:
# Display only the 'text' column of the George Clooney selection.
cloney['text']

## Get the word counts for the Obama article

In [None]:
# Add a 'word_count' column to the selection, computed from its 'text' column
# by GraphLab's count_words.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])

In [None]:
# Show the word counts just computed. A bare final expression lets the
# notebook render the value as cell output instead of printing plain text.
obama['word_count']

## Sort the word counts for the Obama article

In [None]:
# Unpack the 'word_count' column into one row per entry, with the entry's key
# in 'word' and its value in 'count'.
obama_word_count_table = obama.select_columns(['word_count']).stack(
    'word_count',
    new_column_name=['word', 'count'])

In [None]:
# Preview the first rows of the stacked word/count table.
obama_word_count_table.head()

In [None]:
# Display the table ordered by 'count', largest first. The sorted result is
# only shown here; it is not assigned to a name.
obama_word_count_table.sort('count', ascending=False)

## Compute TF-IDF for the corpus

In [None]:
# Add a 'word_count' column to `people`, computed from the 'text' column of
# every row. This modifies `people` in place.
people['word_count'] = graphlab.text_analytics.count_words(people['text'])

In [None]:
# Preview `people` again to check the new 'word_count' column.
people.head()

In [None]:
# Compute TF-IDF from the per-row word counts and display the result.
# NOTE(review): a later cell assigns `tfidf` directly as a column of `people`,
# so this is assumed to return a single column rather than a table — confirm
# for the installed GraphLab version.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])
tfidf

In [None]:
# Store the TF-IDF result on `people` as the 'tfidf' column (modifies `people`
# in place).
people['tfidf'] = tfidf

In [None]:
# Preview `people` again to check the new 'tfidf' column.
people.head()

## Examine the TF-IDF for the Obama article

In [None]:
# Re-select the 'Barack Obama' row(s) from `people`, which by now also has the
# 'word_count' and 'tfidf' columns; this rebinds the earlier `obama`.
obama = people[people['name'] == 'Barack Obama']

In [None]:
# Unpack the 'tfidf' column into one row per entry ('word', 'tfidf') and
# display it ordered by 'tfidf', largest first.
obama.select_columns(['tfidf']).stack(
    'tfidf',
    new_column_name=['word', 'tfidf']
).sort('tfidf', ascending=False)

# Manually compute distances between a few people

In [None]:
# Keep only the row(s) whose 'name' is exactly 'Bill Clinton'.
clinton = people[people['name'] == 'Bill Clinton']

In [None]:
# Keep only the row(s) whose 'name' is exactly 'David Beckham'.
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton than to Beckham?

In [None]:
# Cosine distance between the first 'tfidf' entry of the Obama selection and
# the first of the Clinton selection (lower is closer).
graphlab.distances.cosine(
    obama['tfidf'][0],
    clinton['tfidf'][0])

In [None]:
# Cosine distance between the first 'tfidf' entry of the Obama selection and
# the first of the Beckham selection.
graphlab.distances.cosine(
    obama['tfidf'][0],
    beckham['tfidf'][0])

# Build a nearest neighbor model for document retrieval

In [None]:
# Nearest-neighbour model over the 'tfidf' feature, labelled by 'name'.
# The distance is pinned to cosine so that neighbour rankings use the same
# metric as the manual graphlab.distances.cosine comparisons and as the
# wc_model / tfidf_model built later in this notebook. With no `distance`
# argument the library chooses a metric itself, which is presumably not cosine
# for this feature — TODO confirm against the nearest_neighbors.create docs.
knn_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine')

## Applying the nearest neighbors model for retrieval
### Who is nearest to Obama? 

In [None]:
# Ask the model for the entries of `people` it ranks nearest to the Obama row(s).
knn_model.query(obama)

## Other examples of document retrieval

In [None]:
# Keep only the row(s) whose 'name' is exactly 'Taylor Swift'.
swift = people[people['name'] == 'Taylor Swift']

In [None]:
# Ask the model for the entries of `people` it ranks nearest to the Swift row(s).
knn_model.query(swift)

In [None]:
# Keep only the row(s) whose 'name' is exactly 'Angelina Jolie'.
jolie = people[people['name'] == 'Angelina Jolie']

In [None]:
# Ask the model for the entries of `people` it ranks nearest to the Jolie row(s).
knn_model.query(jolie)

In [None]:
# Keep only the row(s) whose 'name' is exactly 'Arnold Schwarzenegger'.
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [None]:
# Ask the model for the entries of `people` it ranks nearest to the
# Schwarzenegger row(s).
knn_model.query(arnold)

## Elton John: word counts and TF-IDF

In [None]:
# Keep only the row(s) whose 'name' is exactly 'Elton John'.
elton = people[people['name'] == 'Elton John']

In [None]:
# Unpack the 'word_count' column into one row per entry, with the entry's key
# in 'word' and its value in 'count'.
elton_word_count_table = elton.select_columns(['word_count']).stack(
    'word_count',
    new_column_name=['word', 'count'])

In [None]:
# Display the table ordered by 'count', largest first. The sorted result is
# only shown here; it is not assigned to a name.
elton_word_count_table.sort('count', ascending=False)

In [None]:
# Unpack the 'tfidf' column into one row per entry ('word', 'tfidf') and
# display it ordered by 'tfidf', largest first.
elton.select_columns(['tfidf']).stack(
    'tfidf',
    new_column_name=['word', 'tfidf']
).sort('tfidf', ascending=False)

## Cosine distances from Elton John to Victoria Beckham and Paul McCartney

In [None]:
# Keep only the row(s) whose 'name' is exactly 'Victoria Beckham'.
victoria = people[people['name'] == 'Victoria Beckham']

In [None]:
# Keep only the row(s) whose 'name' is exactly 'Paul McCartney'.
paul = people[people['name'] == 'Paul McCartney']

In [None]:
# Cosine distance between the first 'tfidf' entry of the Elton John selection
# and the first of the Victoria Beckham selection (lower is closer).
graphlab.distances.cosine(
    elton['tfidf'][0],
    victoria['tfidf'][0])

In [None]:
# Cosine distance between the first 'tfidf' entry of the Elton John selection
# and the first of the Paul McCartney selection (lower is closer).
graphlab.distances.cosine(
    elton['tfidf'][0],
    paul['tfidf'][0])

## Comparing Models

In [None]:
# Nearest-neighbour model over the raw 'word_count' feature, labelled by
# 'name' and using cosine distance.
wc_model = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine')

In [None]:
# Nearest-neighbour model over the 'tfidf' feature, labelled by 'name' and
# using cosine distance.
tfidf_model = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine')

### Evaluate models for Elton

In [None]:
# Neighbours of the Elton John row(s) according to the word-count model.
wc_model.query(elton)

In [None]:
# Neighbours of the Elton John row(s) according to the TF-IDF model.
tfidf_model.query(elton)

### Evaluate models for Victoria

In [None]:
# Neighbours of the Victoria Beckham row(s) according to the word-count model.
wc_model.query(victoria)

In [None]:
# Neighbours of the Victoria Beckham row(s) according to the TF-IDF model.
tfidf_model.query(victoria)