# Document retrieval from wikipedia data
The dataset used in this notebook can be found here [people_wiki.csv](https://d396qusza40orc.cloudfront.net/phoenixassets/people_wiki.csv)

In [1]:
import pandas as pd
import numpy as np

In [2]:
import json
from collections import OrderedDict
from itertools import islice

In [3]:
from nltk.corpus import stopwords

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

In [5]:
people = pd.read_csv("./data/people_wiki.csv")

In [6]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [7]:
people.shape

(59071, 3)

## Explore Data Set

In [10]:
obama = people.loc[people.loc[:,"name"] == "Barack Obama"]

In [11]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [12]:
print(obama.loc[35817,"text"])

barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and after

## Words count for Obama article
**No need to clean the data**: From the data exploration steps, it looks like data (text of the articles) are already clean, so there's no need to perfom a cleaning process. The text could be used "as is" to build bag of words.

**We are not using stop_words**: It is not used in the course notebook, probably on purpose to demonstrate the benefit of TF-IDF technique.

We limit the vocabulary to 5000 words for now, we'll see if it is enough to match the notebook of the course.

In [13]:
vectorizer = CountVectorizer(analyzer = "word",tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)

In [14]:
obama_words_count = vectorizer.fit_transform(obama["text"]).toarray()

In [15]:
obama.loc[35817,"words_count"] = str(dict(zip(vectorizer.get_feature_names(),obama_words_count[0])))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
obama

Unnamed: 0,URI,name,text,words_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'won': 1, 'republican': 2, 'barack': 1, 'dont..."


In [17]:
obama.loc[35817,"words_count"]

"{'won': 1, 'republican': 2, 'barack': 1, 'dont': 2, 'with': 3, 'republicans': 1, 'by': 1, 'reauthorization': 1, 'initiatives': 1, 'review': 1, 'stimulus': 1, 'sufficient': 1, 'hillary': 1, 'president': 4, 'reelected': 1, 'election': 3, 'receive': 1, 'representatives': 2, 'in': 30, 'repeal': 1, 'tax': 1, 'august': 1, 'related': 1, 'seats': 1, 'close': 1, 'arms': 1, 'budget': 1, 'rights': 1, 'he': 7, 'operation': 1, 'nine': 1, 'bin': 1, 'other': 1, 'democratic': 4, 'supreme': 1, '2013': 1, 'gains': 1, 'while': 1, 'libya': 1, 'before': 1, 'lost': 1, 'hold': 1, 'treaty': 1, 'taught': 1, 'nominee': 2, 'general': 1, 'foreign': 2, 'bm': 1, 'mccain': 1, 'down': 1, 'sworn': 1, 'from': 3, 'three': 1, 'july': 1, 'began': 1, 'act': 8, 'peace': 1, 'current': 1, 'hawaii': 1, 'november': 2, 'convention': 1, 'administration': 1, 'over': 1, 'is': 2, 'honolulu': 1, '1996': 1, 'years': 1, 'illinois': 2, 'represent': 1, 'response': 3, 'then': 1, '63': 1, 'ii': 1, 'where': 1, 'african': 1, 'whether': 1, '

### Sorting words count for Obama

In [18]:
obama_words_count_table = pd.DataFrame({"count":obama_words_count[0],"words":vectorizer.get_feature_names()})

In [19]:
obama_words_count_table.sort_values("count",ascending=False).head(10)

Unnamed: 0,count,words
242,40,the
115,30,in
28,21,and
162,18,of
245,14,to
106,11,his
160,9,obama
18,8,act
104,7,he
30,6,as


## Compute tf-idf for people_wiki corpus

In [20]:
vectorizer_tfidf = TfidfVectorizer(analyzer = "word",tokenizer = None,
                             preprocessor = None,
                             stop_words = None,
                             max_features = 5000)

In [21]:
%%time
tfidf = vectorizer_tfidf.fit_transform(people.loc[:,"text"])

Wall time: 14.5 s


### Create the "tfidf" column

In [22]:
tfidf_a = tfidf.toarray()

**to_dict** is transforming one row of the tdidf_a array into a dictionary. Only items with value > 0 are kept when building the dictionary.

The dictionary is encoded in json so that it can be put into the dataframe.

In [23]:
def to_dict(tdidf_row):
    dic = {k:v for k,v in zip(vectorizer_tfidf.get_feature_names(),tdidf_row) if v>0}
    return(json.dumps(dic))

In [24]:
%%time
tfidf_df = pd.DataFrame(tfidf_a,people.index).apply(to_dict, axis=1)

Wall time: 6min 41s


In [25]:
people.insert(len(people.columns),"tfidf",tfidf_df)

In [26]:
people.head(5)

Unnamed: 0,URI,name,text,tfidf
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{""being"": 0.03941450419577328, ""retiring"": 0.0..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{""1973"": 0.06602200240988076, ""to"": 0.05445665..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{""home"": 0.09833416880092627, ""life"": 0.045857..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{""1973"": 0.04583276033333993, ""und"": 0.0761122..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{""50"": 0.06755992532656652, ""along"": 0.1087204..."


### Look at the first 10 most important words for Obama article as found by TF-IDF

In [27]:
obama = people.loc[people.loc[:,"name"] == "Barack Obama"]

In [28]:
obama

Unnamed: 0,URI,name,text,tfidf
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{""won"": 0.01816452900391534, ""republican"": 0.0..."


Get the 10 most important words:
- Order the dictionary
- transform into a list (dict not indexable), and take first 10.

In [29]:
obama_ordered_tfidf = OrderedDict(sorted(json.loads(obama.loc[35817,"tfidf"]).items(), key=lambda t: t[1], reverse=True))

In [30]:
list(obama_ordered_tfidf.items())[0:10]

[('obama', 0.3983862007203234),
 ('the', 0.30485743226669804),
 ('act', 0.27185987662017774),
 ('in', 0.22884055920050594),
 ('iraq', 0.16568635397679352),
 ('and', 0.16015314257916738),
 ('law', 0.15791425408479473),
 ('control', 0.14391109380018258),
 ('of', 0.13774201209013084),
 ('us', 0.13406301846956642)]

## Compute (cosine) distance between few people
We will use the tfidf sparse matrix as returned by **vectorizer_tfidf.fit_transform**. We first get indexes for the "few people" and compute the cosine distance using the sparse matrix.

In [31]:
obama_index = people.loc[people.loc[:,"name"] == "Barack Obama"].index[0]
clinton_index = people.loc[people.loc[:,"name"] == "Bill Clinton"].index[0]
beckham_index = people.loc[people.loc[:,"name"] == "David Beckham"].index[0]

In [32]:
pairwise_distances(tfidf[obama_index], tfidf[beckham_index], metric = "cosine")

array([[ 0.75699731]])

In [33]:
pairwise_distances(tfidf[obama_index], tfidf[clinton_index], metric = "cosine")

array([[ 0.61985685]])

## Re-compute distance with different configuration of the Vectorizer
### No limit on the number of features

In [34]:
vectorizer_tfidf_2 = TfidfVectorizer(analyzer = "word",tokenizer = None,
                             preprocessor = None,
                             stop_words = None)

In [35]:
%%time
tfidf_2 = vectorizer_tfidf_2.fit_transform(people.loc[:,"text"])

Wall time: 25.7 s


In [36]:
tfidf_2.shape

(59071, 548429)

In [37]:
pairwise_distances(tfidf_2[obama_index], tfidf_2[beckham_index], metric = "cosine")

array([[ 0.8420454]])

In [38]:
pairwise_distances(tfidf_2[obama_index], tfidf_2[clinton_index], metric = "cosine")

array([[ 0.67497775]])

### No limit on the number of features + Removing stop words

In [39]:
stopswrd = set(stopwords.words("english"))

In [40]:
vectorizer_tfidf_3 = TfidfVectorizer(analyzer = "word",tokenizer = None,
                             preprocessor = None,
                             stop_words = stopswrd)

In [41]:
%%time
tfidf_3 = vectorizer_tfidf_3.fit_transform(people.loc[:,"text"])

Wall time: 25.1 s


In [42]:
pairwise_distances(tfidf_3[obama_index], tfidf_3[beckham_index], metric = "cosine")

array([[ 0.97175843]])

In [43]:
pairwise_distances(tfidf_3[obama_index], tfidf_3[clinton_index], metric = "cosine")

array([[ 0.80338951]])

### Removing stop words, keep max_features to 5000

In [44]:
vectorizer_tfidf_4 = TfidfVectorizer(analyzer = "word",tokenizer = None,
                             preprocessor = None,
                             stop_words = stopswrd,
                             max_features = 5000)

In [45]:
%%time
tfidf_4 = vectorizer_tfidf_4.fit_transform(people.loc[:,"text"])

Wall time: 22.9 s


In [46]:
pairwise_distances(tfidf_4[obama_index], tfidf_4[beckham_index], metric = "cosine")

array([[ 0.95236845]])

In [47]:
pairwise_distances(tfidf_4[obama_index], tfidf_4[clinton_index], metric = "cosine")

array([[ 0.76615969]])

## Build a nearest neighbour model for document retrieval
Using **brute** algorithm and **cosine** metric as it seems to be the ones used for the course. Also using the vectorizer that gave the result most similar to the course for the distance(obama - clinton - beckham), i.e. **tfidf_3**.

### Using the KNeighborsClassifier classifier

In [48]:
kneighb = KNeighborsClassifier(algorithm="brute", metric="cosine")

In [49]:
kneighb.fit(tfidf_3,people.loc[:,"name"])

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

### Applying model for document retrieval: Closest documents to Obama document

In [50]:
Nearest_obama_neighbours = kneighb.kneighbors(tfidf_3[obama_index],n_neighbors=5)

In [51]:
Nearest_obama_neighbours_df = pd.DataFrame(people.loc[Nearest_obama_neighbours[1][0],"name"])
Nearest_obama_neighbours_df.insert(1,"distance",Nearest_obama_neighbours[0][0].reshape(-1,1))
Nearest_obama_neighbours_df

Unnamed: 0,name,distance
35817,Barack Obama,0.0
24478,Joe Biden,0.668282
57108,Hillary Rodham Clinton,0.728204
38376,Samantha Power,0.728826
38714,Eric Stern (politician),0.746098


### Using the NearestNeighbors unsupervised learner

In [52]:
neighb = NearestNeighbors(algorithm="brute", metric="cosine")

In [53]:
neighb.fit(tfidf_3)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

### Applying model for document retrieval: Closest documents to Obama document

In [54]:
Nearest_obama_neighbours = neighb.kneighbors(tfidf_3[obama_index])

In [55]:
Nearest_obama_neighbours_df = pd.DataFrame(people.loc[Nearest_obama_neighbours[1][0],"name"])
Nearest_obama_neighbours_df.insert(1,"distance",Nearest_obama_neighbours[0][0].reshape(-1,1))
Nearest_obama_neighbours_df

Unnamed: 0,name,distance
35817,Barack Obama,0.0
24478,Joe Biden,0.668282
57108,Hillary Rodham Clinton,0.728204
38376,Samantha Power,0.728826
38714,Eric Stern (politician),0.746098


### Other examples: Closest documents to Swift document

In [56]:
swift_index = people.loc[people.loc[:,"name"] == "Taylor Swift"].index[0]

In [57]:
Nearest_swift_neighbours = neighb.kneighbors(tfidf_3[swift_index])

In [58]:
Nearest_swift_neighbours_df = pd.DataFrame(people.loc[Nearest_swift_neighbours[1][0],"name"])
Nearest_swift_neighbours_df.insert(1,"distance",Nearest_swift_neighbours[0][0].reshape(-1,1))
Nearest_swift_neighbours_df

Unnamed: 0,name,distance
54264,Taylor Swift,0.0
317,Carrie Underwood,0.697925
29297,Kelly Clarkson,0.710268
9379,Al Swift,0.71164
25403,Ed Sheeran,0.711675


### Other examples: Closest documents to Jolie document

In [59]:
jolie_index = people.loc[people.loc[:,"name"] == "Angelina Jolie"].index[0]

In [60]:
Nearest_jolie_neighbours = neighb.kneighbors(tfidf_3[jolie_index])

In [61]:
Nearest_jolie_neighbours_df = pd.DataFrame(people.loc[Nearest_jolie_neighbours[1][0],"name"])
Nearest_jolie_neighbours_df.insert(1,"distance",Nearest_jolie_neighbours[0][0].reshape(-1,1))
Nearest_jolie_neighbours_df

Unnamed: 0,name,distance
39521,Angelina Jolie,0.0
24426,Brad Pitt,0.690044
16625,Keith Jolie,0.772123
21644,Jodie Foster,0.783009
34756,Maggie Smith,0.786867


### Other examples: Closest documents to Arnold document

In [62]:
arnold_index = people.loc[people.loc[:,"name"] == "Arnold Schwarzenegger"].index[0]

In [63]:
Nearest_arnold_neighbours = neighb.kneighbors(tfidf_3[arnold_index])

In [64]:
Nearest_arnold_neighbours_df = pd.DataFrame(people.loc[Nearest_arnold_neighbours[1][0],"name"])
Nearest_arnold_neighbours_df.insert(1,"distance",Nearest_arnold_neighbours[0][0].reshape(-1,1))
Nearest_arnold_neighbours_df

Unnamed: 0,name,distance
16018,Arnold Schwarzenegger,-2.220446e-16
58965,Bonnie Garcia,0.7935808
35293,Paul Grant (bodybuilder),0.7996773
47709,Gray Davis,0.8240722
8050,James Tramel,0.8255865
