# Document retrieval from wikipedia data

In [1]:
import pandas as pd
import numpy as np

# Load some text data - from wikipedia, pages on people

In [2]:
people = pd.read_csv('people_wiki.csv')

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
len(people)

59071

# Explore the dataset and checkout the text it contains

## Exploring the entry for president Obama

In [5]:
obama = people[people['name'] == 'Barack Obama']

In [6]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [7]:
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Exploring the other entries

In [8]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

In [13]:
John = people[people['name'] == 'Elton John']
John['text']

19923    sir elton hercules john cbe born reginald kenn...
Name: text, dtype: object

# Get the word counts for Obama article

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

In [64]:
vectorizer = CountVectorizer()
analyzer = vectorizer.build_analyzer()

In [65]:
def count_words(doc):
    dic = {}
    if analyzer(doc):
        M = count_vectorizer.fit_transform([doc]).toarray()[0]
        for word,index in count_vectorizer.vocabulary_.items():
            dic[word] = M[index]
    return dic

In [66]:
obama['word_count'] = obama['text'].apply(count_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [68]:
obama.head()

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ..."


# Sort the word counts for the Obama article

In [77]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
x_obama = vectorizer.fit_transform(obama['text'])

In [78]:
print(vectorizer.get_feature_names())
print(x_obama.toarray())

['13th', '1961', '1992', '1996', '1997', '20', '2000in', '2004', '2007', '2008', '2009', '2010', '2011', '2012', '2012obama', '2013', '44th', '63', 'act', 'address', 'administration', 'affordable', 'afghanistan', 'african', 'after', 'against', 'american', 'americans', 'and', 'arms', 'as', 'ask', 'at', 'attention', 'attorney', 'august', 'barack', 'before', 'began', 'bin', 'bm', 'born', 'briefs', 'brk', 'budget', 'by', 'californias', 'called', 'campaign', 'care', 'chicago', 'civil', 'clinton', 'close', 'columbia', 'combat', 'community', 'constitutional', 'consumer', 'continued', 'control', 'convention', 'court', 'creation', 'cuba', 'current', 'death', 'debate', 'debt', 'defeated', 'defeating', 'defense', 'degree', 'delegates', 'democratic', 'district', 'doddfrank', 'domestic', 'dont', 'down', 'during', 'earning', 'economic', 'election', 'elementary', 'ended', 'ending', 'equality', 'federal', 'filed', 'first', 'for', 'foreign', 'form', 'from', 'full', 'gains', 'general', 'graduate', 'grea

In [79]:
word_obama = vectorizer.get_feature_names()

In [80]:
word_count = x_obama.toarray().tolist()
word_count

[[1,
  1,
  1,
  1,
  1,
  2,
  1,
  3,
  1,
  1,
  3,
  2,
  3,
  1,
  1,
  1,
  1,
  1,
  8,
  1,
  1,
  1,
  2,
  1,
  4,
  1,
  3,
  1,
  21,
  1,
  6,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  1,
  2,
  2,
  1,
  2,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  4,
  2,
  1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  2,
  4,
  1,
  7,
  1,
  11,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  2,
  30,
  1,
  1,
  1,
  1,
  1,
  1,
  3,
  4,
  2,
  1,
  3,
  1,
  1,
  1,
  1,
  1,
  1,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  4,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  2,
  9,
  1,
  18,
  2,
  1,
  2,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  1,
  3,
  1,
  1,
  1,
  2,
  4,
  2,
  1,
  2,
  1,
  1,
  1,
  1,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
 

In [81]:
word_count_flat = [item for sublist in word_count for item in sublist]
word_count_flat

[1,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 3,
 2,
 3,
 1,
 1,
 1,
 1,
 1,
 8,
 1,
 1,
 1,
 2,
 1,
 4,
 1,
 3,
 1,
 21,
 1,
 6,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 2,
 2,
 1,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 4,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 4,
 1,
 7,
 1,
 11,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 2,
 30,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 4,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 6,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 9,
 1,
 18,
 2,
 1,
 2,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 2,
 4,
 2,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 2,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 2,
 3,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 40,
 1,
 1,
 14,
 1,
 1,
 1

In [90]:
word_count_obama = pd.DataFrame({
    'word_obama':word_obama,
    'word_count':word_count_flat
})
word_count_obama.sort_values(by='word_count', ascending = False)

Unnamed: 0,word_obama,word_count
242,the,40
115,in,30
28,and,21
162,of,18
245,to,14
106,his,11
160,obama,9
18,act,8
104,he,7
30,as,6


# Compute TF-IDF for the corpus 

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
tfidf_matrix = vect.fit_transform(people['text'])

In [105]:
feature_names = vect.get_feature_names()

In [122]:
doc = 35817
feature_index = tfidf_matrix[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])
zip_tfidf = [(feature_names[i], s) for (i, s) in tfidf_scores]

In [123]:
word_O=[]
iftdf_O=[]

In [124]:
for w, s in zip_tfidf:
    print (w, s)
    word_O.append(w)
    iftdf_O.append(s)

born 0.017709931271046624
is 0.014350365063062639
with 0.02518812459925273
the 0.2793227400023615
and 0.14673880270062417
in 0.20967299876631698
his 0.09697568415459205
for 0.029964410726194306
he 0.05930502949224121
was 0.03768254961844391
at 0.016971534950574383
of 0.1262048162788276
to 0.10234195997379708
by 0.009597572040438845
as 0.047221719116605276
during 0.032362102172086446
first 0.035517610776050694
against 0.021001902062722264
other 0.017053337660265806
before 0.017433021692964398
continued 0.02597680547563224
2007 0.017372040001768754
2008 0.01752122556256475
from 0.02710027260881693
2009 0.053716110129215185
2013 0.020629112598581526
district 0.026352926305256295
2011 0.0566046837038299
has 0.03989687916800934
sandy 0.04531093181008557
university 0.025797309845256983
chicago 0.0548648391973696
after 0.05430461125327095
full 0.02794850759724801
national 0.030528412104160435
worked 0.017832308630830872
on 0.0165399428357477
that 0.011600429362851402
current 0.026746457660782

In [125]:
tfidf_obama=pd.DataFrame({
    'word_obmam':word_O,
    'iftdf_obama':iftdf_O
})
tfidf_obama.sort_values(by='iftdf_obama', ascending=False)

Unnamed: 0,word_obmam,iftdf_obama
171,obama,0.365018
3,the,0.279323
177,act,0.249089
5,in,0.209673
155,iraq,0.151809
4,and,0.146739
137,law,0.144687
185,control,0.131857
11,of,0.126205
58,us,0.122834


## Is Obama closer to Clinton than to Beckham?

In [126]:
from sklearn.metrics.pairwise import cosine_distances

In [129]:
def norm(x):
    sum_sq=x.dot(x.T)
    norm=np.sqrt(sum_sq)
    return(norm)

In [145]:
import numpy as np
def cosine_distance(x, y):
    xy = x.dot(y.T)
    dist = xy/(norm(x)*norm(y))
    return 1-dist[0,0]

Biden_tf_idf = tfidf_matrix[24478,:]
Obama_tf_idf = tfidf_matrix[35817,:]


print ('================= Cosine distance from John')
print ('answer:cosine distance between the articles on ‘Joe Biden’ and ‘Barack Obama’',cosine_distance(Biden_tf_idf, Obama_tf_idf))

answer:cosine distance between the articles on ‘Joe Biden’ and ‘Barack Obama’ 0.5707806801562201


# Build a nearest neighbor model for document retrieval

In [None]:
# Approach 1 - using word count features

In [131]:
from sklearn.neighbors import NearestNeighbors
x = CountVectorizer().fit_transform(people['text'])

In [132]:
model_count = NearestNeighbors(metric='cosine')
model_count.fit(x)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [137]:
distances, indices = model_count.kneighbors(x[35817], n_neighbors=2) 

In [138]:
neighbors_obama = pd.DataFrame({'distance':distances[0].tolist(), 'id':indices[0].tolist()})

In [139]:
neighbors_obama

Unnamed: 0,distance,id
0,1.44329e-15,35817
1,0.1224048,24478


In [140]:
people.iloc[24478]['name']

'Joe Biden'

In [None]:
# Approach 2 - using TF-IDF features

In [141]:
model_tf_idf = NearestNeighbors(metric='cosine')
model_tf_idf.fit(tfidf_matrix)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [142]:
distances, indices = model_tf_idf.kneighbors(tfidf_matrix[35817,:], n_neighbors=2)

In [143]:
neighbors_Obama_tf = pd.DataFrame({'distance':distances[0].tolist(), 'id':indices[0].tolist()})
neighbors_Obama_tf

Unnamed: 0,distance,id
0,0.0,35817
1,0.570781,24478
