# Document retrieval from Wikipedia data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
people =pd.read_csv('~/ML-Specialization/Machine Learning Foundations: A Case Study Approach/Week 4/Dataset/people_wiki.csv')

In [3]:
people

Unnamed: 0,name,text
0,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,G-Enka,henry krvits born 30 december 1974 in tallinn ...
...,...,...
59066,Olari Elts,olari elts born april 27 1971 in tallinn eston...
59067,Scott F. Crago,scott francis crago born july 26 1963 twin bro...
59068,David Cass (footballer),david william royce cass born 27 march 1962 in...
59069,Keith Elias,keith hector elias born february 3 1972 in lac...


# Explore data

## Taking a look at the entry for President Obama


In [4]:
obama = people[people['name'] == 'Barack Obama']

In [5]:
obama

Unnamed: 0,name,text
35817,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [6]:
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Explore the entry for actor George Clooney

In [7]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

# Word counts for Obama acticle


In [8]:
from collections import Counter

In [9]:
res = obama['text'].str.split().apply(pd.value_counts)

In [10]:
res

Unnamed: 0,the,in,and,of,to,his,obama,act,a,he,...,street,relations,romney,equality,budget,tax,inaugurated,down,representing,cuba
35817,40,30,21,18,14,11,9,8,7,7,...,1,1,1,1,1,1,1,1,1,1


In [11]:
res = [Counter(x) for x in obama['text'].str.split()]

In [12]:
list(res)

[Counter({'barack': 1,
          'hussein': 1,
          'obama': 9,
          'ii': 1,
          'brk': 1,
          'husen': 1,
          'bm': 1,
          'born': 2,
          'august': 1,
          '4': 1,
          '1961': 1,
          'is': 2,
          'the': 40,
          '44th': 1,
          'and': 21,
          'current': 1,
          'president': 4,
          'of': 18,
          'united': 3,
          'states': 3,
          'first': 3,
          'african': 1,
          'american': 3,
          'to': 14,
          'hold': 1,
          'office': 2,
          'in': 30,
          'honolulu': 1,
          'hawaii': 1,
          'a': 7,
          'graduate': 1,
          'columbia': 1,
          'university': 2,
          'harvard': 2,
          'law': 6,
          'school': 3,
          'where': 1,
          'he': 7,
          'served': 2,
          'as': 6,
          'review': 1,
          'was': 5,
          'community': 1,
          'organizer': 1,
          'chicago': 2,
   

In [13]:
obama['word_count']= [Counter(x) for x in obama['text'].str.split()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
obama

Unnamed: 0,name,text,word_count
35817,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ..."


## Find most common words in Obama article

In [15]:
obama['word_count']

35817    {'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ...
Name: word_count, dtype: object

In [16]:
rslt = pd.DataFrame(Counter(" ".join(obama['text']).split()).most_common(),columns=['Word', 'Frequency'])

In [17]:
rslt.style

Unnamed: 0,Word,Frequency
0,the,40
1,in,30
2,and,21
3,of,18
4,to,14
5,his,11
6,obama,9
7,act,8
8,a,7
9,he,7


In [18]:
obama_text = people.loc[people['name']=='Barack Obama', 'text'].tolist()

In [19]:
obama_text

['barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and aft

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
counter1 = CountVectorizer(stop_words='english')
counter1

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [21]:
count_matrix =counter1.fit_transform(obama_text)
count_matrix

<1x229 sparse matrix of type '<class 'numpy.int64'>'
	with 229 stored elements in Compressed Sparse Row format>

In [22]:
features = counter1.get_feature_names()
features

['13th',
 '1961',
 '1992',
 '1996',
 '1997',
 '20',
 '2000in',
 '2004',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2012obama',
 '2013',
 '44th',
 '63',
 'act',
 'address',
 'administration',
 'affordable',
 'afghanistan',
 'african',
 'american',
 'americans',
 'arms',
 'ask',
 'attention',
 'attorney',
 'august',
 'barack',
 'began',
 'bin',
 'bm',
 'born',
 'briefs',
 'brk',
 'budget',
 'californias',
 'called',
 'campaign',
 'care',
 'chicago',
 'civil',
 'clinton',
 'close',
 'columbia',
 'combat',
 'community',
 'constitutional',
 'consumer',
 'continued',
 'control',
 'convention',
 'court',
 'creation',
 'cuba',
 'current',
 'death',
 'debate',
 'debt',
 'defeated',
 'defeating',
 'defense',
 'degree',
 'delegates',
 'democratic',
 'district',
 'doddfrank',
 'domestic',
 'dont',
 'earning',
 'economic',
 'election',
 'elementary',
 'ended',
 'ending',
 'equality',
 'federal',
 'filed',
 'foreign',
 'form',
 'gains',
 'general',
 'graduate',
 'great',
 'gun',
 'harva

In [23]:
# Create a series from the sparse matrix
obama_counter = pd.Series(count_matrix.toarray().flatten(), 
              index = features).sort_values(ascending=False)

In [24]:
obama_counter

obama         9
act           8
law           6
military      4
control       4
             ..
normalize     1
nomination    1
nobel         1
new           1
13th          1
Length: 229, dtype: int64

# Compute TF-IDF for the entire corpus of articles

In [25]:
people['word_count']= [Counter(x) for x in people['text'].str.split()]

In [26]:
people

Unnamed: 0,name,text,word_count
0,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,..."
1,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'alfred': 1, 'j': 1, 'lewy': 3, 'aka': 1, 'sa..."
2,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'harpdog': 2, 'brown': 2, 'is': 7, 'a': 7, 's..."
3,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'franz': 1, 'rottensteiner': 3, 'born': 1, 'i..."
4,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'henry': 1, 'krvits': 1, 'born': 1, '30': 1, ..."
...,...,...,...
59066,Olari Elts,olari elts born april 27 1971 in tallinn eston...,"{'olari': 2, 'elts': 3, 'born': 1, 'april': 1,..."
59067,Scott F. Crago,scott francis crago born july 26 1963 twin bro...,"{'scott': 1, 'francis': 1, 'crago': 5, 'born':..."
59068,David Cass (footballer),david william royce cass born 27 march 1962 in...,"{'david': 1, 'william': 1, 'royce': 1, 'cass':..."
59069,Keith Elias,keith hector elias born february 3 1972 in lac...,"{'keith': 1, 'hector': 1, 'elias': 4, 'born': ..."


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
tfidf_vec = TfidfVectorizer()
transformed = tfidf_vec.fit_transform(raw_documents=people['text'])
index_value={i[1]:i[0] for i in tfidf_vec.vocabulary_.items()}

In [29]:
fully_indexed = []
for row in transformed:
    fully_indexed.append({index_value[column]:value for (column,value) in zip(row.indices,row.data)})

In [30]:
people['tf_idf'] = fully_indexed

In [31]:
people

Unnamed: 0,name,text,word_count,tf_idf
0,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{'digby': 1, 'morrell': 5, 'born': 1, '10': 1,...","{'melbourne': 0.04943650649482413, 'college': ..."
1,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{'alfred': 1, 'j': 1, 'lewy': 3, 'aka': 1, 'sa...","{'every': 0.03831773682944378, 'capsule': 0.07..."
2,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{'harpdog': 2, 'brown': 2, 'is': 7, 'a': 7, 's...","{'society': 0.03991955990922275, 'hamilton': 0..."
3,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{'franz': 1, 'rottensteiner': 3, 'born': 1, 'i...","{'kurdlawitzpreis': 0.08928312306619517, 'spec..."
4,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'henry': 1, 'krvits': 1, 'born': 1, '30': 1, ...","{'curtis': 0.052582727709648204, 'promo': 0.06..."
...,...,...,...,...
59066,Olari Elts,olari elts born april 27 1971 in tallinn eston...,"{'olari': 2, 'elts': 3, 'born': 1, 'april': 1,...","{'ivth': 0.09848761070515089, 'orchestraolaris..."
59067,Scott F. Crago,scott francis crago born july 26 1963 twin bro...,"{'scott': 1, 'francis': 1, 'crago': 5, 'born':...","{'cosongwriting': 0.08387253596002299, 'sounda..."
59068,David Cass (footballer),david william royce cass born 27 march 1962 in...,"{'david': 1, 'william': 1, 'royce': 1, 'cass':...","{'grewcock': 0.12163141646353066, '15696': 0.1..."
59069,Keith Elias,keith hector elias born february 3 1972 in lac...,"{'keith': 1, 'hector': 1, 'elias': 4, 'born': ...","{'recordselias': 0.1249798907183773, 'cochampi..."


## Examine the TF-IDF for the Obama article

In [32]:
obama = people[people['name'] == 'Barack Obama']
#obama[['tf_idf']].stack('tf_idf',columns=['word','tf_idf']).sort('tf_idf',ascending=False)

In [33]:
obama

Unnamed: 0,name,text,word_count,tf_idf
35817,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{'barack': 1, 'hussein': 1, 'obama': 9, 'ii': ...","{'2012obama': 0.07885433263257774, 'doddfrank'..."


In [75]:
obama_tf = pd.DataFrame(obama['tf_idf'].iloc[0].items(), columns=['Word', 'tf_idf'])

In [76]:
obama_tf = obama_tf.sort_values('tf_idf', ascending=False)

In [77]:
obama_tf

Unnamed: 0,Word,tf_idf
98,obama,0.365018
266,the,0.279323
92,act,0.249089
264,in,0.209673
114,iraq,0.151809
...,...,...
268,is,0.014350
216,new,0.013177
221,which,0.012341
232,that,0.011600


## Examine the TF-IDF for Clooney

In [96]:
clooney = people[people['name'] == 'George Clooney']

In [97]:
clooney

Unnamed: 0,name,text,word_count,tf_idf
38514,George Clooney,george timothy clooney born may 6 1961 is an a...,"{'george': 1, 'timothy': 1, 'clooney': 4, 'bor...","{'categoriesclooney': 0.10120408406633526, 'he..."


In [98]:
clooney_tf = pd.DataFrame(clooney['tf_idf'].iloc[0].items(), columns=['Word', 'tf_idf'])

In [99]:
clooney_tf

Unnamed: 0,Word,tf_idf
0,categoriesclooney,0.101204
1,heslov,0.101204
2,producingclooney,0.101204
3,syriana,0.089978
4,leatherheads,0.094993
...,...,...
231,the,0.313680
232,with,0.043103
233,who,0.017115
234,is,0.046044


In [100]:
clooney_tf = clooney_tf.sort_values('tf_idf', ascending=False)

In [101]:
clooney_tf

Unnamed: 0,Word,tf_idf
231,the,0.313680
30,clooney,0.307745
42,thriller,0.211806
58,drama,0.157204
65,actor,0.141867
...,...,...
180,american,0.019064
233,who,0.017115
211,from,0.011594
235,born,0.011365
