# Document Analysis in Python
In this notebook we will cover:
- Reading document data into memory
- Creating bag of words features
- Creating smoothed tf-idf features

In [1]:
import requests
import json
from contextlib import closing

# get API key saved on hardrive
with open('../NYTimesAPI.txt') as f:
    api_key = f.read() # read in my private key (sorry, not in this repo ¯\_(ツ)_/¯ )
    
# make base URL and dictionary of get request key/values
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
payload = {'api-key': api_key, 'q':'super bowl'} # key/values for get request (look up in api, there are lots)

# Perform the actual request
with closing(requests.get(url,params=payload)) as r:
    articles = r.json()
    print(articles)

{'response': {'meta': {'offset': 0, 'time': 40, 'hits': 38349}, 'docs': [{'subsection_name': 'Pro Football', 'web_url': 'http://www.nytimes.com/2017/01/19/sports/football/patriots-robert-kraft-nfl-roger-goodell.html', 'slideshow_credits': None, 'multimedia': [{'height': 126, 'type': 'image', 'legacy': {'wide': 'images/2017/01/19/sports/22KRAFT1/22KRAFT1-thumbWide.jpg', 'widewidth': '190', 'wideheight': '126'}, 'width': 190, 'subtype': 'wide', 'url': 'images/2017/01/19/sports/22KRAFT1/22KRAFT1-thumbWide.jpg'}, {'height': 400, 'type': 'image', 'legacy': {'xlargewidth': '600', 'xlarge': 'images/2017/01/19/sports/22KRAFT1/22KRAFT1-articleLarge.jpg', 'xlargeheight': '400'}, 'width': 600, 'subtype': 'xlarge', 'url': 'images/2017/01/19/sports/22KRAFT1/22KRAFT1-articleLarge.jpg'}, {'height': 75, 'type': 'image', 'legacy': {'thumbnail': 'images/2017/01/19/sports/22KRAFT1/22KRAFT1-thumbStandard.jpg', 'thumbnailheight': '75', 'thumbnailwidth': '75'}, 'width': 75, 'subtype': 'thumbnail', 'url': 'i

In [None]:
# OR we can load an example query like this
# run this block of code if you can't run anything else
import json 
with open('data/nytime.json') as f:
    articles = json.loads(f.read())
    
articles

In [2]:
# get the summary text from each article
summary_text = [x['lead_paragraph'] for x in articles['response']['docs']]
summary_text

['The strained relations between Robert Kraft and the league will be inescapable if New England returns to the Super Bowl, two years after the “Deflategate” scandal.',
 'Brady threw for 384 yards and three touchdowns to vanquish visiting Pittsburgh and lead New England to the Super Bowl for the seventh time.',
 'Sanu is among the N.F.L. players for whom stocks are traded on an exchange, and his shares could get a bump from his Super Bowl appearance.',
 'Avoiding runway shows, the actor is plugging a website hosting service that helps him sell his theatrical, untrendy men’s wear directly to the public.',
 'The football league has denied that it asked the singer not to discuss politics ahead of her performance at the Super Bowl.',
 'The city, which was hit hard by slumping oil prices, has overhauled its night life, infrastructure and parks in hopes of becoming a world-class destination.',
 'The Patriots advanced to the Super Bowl for a record ninth time, while the Falcons will be trying 

# Converting document data into different representations
First lets go through and count the unique words in each opening sentence (that is what the NYTimes give us for free).
- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer() # an object capable of counting words in a document!

# count_vect.fit(summary_text)
# count_vect.transform(summary_text)
bag_words = count_vect.fit_transform(summary_text)

In [4]:
print(bag_words.shape) # this is a sparse matrix
print('=========')
print(bag_words[0])

(10, 156)
  (0, 111)	1
  (0, 36)	1
  (0, 6)	1
  (0, 154)	1
  (0, 138)	1
  (0, 24)	1
  (0, 122)	1
  (0, 133)	1
  (0, 106)	1
  (0, 41)	1
  (0, 84)	1
  (0, 65)	1
  (0, 68)	1
  (0, 18)	1
  (0, 149)	1
  (0, 75)	1
  (0, 10)	1
  (0, 73)	1
  (0, 108)	1
  (0, 23)	1
  (0, 105)	1
  (0, 121)	1
  (0, 125)	4


# Self Test: ML01b.3: 
Do you expect the vocabulary from the articles above to be:
- A. Greater than 1M words
- B. Greater than 10,000 words
- C. Fewer than 10,000 words

In [5]:
print(len(count_vect.vocabulary_))
print(count_vect.vocabulary_)

156
{'singer': 117, 'seventh': 114, 'appearance': 12, 'handicappers': 52, 'attend': 16, 'kraft': 73, 'not': 87, 'scandal': 111, 'service': 113, 'if': 65, 'vanquish': 140, 'advanced': 5, 'being': 21, 'wear': 144, 'ahead': 7, 'sell': 112, 'giants': 51, 'falcons': 43, 'league': 75, 'anniversary': 11, 'heart': 56, 'her': 59, 'to': 133, 'avoiding': 17, 'runway': 109, 'they': 128, 'england': 41, 'infrastructure': 69, 'discuss': 40, 'adam': 4, 'world': 152, 'with': 151, 'threw': 130, 'close': 32, 'among': 8, 'directly': 39, 'which': 146, 'on': 91, 'years': 154, 'overhauled': 92, 'cold': 33, 'will': 149, 'is': 70, 'the': 125, 'between': 23, 'than': 123, 'relations': 105, 'life': 76, 'exchange': 42, 'night': 85, 'parks': 93, 'offense': 89, 'able': 2, 'his': 61, 'of': 88, 'him': 60, 'two': 138, 'slumping': 118, 'destination': 38, 'title': 132, 'city': 29, 'minds': 82, '10th': 0, 'strained': 121, 'bowl': 24, 'football': 46, 'get': 50, 'hearts': 57, 'hit': 62, 'might': 81, 'touchdowns': 135, 'at':

In [6]:
# we can still look at the data using an inverse transform
# but we lose the ordering of the words (after all its just a bag of wrods model)
count_vect.inverse_transform(bag_words[0])

[array(['scandal', 'deflategate', 'after', 'years', 'two', 'bowl', 'super',
        'to', 'returns', 'england', 'new', 'if', 'inescapable', 'be',
        'will', 'league', 'and', 'kraft', 'robert', 'between', 'relations',
        'strained', 'the'], 
       dtype='<U14')]

In [7]:
# now let's create a pandas API out of this
import pandas as pd

pd.options.display.max_columns = 999
df = pd.DataFrame(data=bag_words.toarray(),columns=count_vect.get_feature_names())

In [8]:
df # display the full bag of words matrix  

Unnamed: 0,10th,384,able,actor,adam,advanced,after,ahead,among,an,and,anniversary,appearance,are,asked,at,attend,avoiding,be,becoming,behind,being,bet,between,bowl,brady,bump,by,championship,city,claim,class,close,cold,connecticut,could,deflategate,denied,destination,directly,discuss,england,exchange,falcons,fans,first,football,for,from,game,get,giants,handicappers,hard,has,have,heart,hearts,helps,her,him,his,hit,hopes,hosting,if,in,indianapolis,inescapable,infrastructure,is,it,its,kraft,lead,league,life,look,made,mathis,men,might,minds,more,new,night,ninth,not,of,offense,oil,on,overhauled,parks,patriots,performance,pittsburgh,players,plugging,politics,predictions,prices,public,reason,record,relations,returns,reveals,robert,runway,sanu,scandal,sell,service,seventh,shares,shows,singer,slumping,stake,stocks,strained,super,than,that,the,theatrical,their,they,three,threw,time,title,to,top,touchdowns,traded,trying,two,untrendy,vanquish,vinatieri,visiting,was,wear,website,which,while,whom,will,win,with,world,yards,years,york
0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,2,0,0,0,1,1,1,0,2,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,1,2,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0
6,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,4,0,1,0,0,0,1,0,2,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0
7,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,2,0,0,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,3,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [9]:
# print out 10 most common words in our data
df.sum().sort_values()[-10:]

his       3
time      3
new       4
for       5
of        5
bowl      7
super     7
and       9
to       10
the      23
dtype: int64

In [10]:
# print out 10 least common words in our data
df.sum().sort_values()[:10] # small sample size means most words occur one time

10th           1
players        1
plugging       1
politics       1
predictions    1
prices         1
public         1
pittsburgh     1
reason         1
relations      1
dtype: int64

# TF-IDF Conversion
We have a very small sample of data, but lets covert to tf-idf for the sake of programming it. Recall that Tf-idf transformation (default in `sklearn` is):

$$ \text{tf}(t,d) = f_{td}\text{, } t\in T \text{ and } d \in D $$

$$ \text{idf}(t,d) = \log{\frac{|D|}{|n_t|}}\text{, where } n_t=d\in D \text{ with } t\in d $$

$$\text{tf-idf}(t,d)=\text{tf}(t,d) \cdot (1+\text{idf}(t,d))$$

- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer() # an object capable of counting words in a document!

tfidf_mat = tfidf_vect.fit_transform(summary_text) # that's it! its converted!!

In [12]:
# convert to pandas to get better idea about the data
df = pd.DataFrame(data=tfidf_mat.toarray(),columns=tfidf_vect.get_feature_names())
df

Unnamed: 0,10th,384,able,actor,adam,advanced,after,ahead,among,an,and,anniversary,appearance,are,asked,at,attend,avoiding,be,becoming,behind,being,bet,between,bowl,brady,bump,by,championship,city,claim,class,close,cold,connecticut,could,deflategate,denied,destination,directly,discuss,england,exchange,falcons,fans,first,football,for,from,game,get,giants,handicappers,hard,has,have,heart,hearts,helps,her,him,his,hit,hopes,hosting,if,in,indianapolis,inescapable,infrastructure,is,it,its,kraft,lead,league,life,look,made,mathis,men,might,minds,more,new,night,ninth,not,of,offense,oil,on,overhauled,parks,patriots,performance,pittsburgh,players,plugging,politics,predictions,prices,public,reason,record,relations,returns,reveals,robert,runway,sanu,scandal,sell,service,seventh,shares,shows,singer,slumping,stake,stocks,strained,super,than,that,the,theatrical,their,they,three,threw,time,title,to,top,touchdowns,traded,trying,two,untrendy,vanquish,vinatieri,visiting,was,wear,website,which,while,whom,will,win,with,world,yards,years,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.231708,0.0,0.0,0.0,0.112948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172328,0.0,0.0,0.0,0.0,0.231708,0.112948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231708,0.0,0.0,0.0,0.0,0.172328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231708,0.0,0.0,0.231708,0.0,0.0,0.0,0.0,0.231708,0.0,0.172328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231708,0.231708,0.0,0.196973,0.0,0.0,0.231708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.231708,0.112948,0.0,0.0,0.342669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.112948,0.0,0.0,0.0,0.0,0.231708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.172328,0.0,0.0,0.0,0.0,0.231708,0.0
1,0.0,0.240207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.234182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117091,0.240207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178649,0.0,0.0,0.0,0.0,0.0,0.317663,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.178649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117091,0.0,0.0,0.177619,0.0,0.0,0.0,0.240207,0.240207,0.178649,0.0,0.234182,0.0,0.240207,0.0,0.0,0.0,0.0,0.240207,0.0,0.240207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240207,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220205,0.220205,0.107341,0.0,0.220205,0.187194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107341,0.0,0.220205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.145606,0.220205,0.0,0.220205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.374389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.187194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.220205,0.0,0.107341,0.0,0.0,0.081414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.220205,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224831,0.0,0.224831,0.191127,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.0,0.191127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.224831,0.0,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.224831,0.0,0.0,0.224831,0.224831,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.167213,0.166249,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.109596,0.0,0.0,0.0,0.0,0.0,0.224831,0.0,0.0,0.0,0.0,0.224831,0.224831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254123,0.0,0.0,0.0,0.0,0.0,0.0,0.254123,0.254123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.123875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254123,0.0,0.0,0.254123,0.0,0.0,0.0,0.0,0.0,0.254123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216028,0.0,0.0,0.0,0.0,0.254123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216028,0.0,0.0,0.0,0.188999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.188999,0.168033,0.0,0.0,0.0,0.0,0.0,0.0,0.254123,0.0,0.0,0.0,0.254123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254123,0.0,0.0,0.0,0.0,0.123875,0.0,0.188999,0.281863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.123875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102687,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.210659,0.0,0.210659,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.179079,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.210659,0.0,0.0,0.210659,0.0,0.0,0.210659,0.0,0.0,0.210659,0.0,0.0,0.0,0.210659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.0,0.139293,0.0,0.210659,0.0,0.210659,0.210659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.0,0.0,0.0,0.0,0.0,0.077885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.0,0.210659,0.0,0.0,0.0,0.0,0.0,0.210659,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.22369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166365,0.0,0.22369,0.0,0.0,0.0,0.109039,0.0,0.0,0.0,0.22369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22369,0.0,0.22369,0.0,0.14791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22369,0.0,0.0,0.22369,0.0,0.0,0.0,0.0,0.190157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109039,0.0,0.0,0.33081,0.0,0.22369,0.0,0.0,0.0,0.166365,0.0,0.218079,0.22369,0.0,0.0,0.22369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22369,0.0,0.166365,0.22369,0.0,0.0,0.0,0.0,0.0
7,0.266164,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.129744,0.266164,0.0,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197954,0.175995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129744,0.0,0.0,0.098406,0.0,0.0,0.266164,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197954,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.205145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.152572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205145,0.0,0.205145,0.0,0.205145,0.0,0.0,0.0,0.0,0.0,0.0,0.152572,0.0,0.0,0.205145,0.0,0.0,0.0,0.0,0.0,0.0,0.205145,0.0,0.0,0.0,0.0,0.0,0.205145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205145,0.0,0.0,0.0,0.205145,0.205145,0.205145,0.305144,0.0,0.0,0.0,0.271295,0.0,0.0,0.0,0.0,0.0,0.174392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205145,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205145,0.0,0.0,0.0,0.205145,0.152572,0.227538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205145
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258726,0.0,0.0,0.2256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265383,0.265383,0.0,0.129363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.175479,0.0,0.0,0.0,0.0,0.265383,0.0,0.0,0.0,0.265383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2256,0.0,0.0,0.0,0.0,0.0,0.0,0.265383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265383,0.0,0.0,0.265383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.129363,0.0,0.0,0.196235,0.0,0.0,0.0,0.0,0.0,0.197373,0.0,0.129363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265383,0.0,0.0,0.0,0.0


In [13]:
# print out 10 words with max tfidf, normalized by document occurrence
df.max().sort_values()[-10:]

anniversary    0.266164
they           0.266164
adam           0.266164
have           0.266164
10th           0.266164
of             0.271295
new            0.305144
for            0.317663
the            0.342669
his            0.374389
dtype: float64

# Working with (a bit) more data
What if we do not have the memory to deal with dense matrix representatioan and we need to keep it sparse?


In [14]:
from sklearn.datasets import fetch_20newsgroups
bunch = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

the `bunch` object returned from sklearn is similar to a python dictionary. We can access different fields of the object with keys.

In [15]:
print(bunch.data[0]) # we should split this up by newlines

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [18]:
import numpy as np
idx = round(np.random.rand()*len(bunch.data))
print("\n".join(bunch.data[idx].split("\n")))

From: rcomg@melomys.co.rmit.oz.AU (Mark Gregory)
Subject: AVI file format?
Summary: AVI file format?
Keywords: AVI file format?
Organization: Royal Melbourne Institute of Technology
Lines: 18
NNTP-Posting-Host: melomys.cse.rmit.edu.au


Hi,
	would someone please email the new AVI file
	format.  I'm sure that many people would 
like to know what it is exactly.

Thank you


Mark Gregory Lecturer m.gregory@rmit.edu.au PH(03)6603243 FAX(03)6621060
Royal Melbourne Institute of Technology,
Department of Communication and Electronic Engineering,
P.O. Box 2476V, Melbourne, Victoria, 3001. AUSTRALIA.
--
Mark Gregory Lecturer m.gregory@rmit.edu.au PH(03)6603243 FAX(03)6621060
Royal Melbourne Institute of Technology,
Department of Communication and Electronic Engineering,
P.O. Box 2476V, Melbourne, Victoria, 3001. AUSTRALIA.



In [19]:
%%time
news_tfidf = tfidf_vect.fit_transform(bunch.data) 

CPU times: user 3.54 s, sys: 104 ms, total: 3.64 s
Wall time: 3.66 s


In [20]:
news_tfidf.shape

(11314, 130107)

In [21]:
tfidf_vect.vocabulary_

{'phased': 93466,
 'pdes': 92639,
 'iivy8y0': 66020,
 'incompleteness': 66754,
 'contaminating': 42244,
 'physiotherapy': 93719,
 'decribe': 45751,
 '3178': 11608,
 '47719': 13972,
 'moria': 83731,
 'fs2': 57059,
 'u9ut3': 117651,
 'personal': 93163,
 'pergamon': 93011,
 'disappointed': 47494,
 'decongestants': 45725,
 'eastlake': 50161,
 'kahramanov': 71659,
 'asq': 29797,
 '34jn': 12008,
 'aquinas': 29084,
 'jruisdiction': 70814,
 'makefiles': 79058,
 'dib': 47083,
 '150031': 4610,
 'fluctuations': 55970,
 'hidden': 63126,
 'youd': 128403,
 'peat': 92709,
 '9r9': 22907,
 'hrs': 64433,
 'cbda9': 38272,
 'irwinn': 68525,
 'internationals': 67878,
 '6stcnz': 17975,
 'katsuru': 71910,
 '_2y_q8': 23166,
 'drive': 49047,
 '14261': 4271,
 '3114': 11558,
 'teltech': 113997,
 'hiker': 63203,
 'smegging': 108302,
 'prosecutor': 96459,
 'buttons': 35845,
 'leec': 75124,
 'ust3p': 119829,
 'lff3t': 75523,
 'wo88vg': 124857,
 '_supernatural_': 24433,
 '0i': 1815,
 'skwf': 107914,
 'distrophy': 47

In [22]:
# create pandas dataframe
vec = news_tfidf.max(axis=0)
df  = pd.DataFrame(data=vec.toarray(),columns=tfidf_vect.get_feature_names())

In [23]:
# largest tfidf 
df.max().sort_values()[-10:]

kk          0.870294
db          0.871473
scsi        0.875086
blah        0.879426
donoghue    0.891653
00          0.907726
___         0.908826
25          0.913127
forged      0.940511
ax          0.998314
dtype: float64

In [24]:
# now lets do the transformation with a smaller vocabulary
tfidf_vect = TfidfVectorizer(stop_words='english',
                             max_df=0.01,
                             min_df=4)
news_tfidf = tfidf_vect.fit_transform(bunch.data) 
print(news_tfidf.shape)
vec=news_tfidf.max(axis=0)
df = pd.DataFrame(data=vec.toarray(),columns=tfidf_vect.get_feature_names())
df.max().sort_values()[-10:]

(11314, 28592)


dialix            0.947270
blah              0.952881
ualberta          0.956563
stephanopoulos    0.959855
forged            0.971300
mufti             0.976947
ax                0.999881
meyers            1.000000
slower            1.000000
ucsd              1.000000
dtype: float64

# Using your own vocabulary

In [25]:
# read in scrabble dictionary from file
with open('data/ospd.txt') as f:
    vocab = f.read().split('\n')
    
# now lets do the transformation with a custom vocabulary
tfidf_vect = TfidfVectorizer(vocabulary=vocab)
news_tfidf = tfidf_vect.fit_transform(bunch.data) 
print(news_tfidf.shape)
vec=news_tfidf.max(axis=0)
df = pd.DataFrame(data=vec.toarray(),columns=tfidf_vect.get_feature_names())
df.max().sort_values()[-10:]

(11314, 79340)


incoming    0.925305
siemens     0.927114
water       0.928029
echo        0.947289
blah        0.951962
dos         0.953675
lib         0.954914
forged      0.978762
la          0.982737
ax          0.999999
dtype: float64