# Document Analysis in Python
In this notebook we will cover:
- Reading document data into memory
- Creating bag of words features
- Creating smoothed tf-idf features

In [1]:
import requests
import json
from contextlib import closing

# get API key saved on hardrive
with open('../NYTimesAPI.txt') as f:
    api_key = f.read() # read in my private key (sorry, not in this repo ¯\_(ツ)_/¯ )
    
# make base URL and dictionary of get request key/values
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
payload = {'api-key': api_key, 'q':'Shutdown'} # key/values for get request (look up in api, there are lots)

# Perform the actual request
with closing(requests.get(url, params=payload)) as r:
    articles = r.json()
    
articles

{'fault': {'faultstring': 'Invalid ApiKey',
  'detail': {'errorcode': 'oauth.v2.InvalidApiKey'}}}

In [2]:
# OR we can load an example query like this
# run this block of code if you can't run anything else
import json 
with open('data/nytime.json') as f:
    articles = json.loads(f.read())
    
articles

{'response': {'meta': {'hits': 66110, 'time': 321, 'offset': 0},
  'docs': [{'web_url': 'http://www.nytimes.com/2016/07/24/books/review/the-games-a-global-history-of-the-olympics-david-goldblatt.html',
    'snippet': 'David Goldblatt’s “The Games” recalls unflattering aspects of the Olympics long before doping and gender testing.',
    'lead_paragraph': 'David Goldblatt’s “The Games” recalls unflattering aspects of the Olympics long before doping and gender testing.',
    'abstract': 'Mary Pilon reviews book The Games: A Global History of the Olympics by David Goldblatt.',
    'print_page': '11',
    'blog': [],
    'source': 'The New York Times',
    'multimedia': [{'width': 190,
      'url': 'images/2016/07/24/books/review/24PILON1/24PILON1-thumbWide.jpg',
      'height': 126,
      'subtype': 'wide',
      'legacy': {'wide': 'images/2016/07/24/books/review/24PILON1/24PILON1-thumbWide.jpg',
       'wideheight': '126',
       'widewidth': '190'},
      'type': 'image'},
     {'width':

In [3]:
# get the summary text from each article
#  lead_paragraph  is no snippet
summary_text = [x['snippet'] for x in articles['response']['docs']]
summary_text

['David Goldblatt’s “The Games” recalls unflattering aspects of the Olympics long before doping and gender testing.',
 'Penalizing a country for doping seems to be a much more effective way to ensure rapid and enduring change.',
 'Loroupe, who won the New York City Marathon in 1994 and 1995, is the leader of the 10 displaced athletes who make up the Refugee Olympic Team.',
 'It is time to focus on the folks going to Rio without fear or complaint — like Diana Taurasi and Sue Bird, who hope to win their fourth gold with the United States women’s basketball team.',
 'Fans may want athletes to be drug free, but they also want to be entertained by raw power, and there has long been a wink-and-nod pragmatism about weight lifting in particular.',
 'By deciding against a complete ban of Russian athletes from the Rio Games, the International Olympic Committee is sending mixed messages.',
 'How did The New York Times prepare for its Olympics coverage? Here’s a quantitative look.',
 'A real-time 

# Converting document data into different representations
First lets go through and count the unique words in each opening sentence (that is what the NYTimes give us for free).
- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer() # an object capable of counting words in a document!

# count_vect.fit(summary_text)
# count_vect.transform(summary_text)
bag_words = count_vect.fit_transform(summary_text)

In [5]:
print(bag_words.shape) # this is a sparse matrix
print('=========')
print(bag_words[0])

(10, 161)
  (0, 137)	1
  (0, 60)	1
  (0, 10)	1
  (0, 40)	1
  (0, 18)	1
  (0, 84)	1
  (0, 101)	1
  (0, 98)	1
  (0, 11)	1
  (0, 145)	1
  (0, 120)	1
  (0, 58)	1
  (0, 138)	2
  (0, 64)	1
  (0, 34)	1


# Self Test: ML01b.3: 
Do you expect the vocabulary from the articles above to be:
- A. Greater than 1M words
- B. Greater than 10,000 words
- C. Fewer than 10,000 words

In [6]:
print(len(count_vect.vocabulary_))
print(count_vect.vocabulary_)

161
{'david': 34, 'goldblatt': 64, 'the': 138, 'games': 58, 'recalls': 120, 'unflattering': 145, 'aspects': 11, 'of': 98, 'olympics': 101, 'long': 84, 'before': 18, 'doping': 40, 'and': 10, 'gender': 60, 'testing': 137, 'penalizing': 107, 'country': 32, 'for': 52, 'seems': 126, 'to': 144, 'be': 16, 'much': 95, 'more': 94, 'effective': 42, 'way': 149, 'ensure': 44, 'rapid': 117, 'enduring': 43, 'change': 24, 'loroupe': 86, 'who': 152, 'won': 158, 'new': 96, 'york': 160, 'city': 26, 'marathon': 88, 'in': 73, '1994': 1, '1995': 2, 'is': 75, 'leader': 80, '10': 0, 'displaced': 38, 'athletes': 13, 'make': 87, 'up': 147, 'refugee': 121, 'olympic': 100, 'team': 136, 'it': 76, 'time': 142, 'focus': 50, 'on': 102, 'folks': 51, 'going': 62, 'rio': 124, 'without': 156, 'fear': 49, 'or': 103, 'complaint': 29, 'like': 83, 'diana': 36, 'taurasi': 135, 'sue': 132, 'bird': 20, 'hope': 70, 'win': 153, 'their': 139, 'fourth': 54, 'gold': 63, 'with': 155, 'united': 146, 'states': 130, 'women': 157, 'bask

In [7]:
# we can still look at the data using an inverse transform
# but we lose the ordering of the words (after all its just a bag of wrods model)
count_vect.inverse_transform(bag_words[0])

[array(['testing', 'gender', 'and', 'doping', 'before', 'long', 'olympics',
        'of', 'aspects', 'unflattering', 'recalls', 'games', 'the',
        'goldblatt', 'david'], dtype='<U13')]

In [8]:
# now let's create a pandas API out of this
import pandas as pd

pd.options.display.max_columns = 999
df = pd.DataFrame(data=bag_words.toarray(),columns=count_vect.get_feature_names())

In [9]:
df # display the full bag of words matrix  

Unnamed: 0,10,1994,1995,2016,about,account,against,agence,agency,also,and,aspects,associated,athletes,ban,basketball,be,been,before,bid,bird,boston,but,by,change,citizens,city,comeback,committee,complaint,complete,countries,country,coverage,david,deciding,diana,did,displaced,dominated,doping,drug,effective,enduring,ensure,entertained,essay,european,fans,fear,focus,folks,for,fought,fourth,france,free,from,games,gatlin,gender,getty,going,gold,goldblatt,has,have,here,heroes,history,hope,how,images,in,international,is,it,its,justin,katie,leader,ledecky,lifting,like,long,look,loroupe,make,marathon,may,memories,messages,mixed,modern,more,much,new,nod,of,olympians,olympic,olympics,on,or,particular,past,peerless,penalizing,photo,photographers,power,pragmatism,prepare,press,presse,pressphoto,quantitative,rapid,raw,real,recalls,refugee,reject,reuters,rio,russian,seems,sending,snapshot,sprinter,states,stream,sue,summer,swimmer,taurasi,team,testing,the,their,there,they,time,times,to,unflattering,united,up,want,way,weight,which,who,win,wink,with,without,women,won,writers,york
0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,4,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,1,0,2,1,0,0,1,0,3,0,1,0,0,0,0,0,1,1,0,1,1,1,0,0,0
4,0,0,0,0,1,0,0,0,0,1,2,0,0,1,0,0,2,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,2,0,0,0,2,0,1,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
7,0,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,3,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
8,1,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2,1,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [10]:
# print out 10 most common words in our data
df.sum().sort_values()[-10:]

for          3
who          3
by           3
olympics     4
is           4
their        4
of           8
to           8
and          9
the         18
dtype: int64

In [11]:
# print out 10 least common words in our data
df.sum().sort_values()[:10] # small sample size means most words occur one time

leader      1
nod         1
they        1
much        1
more        1
modern      1
mixed       1
messages    1
memories    1
may         1
dtype: int64

# TF-IDF Conversion
We have a very small sample of data, but lets covert to tf-idf for the sake of programming it. Recall that Tf-idf transformation (default in `sklearn` is):

$$ \text{tf}(t,d) = f_{td}\text{, } t\in T \text{ and } d \in D $$

$$ \text{idf}(t,d) = \log{\frac{|D|}{|n_t|}}\text{, where } n_t=d\in D \text{ with } t\in d $$

$$\text{tf-idf}(t,d)=\text{tf}(t,d) \cdot (1+\text{idf}(t,d))$$

- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer() # an object capable of counting words in a document!

tfidf_mat = tfidf_vect.fit_transform(summary_text) # that's it! its converted!!

In [13]:
# convert to pandas to get better idea about the data
df = pd.DataFrame(data=tfidf_mat.toarray(),columns=tfidf_vect.get_feature_names())
df

Unnamed: 0,10,1994,1995,2016,about,account,against,agence,agency,also,and,aspects,associated,athletes,ban,basketball,be,been,before,bid,bird,boston,but,by,change,citizens,city,comeback,committee,complaint,complete,countries,country,coverage,david,deciding,diana,did,displaced,dominated,doping,drug,effective,enduring,ensure,entertained,essay,european,fans,fear,focus,folks,for,fought,fourth,france,free,from,games,gatlin,gender,getty,going,gold,goldblatt,has,have,here,heroes,history,hope,how,images,in,international,is,it,its,justin,katie,leader,ledecky,lifting,like,long,look,loroupe,make,marathon,may,memories,messages,mixed,modern,more,much,new,nod,of,olympians,olympic,olympics,on,or,particular,past,peerless,penalizing,photo,photographers,power,pragmatism,prepare,press,presse,pressphoto,quantitative,rapid,raw,real,recalls,refugee,reject,reuters,rio,russian,seems,sending,snapshot,sprinter,states,stream,sue,summer,swimmer,taurasi,team,testing,the,their,there,they,time,times,to,unflattering,united,up,want,way,weight,which,who,win,wink,with,without,women,won,writers,york
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142208,0.291734,0.0,0.0,0.0,0.0,0.0,0.0,0.291734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291734,0.0,0.0,0.0,0.0,0.0,0.248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.216971,0.0,0.291734,0.0,0.0,0.0,0.291734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156611,0.0,0.0,0.192903,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291734,0.259008,0.0,0.0,0.0,0.0,0.0,0.0,0.291734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125925,0.0,0.0,0.0,0.0,0.0,0.219603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.219603,0.0,0.258328,0.258328,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.192127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258328,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.341628,0.0,0.0,0.0,0.0,0.258328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.180101,0.21186,0.21186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103273,0.0,0.0,0.157567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180101,0.0,0.140088,0.0,0.0,0.0,0.0,0.21186,0.0,0.0,0.0,0.0,0.0,0.21186,0.21186,0.21186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.157567,0.0,0.113733,0.0,0.180101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.180101,0.0,0.37619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21186,0.0,0.0,0.0,0.0,0.360201,0.0,0.0,0.0,0.0,0.0,0.21186,0.0,0.157567
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086764,0.0,0.0,0.0,0.0,0.177992,0.0,0.0,0.0,0.0,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177992,0.177992,0.177992,0.0,0.0,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177992,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.177992,0.0,0.0,0.0,0.0,0.117693,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.151309,0.177992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.132378,0.0,0.0,0.0,0.0,0.0,0.177992,0.0,0.177992,0.0,0.0,0.177992,0.151309,0.0,0.158026,0.151309,0.0,0.0,0.151309,0.0,0.35308,0.0,0.177992,0.0,0.0,0.0,0.0,0.0,0.151309,0.177992,0.0,0.177992,0.177992,0.177992,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.150596,0.17271,0.0,0.0,0.131754,0.0,0.0,0.301192,0.177153,0.0,0.0,0.0,0.0,0.177153,0.131754,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.177153,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.150596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.0,0.150596,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.177153,0.177153,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.177153,0.177153,0.0,0.0,0.234277,0.0,0.0,0.0,0.354306,0.0,0.177153,0.0,0.0,0.0,0.177153,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.256736,0.0,0.0,0.0,0.0,0.0,0.0,0.190942,0.256736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190942,0.0,0.0,0.0,0.0,0.256736,0.0,0.256736,0.0,0.0,0.0,0.0,0.256736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218249,0.190942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256736,0.169761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256736,0.256736,0.0,0.0,0.0,0.0,0.0,0.137823,0.0,0.218249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190942,0.256736,0.0,0.256736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.227936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305153,0.0,0.0,0.0,0.305153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305153,0.0,0.0,0.0,0.259408,0.0,0.0,0.0,0.0,0.0,0.305153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226952,0.0,0.0,0.0,0.0,0.201776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.305153,0.0,0.0,0.0,0.305153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135461,0.0,0.0,0.0,0.0,0.259408,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226952
7,0.0,0.0,0.0,0.197372,0.0,0.0,0.0,0.197372,0.197372,0.0,0.096211,0.0,0.197372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146791,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197372,0.0,0.0,0.0,0.0,0.146791,0.0,0.0,0.197372,0.0,0.167784,0.0,0.0,0.0,0.197372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.394744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146791,0.0,0.105955,0.0,0.0,0.130508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.197372,0.0,0.0,0.0,0.197372,0.197372,0.197372,0.0,0.0,0.0,0.197372,0.0,0.0,0.0,0.197372,0.146791,0.0,0.0,0.0,0.0,0.0,0.0,0.197372,0.0,0.0,0.0,0.0,0.0,0.0,0.262848,0.0,0.0,0.0,0.167784,0.167784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146791
8,0.147366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.169005,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0,0.173353,0.0,0.0,0.0,0.173353,0.147366,0.173353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0,0.0,0.0,0.0,0.128928,0.173353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0,0.0,0.147366,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.173353,0.0,0.173353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.186122,0.173353,0.0,0.0,0.147366,0.0,0.0,0.173353,0.173353,0.0,0.173353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0,0.0,0.0,0.0,0.173353,0.0,0.0,0.0,0.153907,0.442099,0.0,0.0,0.0,0.0,0.114626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.173353,0.0
9,0.0,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.0,0.240077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.282413,0.0,0.0,0.0,0.0,0.0,0.18674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.0,0.0,0.303214,0.0,0.0,0.18674,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.0,0.0,0.250733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# print out 10 words with max tfidf, normalized by document occurrence
df.max().sort_values()[-10:]

quantitative    0.305153
prepare         0.305153
did             0.305153
coverage        0.305153
to              0.353080
want            0.354306
who             0.360201
the             0.376190
images          0.394744
their           0.442099
dtype: float64

# Working with (a bit) more data
What if we do not have the memory to deal with dense matrix representatioan and we need to keep it sparse?


In [15]:
from sklearn.datasets import fetch_20newsgroups
bunch = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

the `bunch` object returned from sklearn is similar to a python dictionary. We can access different fields of the object with keys.

In [16]:
len(bunch.data)

11314

In [17]:
print(bunch.data[0]) # we should split this up by newlines

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [18]:
bunch.target_names[bunch.target[0]]

'rec.autos'

In [20]:
import numpy as np
#randomly grad an index
idx = round(np.random.rand()*len(bunch.data))
# and print it pretty
print("\n".join(bunch.data[idx].split("\n")))

From: andrew.payne@hal9k.ann-arbor.mi.us (Andrew Payne) 
Subject: WANTED:  TCM3105 chips, small quantities
Distribution: world
Organization: HAL 9000 BBS, W-NET HQ, Ann Arbor, Michigan, USA
Reply-To: andrew.payne@hal9k.ann-arbor.mi.us (Andrew Payne) 
Keywords: rec mod
Summary: Reposted by Keith Petersen
Lines: 29

From: payne@crl.dec.com (Andrew Payne)
Message-ID: <1993Apr20.004418.11548@crl.dec.com>
Organization: DEC Cambridge Research Lab
Date: Tue, 20 Apr 1993 00:44:18 GMT


Does anyone know if a source for the TCM3105 modem chips (as used in the
Baycom and my PMP modems)?  Ideally, something that is geared toward 
hobbyists:  small quantity, mail order, etc.

For years, we've been buying them from a distributor (Marshall) by the
hundreds for PMP kits.  But orders have dropped to the point where we can
no longer afford to offer this service.  And all of the distributors I've
checked have some crazy minimum order ($100, or so).

I'd like to find a source for those still interested in

In [21]:
%%time
news_tfidf = tfidf_vect.fit_transform(bunch.data) 

CPU times: user 3.49 s, sys: 141 ms, total: 3.63 s
Wall time: 3.67 s


In [22]:
news_tfidf.shape

(11314, 130107)

In [23]:
tfidf_vect.vocabulary_

{'from': 56979,
 'lerxst': 75358,
 'wam': 123162,
 'umd': 118280,
 'edu': 50527,
 'where': 124031,
 'my': 85354,
 'thing': 114688,
 'subject': 111322,
 'what': 123984,
 'car': 37780,
 'is': 68532,
 'this': 114731,
 'nntp': 87620,
 'posting': 95162,
 'host': 64095,
 'rac3': 98949,
 'organization': 90379,
 'university': 118983,
 'of': 89362,
 'maryland': 79666,
 'college': 40998,
 'park': 92081,
 'lines': 76032,
 '15': 4605,
 'was': 123292,
 'wondering': 124931,
 'if': 65798,
 'anyone': 28615,
 'out': 90774,
 'there': 114579,
 'could': 42876,
 'enlighten': 51793,
 'me': 80638,
 'on': 89860,
 'saw': 104813,
 'the': 114455,
 'other': 90686,
 'day': 45295,
 'it': 68766,
 'door': 48618,
 'sports': 109581,
 'looked': 76718,
 'to': 115475,
 'be': 32311,
 'late': 74693,
 '60s': 16574,
 'early': 50111,
 '70s': 18299,
 'called': 37433,
 'bricklin': 34995,
 'doors': 48620,
 'were': 123796,
 'really': 99822,
 'small': 108252,
 'in': 66608,
 'addition': 26073,
 'front': 56989,
 'bumper': 35612,
 'se

In [24]:
# create pandas dataframe
vec = news_tfidf.max(axis=0)
df  = pd.DataFrame(data=vec.toarray(),columns=tfidf_vect.get_feature_names())

In [25]:
# largest tfidf 
df.max().sort_values()[-10:]

kk          0.870294
db          0.871473
scsi        0.875086
blah        0.879426
donoghue    0.891653
00          0.907726
___         0.908826
25          0.913127
forged      0.940511
ax          0.998314
dtype: float64

In [26]:
# now lets do the transformation with a smaller vocabulary
tfidf_vect = TfidfVectorizer(stop_words='english',
                             max_df=0.01,
                             min_df=4)
news_tfidf = tfidf_vect.fit_transform(bunch.data) 
print(news_tfidf.shape)
vec=news_tfidf.max(axis=0)
df = pd.DataFrame(data=vec.toarray(),columns=tfidf_vect.get_feature_names())
df.max().sort_values()[-10:]

(11314, 28592)


dialix            0.947270
blah              0.952881
ualberta          0.956563
stephanopoulos    0.959855
forged            0.971300
mufti             0.976947
ax                0.999881
meyers            1.000000
slower            1.000000
ucsd              1.000000
dtype: float64

# Using your own vocabulary

In [27]:
# read in scrabble dictionary from file
with open('data/ospd.txt') as f:
    vocab = f.read().split('\n')
    
# now lets do the transformation with a custom vocabulary
tfidf_vect = TfidfVectorizer(vocabulary=vocab)
news_tfidf = tfidf_vect.fit_transform(bunch.data) 
print(news_tfidf.shape)
vec=news_tfidf.max(axis=0)
df = pd.DataFrame(data=vec.toarray(),columns=tfidf_vect.get_feature_names())
df.max().sort_values()[-10:]

(11314, 79340)


incoming    0.925305
siemens     0.927114
water       0.928029
echo        0.947289
blah        0.951962
dos         0.953675
lib         0.954914
forged      0.978762
la          0.982737
ax          0.999999
dtype: float64

Looking for how to do a word cloud? Check this out:
- https://github.com/amueller/word_cloud

Want to perform more serious NLP with richer options:
- http://www.nltk.org