# Document Analysis in Python
In this notebook we will cover:
- Reading document data into memory
- Creating bag of words features
- Creating smoothed tf-idf features

In [1]:
import requests
import json
from contextlib import closing

# get API key saved on hard drive
with open('../NYTimesAPI.txt') as f:
    # read in my private key (sorry, not in this repo ¯\_(ツ)_/¯ )
    # strip() drops the trailing newline most editors append; left in place it
    # would be sent as part of the key and the request would be rejected
    api_key = f.read().strip()

# make base URL and dictionary of get request key/values
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
payload = {'api-key': api_key, 'q':'cohen'} # key/values for get request (look up in api, there are lots)

# Perform the actual request
# timeout keeps the cell from hanging forever if the service does not answer
with closing(requests.get(url, params=payload, timeout=30)) as r:
    # fail loudly on HTTP errors (bad key, rate limit, ...) instead of parsing an error body
    r.raise_for_status()
    articles = r.json()

articles

{'status': 'OK',
 'copyright': 'Copyright (c) 2018 The New York Times Company. All Rights Reserved.',
 'response': {'docs': [{'web_url': 'https://www.nytimes.com/2018/08/02/movies/king-cohen-review-larry-cohen.html',
    'snippet': 'This documentary about Larry Cohen, who made films like “It’s Alive” and “The Private Files of J. Edgar Hoover,” is full of lively anecdotes.',
    'print_page': '6',
    'blog': {},
    'source': 'The New York Times',
    'multimedia': [{'rank': 0,
      'subtype': 'xlarge',
      'caption': None,
      'credit': None,
      'type': 'image',
      'url': 'images/2018/08/03/arts/03kingcohen/merlin_141821214_c583d60e-d25d-43d8-8939-d2cdc8a5cf5e-articleLarge.jpg',
      'height': 337,
      'width': 600,
      'legacy': {'xlarge': 'images/2018/08/03/arts/03kingcohen/merlin_141821214_c583d60e-d25d-43d8-8939-d2cdc8a5cf5e-articleLarge.jpg',
       'xlargewidth': 600,
       'xlargeheight': 337},
      'subType': 'xlarge',
      'crop_name': 'articleLarge'},
    

In [None]:
# Offline alternative: load a previously saved example query from disk.
# Run this cell if you cannot run the live API request.
import json

with open('data/nytime.json') as saved_query:
    # parse the JSON document straight from the open file handle
    articles = json.load(saved_query)

articles

In [2]:
# collect the 'snippet' text of every article in the response
# NOTE(review): the earlier note here read "lead_paragraph is no snippet" --
# presumably 'snippet' is used in place of 'lead_paragraph'; confirm against the API docs
summary_text = [
    doc['snippet']
    for doc in articles['response']['docs']
]
summary_text

['This documentary about Larry Cohen, who made films like “It’s Alive” and “The Private Files of J. Edgar Hoover,” is full of lively anecdotes.',
 '<div class="vcard">\n<p><span class="fn">Roger Cohen</span> joined <span class="org">The New York Times</span> in 1990. He was a foreign correspondent for more than a decade before becoming acting foreign editor on Sept. 11, 2001, and foreign edito...',
 'Are we paying attention to the right story lines?',
 'Recent and archived news articles by Joyce Cohen of The New York Times.',
 'Quick answers to your questions about mistrials, pardons, impeachment and more.',
 'Recent and archived news articles by Patricia Cohen of The New York Times.',
 'Who knows, the once-cocky fixer, now humbled, could find himself a star witness at hearings on impeachment of our 45th president. ',
 '<p>Recent and archived news articles by Sarah Cohen of The New York Times.</p>',
 'Recent and archived news articles by Noam Cohen of The New York Times.',
 'Recent and

# Converting document data into different representations
First, let's go through and count the unique words in each article snippet (that is what the NYTimes gives us for free).
- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# an object capable of counting words in a document!
count_vect = CountVectorizer()

# learn the vocabulary and build the count matrix in a single call;
# the two-step form would be:
#   count_vect.fit(summary_text)
#   count_vect.transform(summary_text)
bag_words = count_vect.fit_transform(summary_text)

In [4]:
# bag_words is a sparse matrix: print its dimensions, a divider,
# and then the stored entries of the first row
print(bag_words.shape, '=========', bag_words[0], sep='\n')

(10, 93)
  (0, 9)	1
  (0, 53)	1
  (0, 37)	1
  (0, 45)	1
  (0, 41)	1
  (0, 27)	1
  (0, 61)	2
  (0, 30)	1
  (0, 70)	1
  (0, 82)	1
  (0, 8)	1
  (0, 7)	1
  (0, 46)	1
  (0, 51)	1
  (0, 31)	1
  (0, 54)	1
  (0, 89)	1
  (0, 21)	1
  (0, 50)	1
  (0, 4)	1
  (0, 26)	1
  (0, 83)	1


# Self Test: ML01b.3: 
Do you expect the vocabulary from the articles above to be:
- A. Greater than 1M words
- B. Greater than 10,000 words
- C. Fewer than 10,000 words

In [5]:
# vocabulary size first, then the full word -> column-index mapping
print(len(count_vect.vocabulary_), count_vect.vocabulary_, sep='\n')

93
{'this': 83, 'documentary': 26, 'about': 4, 'larry': 50, 'cohen': 21, 'who': 89, 'made': 54, 'films': 31, 'like': 51, 'it': 46, 'alive': 7, 'and': 8, 'the': 82, 'private': 70, 'files': 30, 'of': 61, 'edgar': 27, 'hoover': 41, 'is': 45, 'full': 37, 'lively': 53, 'anecdotes': 9, 'div': 25, 'class': 19, 'vcard': 86, 'span': 78, 'fn': 34, 'roger': 75, 'joined': 47, 'org': 64, 'new': 57, 'york': 91, 'times': 84, 'in': 44, '1990': 1, 'he': 38, 'was': 87, 'foreign': 36, 'correspondent': 22, 'for': 35, 'more': 56, 'than': 81, 'decade': 24, 'before': 17, 'becoming': 16, 'acting': 5, 'editor': 29, 'on': 62, 'sept': 77, '11': 0, '2001': 2, 'edito': 28, 'are': 12, 'we': 88, 'paying': 68, 'attention': 15, 'to': 85, 'right': 74, 'story': 80, 'lines': 52, 'recent': 73, 'archived': 11, 'news': 58, 'articles': 13, 'by': 18, 'joyce': 48, 'quick': 72, 'answers': 10, 'your': 92, 'questions': 71, 'mistrials': 55, 'pardons': 66, 'impeachment': 43, 'patricia': 67, 'knows': 49, 'once': 63, 'cocky': 20, 'fi

In [6]:
# we can still look at the data using an inverse transform,
# but we lose the ordering of the words (after all, it's just a bag-of-words model)
count_vect.inverse_transform(bag_words[0])

[array(['anecdotes', 'lively', 'full', 'is', 'hoover', 'edgar', 'of',
        'files', 'private', 'the', 'and', 'alive', 'it', 'like', 'films',
        'made', 'who', 'cohen', 'larry', 'about', 'documentary', 'this'],
       dtype='<U13')]

In [7]:
# now let's create a pandas DataFrame out of this
import pandas as pd

# show every column when the frame is displayed
pd.options.display.max_columns = 999
# toarray() turns the sparse count matrix into a dense array for the DataFrame.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the column labels in matrix column order.
df = pd.DataFrame(data=bag_words.toarray(), columns=count_vect.get_feature_names_out())

In [8]:
df # display the full bag-of-words matrix (one row per snippet, one column per vocabulary word)

Unnamed: 0,11,1990,2001,45th,about,acting,adam,alive,and,anecdotes,answers,archived,are,articles,at,attention,becoming,before,by,class,cocky,cohen,correspondent,could,decade,div,documentary,edgar,edito,editor,files,films,find,fixer,fn,for,foreign,full,he,hearings,himself,hoover,humbled,impeachment,in,is,it,joined,joyce,knows,larry,like,lines,lively,made,mistrials,more,new,news,noam,now,of,on,once,org,our,pardons,patricia,paying,president,private,questions,quick,recent,right,roger,sarah,sept,span,star,story,than,the,this,times,to,vcard,was,we,who,witness,york,your
0,0,0,0,0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0
1,1,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,3,0,1,1,0,1,1,0,0,1,1,0,0,0,0,1,1,3,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,4,0,0,1,1,0,1,0,1,1,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0
4,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
5,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0
6,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0
7,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0


In [9]:
# total count of each word over all documents, sorted ascending;
# the last ten entries are the 10 most common words in our data
df.sum().sort_values().tail(10)

recent      5
articles    5
by          5
york        6
times       6
new         6
cohen       7
of          8
and         8
the         9
dtype: int64

In [10]:
# same ascending ordering, this time keeping the first ten entries:
# the 10 least common words (small sample size means most words occur one time)
df.sum().sort_values().head(10)

11           1
org          1
once         1
now          1
noam         1
mistrials    1
made         1
lively       1
our          1
lines        1
dtype: int64

# TF-IDF Conversion
We have a very small sample of data, but let's convert it to tf-idf for the sake of programming it. Recall that the tf-idf transformation (with the `sklearn` defaults, which smooth the idf term) is:

$$ \text{tf}(t,d) = f_{td}\text{, } t\in T \text{ and } d \in D $$

$$ \text{idf}(t) = \log{\frac{1+|D|}{1+n_t}}\text{, where } n_t=|\{d\in D : t\in d\}| \text{ is the number of documents containing } t $$

$$\text{tf-idf}(t,d)=\text{tf}(t,d) \cdot (1+\text{idf}(t))$$

Each document's tf-idf vector is then scaled to unit Euclidean length (`norm='l2'`).

- http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer with default settings; produces tf-idf weights rather than raw counts
tfidf_vect = TfidfVectorizer()

# that's it! one call learns the vocabulary and converts the snippets
tfidf_mat = tfidf_vect.fit_transform(summary_text)

In [12]:
# convert to pandas to get a better idea about the data
# (get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it)
df = pd.DataFrame(data=tfidf_mat.toarray(), columns=tfidf_vect.get_feature_names_out())
df

Unnamed: 0,11,1990,2001,45th,about,acting,adam,alive,and,anecdotes,answers,archived,are,articles,at,attention,becoming,before,by,class,cocky,cohen,correspondent,could,decade,div,documentary,edgar,edito,editor,files,films,find,fixer,fn,for,foreign,full,he,hearings,himself,hoover,humbled,impeachment,in,is,it,joined,joyce,knows,larry,like,lines,lively,made,mistrials,more,new,news,noam,now,of,on,once,org,our,pardons,patricia,paying,president,private,questions,quick,recent,right,roger,sarah,sept,span,star,story,than,the,this,times,to,vcard,was,we,who,witness,york,your
0,0.0,0.0,0.0,0.0,0.195053,0.0,0.0,0.229449,0.101855,0.229449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.111847,0.0,0.0,0.0,0.0,0.229449,0.229449,0.0,0.0,0.229449,0.229449,0.0,0.0,0.0,0.0,0.0,0.229449,0.0,0.0,0.0,0.229449,0.0,0.0,0.0,0.229449,0.229449,0.0,0.0,0.0,0.229449,0.229449,0.0,0.229449,0.229449,0.0,0.0,0.0,0.0,0.0,0.0,0.223694,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092917,0.229449,0.0,0.0,0.0,0.0,0.0,0.195053,0.0,0.0,0.0
1,0.13029,0.13029,0.13029,0.0,0.0,0.13029,0.0,0.0,0.057837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13029,0.13029,0.0,0.39087,0.0,0.063511,0.13029,0.0,0.13029,0.13029,0.0,0.0,0.13029,0.13029,0.0,0.0,0.0,0.0,0.13029,0.13029,0.39087,0.0,0.13029,0.0,0.0,0.0,0.0,0.0,0.13029,0.0,0.0,0.13029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110758,0.069943,0.0,0.0,0.0,0.0,0.110758,0.0,0.13029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13029,0.0,0.13029,0.52116,0.0,0.0,0.13029,0.052762,0.0,0.069943,0.0,0.13029,0.13029,0.0,0.0,0.0,0.069943,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.356085,0.0,0.0,0.356085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.356085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.356085,0.0,0.0,0.0,0.0,0.0,0.356085,0.0,0.0,0.0,0.0,0.0,0.356085,0.0,0.1442,0.0,0.0,0.302705,0.0,0.0,0.356085,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210105,0.0,0.0,0.281058,0.0,0.281058,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.473304,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191668,0.0,0.254083,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.0
4,0.0,0.0,0.0,0.0,0.281994,0.0,0.0,0.0,0.147255,0.0,0.331721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331721,0.281994,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331721,0.0,0.0,0.0,0.0,0.331721,0.331721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281994,0.0,0.0,0.0,0.0,0.0,0.0,0.331721
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210105,0.0,0.0,0.281058,0.0,0.281058,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.473304,0.0,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191668,0.0,0.254083,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.0
6,0.0,0.0,0.0,0.232059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.232059,0.0,0.0,0.0,0.0,0.0,0.232059,0.0,0.0,0.232059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.232059,0.232059,0.0,0.0,0.0,0.0,0.0,0.232059,0.232059,0.0,0.232059,0.197272,0.0,0.0,0.0,0.0,0.0,0.232059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.232059,0.113119,0.197272,0.232059,0.0,0.232059,0.0,0.0,0.0,0.232059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.232059,0.0,0.0,0.093974,0.0,0.0,0.0,0.0,0.0,0.0,0.197272,0.232059,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210105,0.0,0.0,0.281058,0.0,0.281058,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.473304,0.0,0.0,0.0,0.0,0.0,0.191668,0.0,0.254083,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.210105,0.0,0.0,0.281058,0.0,0.281058,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.281058,0.473304,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191668,0.0,0.254083,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.473304,0.0,0.210105,0.0,0.0,0.281058,0.0,0.281058,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.281058,0.0,0.0,0.230716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.281058,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.191668,0.0,0.254083,0.0,0.0,0.0,0.0,0.0,0.0,0.254083,0.0


In [13]:
# for every word take its highest tf-idf weight in any one document,
# sort ascending, and keep the ten largest
df.max().sort_values().tail(10)

paying      0.356085
are         0.356085
class       0.390870
foreign     0.390870
noam        0.473304
adam        0.473304
sarah       0.473304
joyce       0.473304
patricia    0.473304
span        0.521160
dtype: float64

# Working with (a bit) more data
What if we do not have the memory to deal with a dense matrix representation and we need to keep it sparse?


In [14]:
from sklearn.datasets import fetch_20newsgroups

# training subset of the 20 newsgroups data, shuffled with a fixed random_state
bunch = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

The `bunch` object returned from sklearn is similar to a Python dictionary. We can access different fields of the object with keys.

In [15]:
# number of documents in the fetched 'train' subset
len(bunch.data)

11314

In [16]:
print(bunch.data[0]) # raw text of the first post; print() renders its embedded newlines

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [17]:
import numpy as np
# randomly grab an index
# randint draws uniformly from 0 .. len(bunch.data)-1; the earlier
# round(rand()*len) could return len(bunch.data) itself (one past the last
# valid index -> IndexError) and picked the first/last document half as often
idx = np.random.randint(len(bunch.data))
# and print it pretty (print() already renders the embedded newlines)
print(bunch.data[idx])

From: maverick@wpi.WPI.EDU (T. Giaquinto)
Subject: General Information Request
Organization: Worcester Polytechnic Institute, Worcester, MA 01609-2280
Lines: 11
NNTP-Posting-Host: wpi.wpi.edu


	I am looking for any information about the space program.
This includes NASA, the shuttles, history, anything!  I would like to
know if anyone could suggest books, periodicals, even ftp sites for a
novice who is interested in the space program.



					Todd Giaquinto
					maverick@wpi.WPI.EDU
					



In [18]:
%%time
# refit the existing TfidfVectorizer on the newsgroup posts; this replaces the
# vocabulary it learned from the NYT snippets
news_tfidf = tfidf_vect.fit_transform(bunch.data) 

CPU times: user 3.82 s, sys: 156 ms, total: 3.97 s
Wall time: 3.94 s


In [19]:
# (number of documents, number of vocabulary terms)
news_tfidf.shape

(11314, 130107)

In [20]:
# mapping from each term to its column index in the fitted matrix
tfidf_vect.vocabulary_

{'from': 56979,
 'lerxst': 75358,
 'wam': 123162,
 'umd': 118280,
 'edu': 50527,
 'where': 124031,
 'my': 85354,
 'thing': 114688,
 'subject': 111322,
 'what': 123984,
 'car': 37780,
 'is': 68532,
 'this': 114731,
 'nntp': 87620,
 'posting': 95162,
 'host': 64095,
 'rac3': 98949,
 'organization': 90379,
 'university': 118983,
 'of': 89362,
 'maryland': 79666,
 'college': 40998,
 'park': 92081,
 'lines': 76032,
 '15': 4605,
 'was': 123292,
 'wondering': 124931,
 'if': 65798,
 'anyone': 28615,
 'out': 90774,
 'there': 114579,
 'could': 42876,
 'enlighten': 51793,
 'me': 80638,
 'on': 89860,
 'saw': 104813,
 'the': 114455,
 'other': 90686,
 'day': 45295,
 'it': 68766,
 'door': 48618,
 'sports': 109581,
 'looked': 76718,
 'to': 115475,
 'be': 32311,
 'late': 74693,
 '60s': 16574,
 'early': 50111,
 '70s': 18299,
 'called': 37433,
 'bricklin': 34995,
 'doors': 48620,
 'were': 123796,
 'really': 99822,
 'small': 108252,
 'in': 66608,
 'addition': 26073,
 'front': 56989,
 'bumper': 35612,
 'se

In [None]:
# create pandas dataframe
# column-wise maximum over all documents: a single row holding, for each term,
# its largest tf-idf weight (only this one row is densified, not the whole matrix)
vec = news_tfidf.max(axis=0)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
df  = pd.DataFrame(data=vec.toarray(), columns=tfidf_vect.get_feature_names_out())

In [None]:
# the ten terms with the largest tf-idf weight, in ascending order
df.max().sort_values().tail(10)

In [None]:
# now let's do the transformation with a smaller vocabulary:
#   stop_words='english' -> drop common English stop words
#   max_df=0.01          -> drop terms appearing in more than 1% of the documents
#   min_df=4             -> drop terms appearing in fewer than 4 documents
tfidf_vect = TfidfVectorizer(stop_words='english',
                             max_df=0.01,
                             min_df=4)
news_tfidf = tfidf_vect.fit_transform(bunch.data)
print(news_tfidf.shape)
# per-term maximum weight over all documents (a single row)
vec = news_tfidf.max(axis=0)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
df = pd.DataFrame(data=vec.toarray(), columns=tfidf_vect.get_feature_names_out())
df.max().sort_values()[-10:]

# Using your own vocabulary

In [None]:
# read in scrabble dictionary from file, one word per line
with open('data/ospd.txt') as f:
    # strip whitespace and skip blank lines (a trailing newline used to add an
    # empty-string "word"); dict.fromkeys drops repeats while keeping file order,
    # because the vectorizer rejects a vocabulary containing duplicate terms
    vocab = list(dict.fromkeys(word for word in map(str.strip, f) if word))

# NOTE(review): TfidfVectorizer lowercases documents by default -- confirm the
# words in ospd.txt are lowercase too, otherwise nothing will match them

# now let's do the transformation with a custom vocabulary
tfidf_vect = TfidfVectorizer(vocabulary=vocab)
news_tfidf = tfidf_vect.fit_transform(bunch.data)
print(news_tfidf.shape)
# per-term maximum weight over all documents (a single row)
vec = news_tfidf.max(axis=0)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
df = pd.DataFrame(data=vec.toarray(), columns=tfidf_vect.get_feature_names_out())
df.max().sort_values()[-10:]

Looking for how to do a word cloud? Check this out:
- https://github.com/amueller/word_cloud

Want to perform more serious NLP with richer options? Take a look at:
- http://www.nltk.org