# Document Analysis in Python
In this notebook we will cover:
- Reading document data into memory
- Creating bag of words features
- Creating smoothed tf-idf features

In [34]:
import requests
import json
from contextlib import closing

# get API key saved on hard drive
with open('../NYTimesAPI.txt') as f:
    # strip() removes the trailing newline most editors append to the file;
    # left in place it would be sent to the server as part of the key
    api_key = f.read().strip() # read in my private key (sorry, not in this repo ¯\_(ツ)_/¯ )

# make base URL and dictionary of get request key/values
url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
payload = {'api-key': api_key, 'q': 'Olympics'} # key/values for get request (look up in api, there are lots)

# Perform the actual request
# timeout keeps the cell from hanging forever if the server never answers
with closing(requests.get(url, params=payload, timeout=30)) as r:
    # stop here on an HTTP error (bad key, rate limit, ...) instead of
    # parsing an error body that has no 'response' entry for later cells
    r.raise_for_status()
    articles = r.json()
    print(articles)

{'status': 'OK', 'response': {'docs': [{'subsection_name': 'Olympics', 'byline': {'person': [{'lastname': 'CLAREY', 'organization': '', 'firstname': 'Christopher', 'rank': 1, 'role': 'reported'}], 'original': 'By CHRISTOPHER CLAREY'}, '_id': '57b3927938f0d828cf0cf132', 'type_of_material': 'News', 'blog': [], 'snippet': 'When the German twins Anna and Lisa Hahner joined hands as they finished deep in the pack in the women’s marathon, they quickly drew sharp criticism from their federation.', 'web_url': 'http://www.nytimes.com/2016/08/17/sports/olympics/twins-finish-marathon-hand-in-hand-but-their-country-says-they-crossed-a-line.html', 'print_page': '10', 'keywords': [{'value': 'Olympic Games (2016)', 'is_major': 'Y', 'name': 'subject', 'rank': '1'}, {'value': 'Hahner, Lisa (1989- )', 'is_major': 'Y', 'name': 'persons', 'rank': '2'}, {'value': 'Hahner, Anna (1989- )', 'is_major': 'Y', 'name': 'persons', 'rank': '3'}, {'value': 'Marathon Running', 'is_major': 'Y', 'name': 'subject', 'ran

In [1]:
# OR
# could use the NYTIMES API like this
from nytimesarticle import articleAPI

with open('../NYTimesAPI.txt') as f:
    # strip() removes the trailing newline most editors append to the file;
    # left in place it would be passed along as part of the key
    api_key = f.read().strip() # read in my private key (not in this repository, sorry)

# wrapper object that holds the key for subsequent queries
api = articleAPI(api_key)

# query with a start date of 2016-06-01; the bare name displays the result
articles = api.search(start_date='20160601')
articles

{'copyright': 'Copyright (c) 2013 The New York Times Company.  All Rights Reserved.',
 'response': {'docs': [{'_id': '57b3b1f57988101da6e76f9b',
    'abstract': None,
    'blog': [],
    'byline': {'original': 'By GARDINER HARRIS',
     'person': [{'firstname': 'Gardiner',
       'lastname': 'HARRIS',
       'organization': '',
       'rank': 1,
       'role': 'reported'}]},
    'document_type': 'article',
    'headline': {'content_kicker': 'Political Memo',
     'kicker': 'Political Memo',
     'main': 'Martha’s Vineyard Longs for a President Who R.S.V.P.s ‘Yes’',
     'print_headline': 'An Island Longs for a President Who R.S.V.P.s ‘Yes’'},
    'keywords': [{'is_major': 'N',
      'name': 'subject',
      'rank': '1',
      'value': 'Presidential Election of 2016'},
     {'is_major': 'N',
      'name': 'subject',
      'rank': '2',
      'value': 'Campaign Finance'},
     {'is_major': 'N',
      'name': 'subject',
      'rank': '3',
      'value': 'United States Politics and Governme

In [1]:
# OR we can load an example query like this
# run this block of code if you can't run anything else
import json 
# parse the saved example query straight from the open file handle
with open('data/nytime.json') as json_file:
    articles = json.load(json_file)

# bare name as the last expression so the notebook displays the parsed result
articles

{'copyright': 'Copyright (c) 2013 The New York Times Company.  All Rights Reserved.',
 'response': {'docs': [{'_id': '578ca1a179881043e05a62b7',
    'abstract': 'Mary Pilon reviews book The Games: A Global History of the Olympics by David Goldblatt.',
    'blog': [],
    'byline': {'original': 'By MARY PILON',
     'person': [{'firstname': 'Mary',
       'lastname': 'PILON',
       'organization': '',
       'rank': 1,
       'role': 'reported'}]},
    'document_type': 'article',
    'headline': {'content_kicker': 'Nonfiction',
     'kicker': 'Nonfiction',
     'main': 'The Dark History of the Olympics',
     'print_headline': 'Tarnish on the Torch'},
    'keywords': [{'is_major': 'Y',
      'name': 'subject',
      'rank': '1',
      'value': 'Books and Literature'},
     {'is_major': 'Y',
      'name': 'persons',
      'rank': '2',
      'value': 'Goldblatt, David (1965- )'},
     {'is_major': 'N',
      'name': 'subject',
      'rank': '3',
      'value': 'Olympic Games'},
     {'is

In [35]:
# collect the lead paragraph of every article returned by the query
summary_text = [
    article['lead_paragraph']
    for article in articles['response']['docs']
]
summary_text

['When the German twins Anna and Lisa Hahner joined hands as they finished deep in the pack in the women’s marathon, they quickly drew sharp criticism from their federation.',
 'Had the United States women competed as a separate country, they would have been third in the overall medal chart with 61.',
 'David Goldblatt’s “The Games” recalls unflattering aspects of the Olympics long before doping and gender testing.',
 '“What was in my mind was, I had to get a gold medal,” said Shaunae Miller, who won the 400 in spectacular fashion, depriving Allyson Felix of career-capping satisfaction.',
 'Naturally high testosterone levels in women have not been scientifically demonstrated to give them a more significant edge than many other factors.',
 'Penalizing a country for doping seems to be a much more effective way to ensure rapid and enduring change.',
 'With some of the smallest crowds in Olympic history at Rio, Coe, president of the I.A.A.F., is looking afresh at ways to save the sport.',


# Converting document data into different representations
First, let's go through and count the unique words in each opening sentence (that is what the NYTimes gives us for free).

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# vectorizer with default settings: learns a vocabulary and counts term occurrences
count_vect = CountVectorizer()

# learn the vocabulary from the summaries and build the document-term count matrix
bag_words = count_vect.fit_transform(raw_documents=summary_text)

In [15]:
# one item per line: matrix shape (it is a sparse matrix), the term -> column
# index mapping, and the stored entries of the first document's row
print(
    bag_words.shape,
    count_vect.vocabulary_,
    bag_words[0],
    sep="\n",
)

(10, 237)
{'he': 97, 'book': 38, 'mr': 144, 'chadburn': 44, 'hardcover': 94, 'named': 147, 'became': 28, 'contractor': 54, 'adhd': 8, 'his': 100, 'that': 199, 'relatively': 174, 'berkeley': 33, 'ideal': 109, 'former': 86, 'years': 234, 'storm': 194, 'cities': 48, 'like': 126, 'reporter': 176, 'children': 47, 'latest': 122, 'humane': 108, 'charlie': 45, 'are': 20, 'us': 217, 'important': 111, 'whether': 223, 'discuss': 67, 'her': 98, 'cate': 43, 'when': 221, 'trick': 211, 'here': 99, 'have': 96, 'something': 190, 'olympic': 156, 'all': 11, 'open': 158, 'draws': 69, 'hospital': 102, 'california': 42, 'chewy': 46, 'months': 140, 'the': 200, 'fascinated': 81, 'produce': 169, 'apartment': 18, 'where': 222, 'modern': 139, 'com': 50, 'accompanying': 6, 'marjorie': 135, 'commerce': 51, 'benefit': 31, 'almost': 12, 'simple': 188, 'resettlement': 177, 'host': 103, 'was': 219, 'memorial': 138, 'left': 124, 'correspondent': 55, 'drink': 70, 'marks': 136, 'anti': 17, 'father': 82, 'than': 198, 'goo

In [16]:
# we can still look at the data using an inverse transform:
# it maps the non-zero columns of the first document's row back to the
# vocabulary terms they stand for
count_vect.inverse_transform(bag_words[0])

[array(['the', 'reporter', 'charlie', 'savage', 'wrote', 'about', 'an',
        'ex', 'guantánamo', 'inmate', 'resettlement', 'story', 'didn',
        'end', 'when', 'article', 'published'], 
       dtype='<U13')]

In [17]:
# now let's create a pandas DataFrame out of this
import pandas as pd

pd.options.display.max_columns = 999 # raise the column display limit so every term column is shown
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides
feature_names_fn = getattr(count_vect, 'get_feature_names_out', None) or count_vect.get_feature_names
# dense copy of the sparse counts: one row per document, one column per term
df = pd.DataFrame(data=bag_words.toarray(), columns=feature_names_fn())

In [18]:
# one row per document, one column per vocabulary term
df # display the full bag of words matrix

Unnamed: 0,08,120,1996,25,30,about,accompanying,account,adhd,after,alan,all,almost,amount,an,and,annual,anti,apartment,apollo,are,article,at,aug,bagels,baking,baron,be,became,been,began,benefit,bergen,berkeley,best,big,bigwigs,bite,book,books,by,calif,california,cate,chadburn,charlie,chewy,children,cities,clinton,com,commerce,compellingly,competition,contractor,correspondent,county,crust,dan,day,days,death,decreasing,deep,delicatessen,didn,dinner,discuss,do,draws,drink,dropped,each,election,employer,end,era,eventually,ex,exploring,face,fascinated,father,fink,five,for,former,founded,genetics,good,graf,grew,guantánamo,hand,hardcover,has,have,he,her,here,his,hollywood,hospital,host,however,href,html,http,humane,ideal,if,important,in,indelible,ingall,inmate,is,it,jewish,jobs,kinds,knows,latest,learn,left,life,like,liqueur,list,little,looking,major,make,mamaleh,many,marjorie,marks,masks,memorial,modern,months,more,mothers,moving,mr,musicians,name,named,nation,new,nonfiction,noon,nytimes,oakland,occupied,of,olympic,on,open,or,ounce,out,part,pear,planned,politicians,port,preprandial,pretzel,produce,published,ravaged,recipe,recommends,relatively,relics,reporter,resettlement,restaurant,ritual,rutgers,saul,savage,says,schwarz,sea,sheri,should,simple,six,something,son,spots,stopping,storm,story,sweetness,telegraph,than,that,the,theater,there,this,three,tiki,times,titanic,to,top,trends,trick,tuesday,two,unanticipated,until,up,us,values,was,weekends,when,where,whether,who,will,with,won,worked,working,worth,written,wrote,www,years,york,you
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,2,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,1,1,0,2,0,1,1,0,2,3,0,0,1,0,1,0,1,0,3,3,1,0,1,0,1,0,1,1,0,0,0,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,1,0,0,0,0,0,2,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,1,0,0,2,1,1,1,0,2,1,0,0,0,1,0,4,0,0,4,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,4,0,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,2,0,1,1,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0,2,1,1,1,0,0,0,0,1,0,0
9,1,0,1,1,0,0,0,0,0,1,0,0,0,1,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,1,1,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,2,0,0,1,0,1,0,0,2,0,0,2,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,1,1,0,1,5,0,0,1,0,1,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [19]:
# sum each term's counts over all documents, sort ascending,
# and keep the last ten entries: the 10 most common words
df.sum().sort_values().tail(10)

he      4
his     4
to      4
are     4
on      5
for     6
and     8
in      9
of     11
the    15
dtype: int64

In [20]:
# same ascending ordering, first ten entries: the 10 least common words
# (small sample size means most words occur one time)
df.sum().sort_values().head(10)

08            1
name          1
named         1
nation        1
new           1
nonfiction    1
noon          1
nytimes       1
oakland       1
occupied      1
dtype: int64

# TF-IDF Conversion
We have a very small sample of data, but let's convert it to tf-idf for the sake of programming it. Recall that the tf-idf transformation (the default in `sklearn`) is:

$$ \text{tf}(t,d) = f_{td}\text{, } t\in T \text{ and } d \in D $$

$$ \text{idf}(t,d) = \log\frac{1+|D|}{1+n_t}\text{, where } n_t=|\{d\in D \text{ with } t\in d\}| $$

$$\text{tf-idf}(t,d)=\text{tf}(t,d) \cdot (1+\text{idf}(t,d))$$

By default, each document's tf-idf vector is then scaled to unit Euclidean length.

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer with default settings: produces tf-idf weights rather than raw counts
tfidf_vect = TfidfVectorizer()

# learn the vocabulary from the summaries and build the tf-idf matrix in one call
tfidf_mat = tfidf_vect.fit_transform(raw_documents=summary_text)

In [22]:
# convert to pandas to get better idea about the data
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides
feature_names_fn = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
# dense copy of the tf-idf weights: one row per document, one column per term
df = pd.DataFrame(data=tfidf_mat.toarray(), columns=feature_names_fn())
df

Unnamed: 0,08,120,1996,25,30,about,accompanying,account,adhd,after,alan,all,almost,amount,an,and,annual,anti,apartment,apollo,are,article,at,aug,bagels,baking,baron,be,became,been,began,benefit,bergen,berkeley,best,big,bigwigs,bite,book,books,by,calif,california,cate,chadburn,charlie,chewy,children,cities,clinton,com,commerce,compellingly,competition,contractor,correspondent,county,crust,dan,day,days,death,decreasing,deep,delicatessen,didn,dinner,discuss,do,draws,drink,dropped,each,election,employer,end,era,eventually,ex,exploring,face,fascinated,father,fink,five,for,former,founded,genetics,good,graf,grew,guantánamo,hand,hardcover,has,have,he,her,here,his,hollywood,hospital,host,however,href,html,http,humane,ideal,if,important,in,indelible,ingall,inmate,is,it,jewish,jobs,kinds,knows,latest,learn,left,life,like,liqueur,list,little,looking,major,make,mamaleh,many,marjorie,marks,masks,memorial,modern,months,more,mothers,moving,mr,musicians,name,named,nation,new,nonfiction,noon,nytimes,oakland,occupied,of,olympic,on,open,or,ounce,out,part,pear,planned,politicians,port,preprandial,pretzel,produce,published,ravaged,recipe,recommends,relatively,relics,reporter,resettlement,restaurant,ritual,rutgers,saul,savage,says,schwarz,sea,sheri,should,simple,six,something,son,spots,stopping,storm,story,sweetness,telegraph,than,that,the,theater,there,this,three,tiki,times,titanic,to,top,trends,trick,tuesday,two,unanticipated,until,up,us,values,was,weekends,when,where,whether,who,will,with,won,worked,working,worth,written,wrote,www,years,york,you
0,0.0,0.0,0.0,0.0,0.0,0.202726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202726,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.238475,0.238475,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.34874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238475,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107801,0.0,0.0,0.0,0.0,0.0,0.0,0.308649,0.181539,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.181539,0.181539,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.154324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107801,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.154324,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.240077,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.135016,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181539,0.0
2,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.209658,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.209658,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2251,0.209658,0.138632,0.0,0.209658,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.0,0.0,0.0,0.209658,0.0,0.0,0.0,0.178228,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319472,0.0,0.319472,0.0,0.0,0.0,0.0,0.189709,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.27158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319472,0.0,0.0,0.319472,0.0,0.0,0.0,0.0,0.27158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.319472,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.165472,0.0,0.278656,0.0,0.0,0.0,0.278656,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14959,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.278656,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.141859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.182378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.214539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.182378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.214539,0.115171,0.0,0.141859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.0,0.0,0.214539,0.0,0.209158,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.0,0.214539,0.0,0.0,0.0,0.182378,0.0,0.214539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170406,0.286966,0.0,0.0,0.286966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286966,0.0,0.0,0.0,0.286966,0.286966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.213425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286966,0.286966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.139884,0.286966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.320245,0.0,0.0,0.0,0.0,0.0,0.320245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.223703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.320245,0.376718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.202233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.183635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.376718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.083505,0.083505,0.0,0.141975,0.0,0.070987,0.083505,0.0,0.141975,0.148762,0.0,0.0,0.083505,0.0,0.055216,0.0,0.070987,0.0,0.250516,0.250516,0.083505,0.0,0.083505,0.0,0.083505,0.0,0.083505,0.083505,0.0,0.0,0.0,0.083505,0.0,0.0,0.0,0.083505,0.083505,0.0,0.0,0.0,0.083505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083505,0.0,0.083505,0.083505,0.083505,0.0,0.0,0.0,0.0,0.0,0.167011,0.0,0.0,0.0,0.0,0.0,0.0,0.083505,0.0,0.0,0.083505,0.0,0.0,0.083505,0.0,0.0,0.0,0.083505,0.083505,0.0,0.0,0.124211,0.083505,0.083505,0.083505,0.0,0.167011,0.083505,0.0,0.0,0.0,0.083505,0.0,0.334022,0.0,0.0,0.334022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.198349,0.0,0.0,0.0,0.070987,0.0,0.0,0.083505,0.083505,0.0,0.0,0.0,0.0,0.0,0.083505,0.0,0.0,0.0,0.0,0.083505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083505,0.070987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083505,0.0,0.179312,0.0,0.055216,0.0,0.0,0.0,0.083505,0.083505,0.0,0.0,0.0,0.0,0.0,0.083505,0.0,0.0,0.0,0.083505,0.0,0.0,0.0,0.0,0.0,0.083505,0.083505,0.083505,0.083505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083505,0.0,0.083505,0.0,0.0,0.0,0.0,0.0,0.0,0.081411,0.0,0.083505,0.070987,0.0,0.0,0.0,0.0,0.124211,0.0,0.0,0.0,0.0,0.070987,0.0,0.0,0.083505,0.0,0.0,0.083505,0.083505,0.0,0.083505,0.0,0.083505,0.0,0.141975,0.083505,0.083505,0.083505,0.0,0.0,0.0,0.0,0.070987,0.0,0.0
9,0.114879,0.0,0.114879,0.114879,0.0,0.0,0.0,0.0,0.0,0.097658,0.0,0.0,0.0,0.114879,0.0,0.136435,0.0,0.0,0.0,0.0,0.075962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114879,0.114879,0.0,0.0,0.0,0.0,0.0,0.114879,0.114879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114879,0.114879,0.0,0.0,0.114879,0.0,0.114879,0.0,0.114879,0.0,0.114879,0.0,0.0,0.0,0.114879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256317,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114879,0.114879,0.114879,0.114879,0.0,0.114879,0.114879,0.0,0.136435,0.0,0.0,0.0,0.0,0.114879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.229759,0.0,0.0,0.114879,0.0,0.114879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.097658,0.0,0.0,0.097658,0.0,0.0,0.114879,0.0,0.0,0.0,0.0,0.114879,0.0,0.0,0.123341,0.0,0.0,0.114879,0.0,0.114879,0.0,0.0,0.229759,0.0,0.0,0.229759,0.114879,0.0,0.0,0.0,0.0,0.0,0.114879,0.114879,0.114879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114879,0.0,0.114879,0.114879,0.0,0.114879,0.0,0.0,0.0,0.0,0.0,0.114879,0.114879,0.0,0.114879,0.279995,0.0,0.0,0.097658,0.0,0.114879,0.0,0.114879,0.085439,0.0,0.0,0.114879,0.0,0.0,0.0,0.0,0.0,0.114879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114879,0.0,0.0,0.114879


In [23]:
# for every term take its largest tf-idf weight in any single document,
# sort ascending, and keep the last ten: the 10 most influential words
df.max().sort_values().tail(10)

about     0.320245
all       0.320245
latest    0.320245
his       0.334022
he        0.334022
the       0.348740
learn     0.376718
masks     0.376718
trends    0.376718
face      0.376718
dtype: float64

# Working with (a bit) more data
What if we do not have the memory to deal with a dense matrix representation and we need to keep it sparse?


In [24]:
from sklearn.datasets import fetch_20newsgroups
# load the 'train' subset of the 20 Newsgroups posts; shuffling uses a fixed
# random_state so the document order is the same on every run
bunch = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

Downloading dataset from http://people.csail.mit.edu/jrennie/20Newsgroups/20news-bydate.tar.gz (14 MB)


The `bunch` object returned from sklearn is similar to a Python dictionary. We can access different fields of the object with keys.

In [25]:
# bunch.data[0] is the first post as one string (header lines followed by the body)
print(bunch.data[0]) # we should split this up by newlines

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [26]:
import numpy as np
# pick one post uniformly at random. randint's upper bound is exclusive, so idx
# is always a valid index; the previous round(rand()*len) could land exactly on
# len(bunch.data) (IndexError) and chose the first post half as often as the rest
idx = np.random.randint(len(bunch.data))
# the post is already a newline-delimited string, so it can be printed as is
print(bunch.data[idx])

From: ian@nasser.eecs.nwu.edu (Ian Sutherland)
Subject: Re: Limiting Govt (was Re: Employment (was Re: Why not concentrate...)
Organization: EECS Department, Northwestern University
Lines: 40

In article <1993Apr15.170731.8797@isc-br.isc-br.com> steveh@thor.isc-br.com (Steve Hendricks) writes:
>In article <1993Apr15.013651.11353@tijc02.uucp> pjs269@tijc02.uucp (Paul Schmidt) writes:
>>steveh@thor.isc-br.com (Steve Hendricks) writes:
>>: 
>>: As noted in another thread (Limiting govt), the problem libertarians face
>>: is insuring that the "limited government" they seek does not become the 
>>: tool of private interests to pursue their own agenda.
>>: 

[...]

>It is a failure of libertarianism if the ideology does not provide any
>reasonable way to restrain such actions other than utopian dreams.

You seem to be saying that a LIMITED government will provide MORE
opportunities for private interests to use it to pursue their own
agendas, and asking libertarians to prove that this will NO

In [27]:
# fit the existing vectorizer again, this time on the newsgroup posts; the
# vocabulary it learned from the NYTimes summaries is replaced by this fit
news_tfidf = tfidf_vect.fit_transform(bunch.data) 

In [28]:
# (number of posts, number of vocabulary terms)
news_tfidf.shape

(11314, 130107)

In [29]:
# dictionary mapping each learned term to its column index
tfidf_vect.vocabulary_

{'mouseless': 83957,
 'cancell': 37592,
 'avleak': 30682,
 'heiken': 62602,
 'immolation': 66338,
 'aalac': 25179,
 'seventies': 106334,
 'testaments': 114244,
 'outbreaks': 90782,
 'hizbolah': 63400,
 '_lwlp': 24035,
 '_national': 24112,
 'ol6s3m': 89666,
 'kl5cr4hejb': 73001,
 'giuseppe': 59192,
 'maimone': 78994,
 'objectivity': 89011,
 'la7': 74281,
 '_times_': 24481,
 'pod': 94684,
 'hierarchical': 63144,
 'wev4': 123884,
 'sparc10': 109139,
 'bambam': 31707,
 'icy': 65616,
 'colorboard': 41055,
 'cohn': 40898,
 'vindicated': 121594,
 'emmett': 51405,
 'ulfie': 118196,
 'yeltsin': 128066,
 'mni': 83129,
 'enchanted': 51539,
 'kmmgt1u': 73121,
 '4nt1': 14577,
 'fifp': 55242,
 'vasillates': 120762,
 'uncertainties': 118459,
 'mlyj': 82929,
 '160493205451': 5049,
 '4a2': 14205,
 'ohwz': 89530,
 'ci1': 39787,
 'db25f': 45327,
 '9m0': 22804,
 'alexia': 27338,
 'wwhjnux': 125521,
 'rr2b': 103221,
 '00110101b': 183,
 'c5i7ap': 36605,
 '33587': 11824,
 'enet': 51688,
 'lgorbet': 75571,
 '

In [30]:
# create pandas dataframe
# reduce over documents first (max weight of every term), so only a single
# row has to be converted to a dense array rather than the whole matrix
vec = news_tfidf.max(axis=0)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides
feature_names_fn = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
df = pd.DataFrame(data=vec.toarray(), columns=feature_names_fn())

In [31]:
# ten terms with the largest per-term maximum tf-idf weight, in ascending order
df.max().sort_values().tail(10)

kk          0.870294
db          0.871473
scsi        0.875086
blah        0.879426
donoghue    0.891653
00          0.907726
___         0.908826
25          0.913127
forged      0.940511
ax          0.998314
dtype: float64

In [32]:
# now let's do the transformation with a smaller vocabulary:
# drop English stop words, terms found in more than 1% of the posts (max_df)
# and terms found in fewer than 4 posts (min_df)
tfidf_vect = TfidfVectorizer(stop_words='english',
                             max_df=0.01,
                             min_df=4)
news_tfidf = tfidf_vect.fit_transform(bunch.data)
print(news_tfidf.shape)
# reduce over documents first so only a single row is made dense
vec = news_tfidf.max(axis=0)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides
feature_names_fn = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
df = pd.DataFrame(data=vec.toarray(), columns=feature_names_fn())
df.max().sort_values()[-10:]

(11314, 28593)


dialix            0.947270
blah              0.952881
ualberta          0.956563
stephanopoulos    0.959843
forged            0.971300
mufti             0.976947
ax                0.999881
meyers            1.000000
slower            1.000000
ucsd              1.000000
dtype: float64

# Using your own vocabulary

In [33]:
# read in scrabble dictionary from file, one word per line
with open('data/ospd.txt') as f:
    # splitlines() plus the filter keeps blank lines (including the one a
    # trailing newline produces with split('\n')) out of the vocabulary, where
    # they would become an empty-string term or a duplicate-term error
    vocab = [word for word in f.read().splitlines() if word]

# now let's do the transformation with a custom vocabulary
tfidf_vect = TfidfVectorizer(vocabulary=vocab)
news_tfidf = tfidf_vect.fit_transform(bunch.data)
print(news_tfidf.shape)
# reduce over documents first so only a single row is made dense
vec = news_tfidf.max(axis=0)
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides
feature_names_fn = getattr(tfidf_vect, 'get_feature_names_out', None) or tfidf_vect.get_feature_names
df = pd.DataFrame(data=vec.toarray(), columns=feature_names_fn())
df.max().sort_values()[-10:]

(11314, 79340)


incoming    0.925305
siemens     0.927114
water       0.928029
echo        0.947289
blah        0.951962
dos         0.953675
lib         0.954914
forged      0.978762
la          0.982737
ax          0.999999
dtype: float64