
# Example (20 Newsgroups Data Set)

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Load every 20 Newsgroups article from the local cache.
# NOTE: data_home points at the folder that holds the cached .pkz file,
# not at the file itself.
NewsArticles = fetch_20newsgroups(
    data_home='Data/NewsGroups',
    subset='all',
    remove=('headers', 'footers', 'quotes'),
    download_if_missing=False,  # rely on the local copy; never hit the network
)



In [3]:
# Assemble the corpus frame: one row per article holding its numeric group id,
# the readable group name looked up from that id, and the raw article text.
GroupNames = pd.Series(NewsArticles.target_names)
corpus = pd.DataFrame({
    'group': NewsArticles.target,
    'group_names': GroupNames.iloc[NewsArticles.target].values,
    'articles': NewsArticles.data,
})
print(corpus.shape)
corpus.head()

(18846, 3)


Unnamed: 0,group,group_names,articles
0,10,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...
1,3,comp.sys.ibm.pc.hardware,My brother is in the market for a high-perform...
2,17,talk.politics.mideast,\n\n\n\n\tFinally you said what you dream abou...
3,3,comp.sys.ibm.pc.hardware,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,4,comp.sys.mac.hardware,1) I have an old Jasmine drive which I cann...


In [4]:
# Show a single article: its group name, a separator, then the text.
ix = 10010
selected_article = corpus.loc[ix]
print(selected_article['group_names'])
print('-----------')
print(selected_article['articles'])
print()

comp.graphics
-----------
Does anyone know of any good shareware animation or paint software for an SGI
 machine?  I've exhausted everyplace on the net I can find and still don't hava
 a nice piece of software.

Thanks alot!

Chad





In [5]:
# sanity check: (rows, columns) of the corpus frame before vectorizing
corpus.shape

(18846, 3)

In [6]:
# Bag-of-words counts: fit a word-level CountVectorizer on every article,
# dropping scikit-learn's built-in English stop words. The result is a sparse
# document-term matrix (rows = articles, columns = vocabulary terms).
Vectorizer = CountVectorizer(analyzer='word', stop_words='english')
word_counts = Vectorizer.fit_transform(corpus.articles)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
# The result is kept as a plain list so later cells see the same type as before.
if hasattr(Vectorizer, 'get_feature_names_out'):
    vocabulary = list(Vectorizer.get_feature_names_out())
else:
    vocabulary = Vectorizer.get_feature_names()

In [7]:
# number of distinct terms in the fitted vocabulary
len(vocabulary)

134101

In [8]:
# Preview the counts as a dense DataFrame.
# NOTE: only the first `ix` articles are densified -- every row expands to the
# full vocabulary width, so converting the whole matrix would be very large.
ix = 1000
df = pd.DataFrame(data=word_counts[:ix].toarray(), columns=vocabulary)
print(df.shape)
df.head()

(1000, 134101)


Unnamed: 0,00,000,0000,00000,000000,00000000,0000000004,00000000b,00000001,00000001b,...,zzs,zzvsi,zzy_3w,zzz,zzzoh,zzzzzz,zzzzzzt,³ation,ýé,ÿhooked
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Column sums on the sparse matrix give each term's total count over all
# articles; the (1, n_terms) result is flattened to 1-D and labelled by term.
summed_counts = np.asarray(word_counts.sum(axis=0)).ravel()
word_counts_total = pd.Series(summed_counts, index=vocabulary)
most_frequent = word_counts_total.sort_values(ascending=False).head(10)
print('top ten words')
print('-------------')
print(most_frequent)

top ten words
-------------
ax        62396
like       6525
don        6524
people     6458
just       6172
know       5763
use        5027
think      5001
time       4867
max        4637
dtype: int64


In [10]:
# tf-idf features: same tokenization settings as the count vectorizer
# (word analyzer, English stop words), but each term is weighted by tf-idf.
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf = vectorizer.fit_transform(corpus.articles)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
# The result is kept as a plain list so later cells see the same type as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary = list(vectorizer.get_feature_names_out())
else:
    vocabulary = vectorizer.get_feature_names()

In [11]:
# Add up each term's tf-idf weight across all articles and rank the terms by
# that total, largest first.
tfidf_totals = np.asarray(tfidf.sum(axis=0)).ravel()
key_words = pd.Series(tfidf_totals, index=vocabulary).sort_values(ascending=False)
print('top ten key words')
print('-------------')
print(key_words.iloc[:10])

top ten key words
-------------
like      239.600310
just      238.905625
don       236.881580
know      235.757506
people    208.374281
think     200.380657
does      197.060018
use       171.948899
thanks    167.436382
good      164.778817
dtype: float64


In [12]:
# Rank terms by their fitted inverse-document-frequency weight, highest first.
idf_by_word = pd.Series(vectorizer.idf_, index=vocabulary)
important_words = idf_by_word.sort_values(ascending=False)
print('top ten idf words')
print('-------------')
print(important_words.iloc[:10])

top ten idf words
-------------
ÿhooked    10.150962
gg4x6q1    10.150962
gg4x6um    10.150962
gg4x6z     10.150962
gg5        10.150962
gg56k      10.150962
gg5a       10.150962
gg6        10.150962
gg7tu      10.150962
gga2       10.150962
dtype: float64


In [13]:
# Preview the tf-idf weights of the first three articles. Only the rows that
# are displayed get densified: slicing the sparse matrix before .toarray()
# avoids materialising 100 full-vocabulary rows just to show 3 of them.
pd.DataFrame(tfidf[:3].toarray(), columns=vocabulary)

Unnamed: 0,00,000,0000,00000,000000,00000000,0000000004,00000000b,00000001,00000001b,...,zzs,zzvsi,zzy_3w,zzz,zzzoh,zzzzzz,zzzzzzt,³ation,ýé,ÿhooked
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
