Loading the packages we need for this exercise

In [3]:
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics

print 'loaded'

loaded


Now we load a restricted version of the data

In [4]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
print help(fetch_20newsgroups)
twenty_train.keys()

Help on function fetch_20newsgroups in module sklearn.datasets.twenty_newsgroups:

fetch_20newsgroups(data_home=None, subset='train', categories=None, shuffle=True, random_state=42, remove=(), download_if_missing=True)
    Load the filenames and data from the 20 newsgroups dataset.
    
    Parameters
    ----------
    subset: 'train' or 'test', 'all', optional
        Select the dataset to load: 'train' for the training set, 'test'
        for the test set, 'all' for both, with shuffled ordering.
    
    data_home: optional, default: None
        Specify an download and cache folder for the datasets. If None,
        all scikit-learn data is stored in '~/scikit_learn_data' subfolders.
    
    categories: None or collection of string or unicode
        If None (default), load all the categories.
        If not None, list of category names to load (other categories
        ignored).
    
    shuffle: bool, optional
        Whether or not to shuffle the data: might be important for mo

['DESCR', 'data', 'target', 'target_names', 'filenames']

Now that we have the data, let's take a look at it.

In [5]:
print twenty_train.target_names # these are the names of the newsgroups
print twenty_train.keys() # these are the items available in the 'bunch'
print len(twenty_train.data)
print len(twenty_train.filenames)

print twenty_train.target

for t in twenty_train.target[:10]:
    print t, twenty_train.target_names[t]

# first three lines of the first record (data point, which is a newsgroup document)
print "\n".join(twenty_train.data[0].split("\n")[:3])


['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
['DESCR', 'data', 'target', 'target_names', 'filenames']
2257
2257
[1 1 3 ..., 2 2 2]
1 comp.graphics
1 comp.graphics
3 soc.religion.christian
3 soc.religion.christian
3 soc.religion.christian
3 soc.religion.christian
3 soc.religion.christian
2 sci.med
2 sci.med
2 sci.med
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


Now, let's tokenize and remove stop words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(stop_words=["only"])
X_train_counts = count_vect.fit_transform(twenty_train.data)
print X_train_counts.shape

print type(X_train_counts)

### write something here

### end of space

print count_vect.vocabulary_.get(u'algorithm')
print count_vect.stop_words_ # no stop words : - /
print count_vect.get_stop_words()

(2257, 35787)
<class 'scipy.sparse.csr.csr_matrix'>
4690
set([])
['only']


TF-IDF transformer!

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer 
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts) 
X_train_tf = tf_transformer.transform(X_train_counts) 
print X_train_tf.shape

print "[0, 5195]", X_train_tf.getrow(0).getcol(5195)

# the above does not do much, since idf is turned off

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print X_train_tfidf.shape

print "[0, 5195]", X_train_tf.getrow(0).getcol(5195)

(2257, 35787)
[0, 5195]   (0, 0)	0.0753778361444
(2257, 35787)
[0, 5195]   (0, 0)	0.0753778361444


In [20]:
from sklearn.naive_bayes import MultinomialNB 
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target) 

docs_new = ['God is love', 'OpenGL on the GPU is fast', 'god gpus are fast!']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

print X_new_tfidf

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category])) 

  (0, 20537)	0.809401056239
  (0, 18474)	0.255894264685
  (0, 15521)	0.528571712694
  (1, 32141)	0.0891122234242
  (1, 23789)	0.600483554876
  (1, 23733)	0.128412754679
  (1, 18474)	0.100340184397
  (1, 15628)	0.677839555469
  (1, 14048)	0.381384400353
  (2, 15521)	0.45724798045
  (2, 14048)	0.841388285598
  (2, 5410)	0.288079914666
'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'god gpus are fast!' => soc.religion.christian


The above command printed hello to you

In [29]:
# a pipeline 
from sklearn.pipeline import Pipeline 

text_clf = Pipeline([('vect', CountVectorizer()), 
                     ('tfidf', TfidfTransformer()), 
                     ('clf', MultinomialNB())])

print text_clf
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)

print text_clf

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])
Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])


Above we made a pipeline

In [30]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',
    categories=categories, shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
print predicted
np.mean(predicted == twenty_test.target)

# we should practice some plotting here to look into these results more!

[2 2 3 ..., 2 2 1]


0.83488681757656458