In [1]:
from __future__ import print_function

import numpy as np

from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)

Sklearn version: 0.18.1


# The data

The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split in two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon a messages posted before and after a specific date.


In [2]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']

twenty_train = fetch_20newsgroups(subset='train',
                 remove=('headers', 'footers', 'quotes'),
                 categories=categories, shuffle=True, random_state=42)

twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [20]:
# Sample data
print(twenty_train.data[0])
print('---------------')
print('Target: ', twenty_train.target[0])


Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
---------------
Target:  1


In [4]:
# Text preprocessing, tokenizing and filtering of stopwords

from sklearn.feature_extraction.text import CountVectorizer
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=5000,
                                stop_words='english')
X_train_counts = tf_vectorizer.fit_transform(twenty_train.data)
X_train_counts.shape

(2257, 5000)

In [5]:
print(X_train_counts[0,:])
print(X_train_counts[:,0])


  (0, 2866)	1
  (0, 238)	1
  (0, 4522)	1
  (0, 2058)	1
  (0, 1123)	1
  (0, 3867)	1
  (0, 1543)	1
  (0, 3385)	1
  (0, 2197)	1
  (0, 1094)	1
  (0, 2643)	1
  (0, 1865)	1
  (0, 2237)	1
  (0, 1795)	2
  (0, 4520)	1
  (0, 2251)	1
  (0, 1090)	1
  (0, 4744)	1
  (0, 3276)	1
  (0, 357)	1
  (0, 3273)	1
  (0, 4299)	1
  (0, 4869)	1
  (0, 2014)	1
  (0, 2550)	1
  (0, 1445)	1
  (232, 0)	2
  (272, 0)	1
  (282, 0)	1
  (400, 0)	1
  (433, 0)	2
  (581, 0)	2
  (588, 0)	1
  (766, 0)	1
  (768, 0)	2
  (837, 0)	3
  (844, 0)	1
  (859, 0)	1
  (880, 0)	1
  (1030, 0)	1
  (1056, 0)	6
  (1057, 0)	2
  (1263, 0)	1
  (1475, 0)	1
  (1665, 0)	16
  (1795, 0)	1
  (1802, 0)	1
  (1833, 0)	1
  (1890, 0)	2
  (2069, 0)	1
  (2144, 0)	1


In [6]:
#From occurrences to frequencies
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tf = tfidf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 5000)

In [7]:
print(X_train_tf[0,:])
print(X_train_tf[:,0])


  (0, 1445)	0.0998496101737
  (0, 2550)	0.0920875619201
  (0, 2014)	0.10905059472
  (0, 4869)	0.112409159775
  (0, 4299)	0.172232378831
  (0, 3273)	0.189497984618
  (0, 357)	0.196147304589
  (0, 3276)	0.239358101611
  (0, 4744)	0.242697172074
  (0, 1090)	0.185367646905
  (0, 2251)	0.281517460204
  (0, 4520)	0.239358101611
  (0, 1795)	0.326673936513
  (0, 2237)	0.217882788689
  (0, 1865)	0.182356290661
  (0, 2643)	0.0944312658437
  (0, 1094)	0.250397930473
  (0, 2197)	0.225991796704
  (0, 3385)	0.272954303671
  (0, 1543)	0.163780615995
  (0, 3867)	0.165608347231
  (0, 1123)	0.157610927262
  (0, 2058)	0.144807482284
  (0, 4522)	0.126533637604
  (0, 238)	0.170069829145
  :	:
  (9, 1041)	0.0576401552277
  (9, 1780)	0.115995106978
  (9, 1492)	0.0533385546095
  (9, 2755)	0.0583670692289
  (9, 1446)	0.0372652083697
  (9, 3264)	0.102003115359
  (9, 4551)	0.0847112103101
  (9, 423)	0.0486076782268
  (9, 703)	0.0613781399834
  (9, 2365)	0.147719409349
  (9, 3160)	0.0711204602374
  (9, 2266)	0.06

## First basic model 

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Define and fit in one line
clf = MultinomialNB().fit(X_train_tf, twenty_train.target)


In [9]:
#Score test data

# Read test data
twenty_test = fetch_20newsgroups(subset='test',
                 remove=('headers', 'footers', 'quotes'),
                 categories=categories, shuffle=True, random_state=42)

# Transform text to counts
X_test_counts = tf_vectorizer.transform(twenty_test.data)

# tf-idf transformation
X_test_tf = tfidf_transformer.transform(X_test_counts)

# Prediction
predicted = clf.predict(X_test_tf)

# Accuracy
from sklearn.metrics import accuracy_score
print('Accuracy test: ', accuracy_score(twenty_test.target, predicted))


Accuracy test:  0.798934753662


In [10]:
# Score 2 new docs
docs_new = ['God is love', 'OpenGL on the GPU is fast']

X_new_counts = tf_vectorizer.transform(docs_new)

X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## Build a pipeline

In [11]:
#Define the pipeline

from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

# Fit all the pipeline
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.95, max_features=5000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
    ...False,
         use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [12]:
#Evaluate test data
twenty_test = fetch_20newsgroups(subset='test',
                    remove=('headers', 'footers', 'quotes'),
                    categories=categories, 
                    shuffle=True, random_state=42)

predicted = text_clf.predict(twenty_test.data)

np.mean(predicted == twenty_test.target) 

0.79893475366178424

## Change classifier in the pipeline

In [13]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, n_iter=5, random_state=42)),
                    ])
#Fit
_ = text_clf.fit(twenty_train.data, twenty_train.target)

# Predict
predicted = text_clf.predict(twenty_test.data)

# Evaluate accuracy
np.mean(predicted == twenty_test.target)            


0.80692410119840208

## Other classifier

In [14]:
from sklearn import svm
text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', svm.LinearSVC()),
                    ])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)            

0.80892143808255657

## Optimize a pipeline

In [15]:
from sklearn.model_selection import RandomizedSearchCV

# Define estimator. No parameters of the search
clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2)),
                ('tfidf', TfidfTransformer()),
                ('clf', svm.LinearSVC()),
                ])

# Specify parameters and distributions to sample from
# Parameters of pipelines can be set using ‘__’ separated parameter names:
param_dist = {"vect__max_features": [1000, 2500, 5000, 7500, 10000, None], 
              "vect__stop_words": ['english', None], 
              "clf__C": [.1, .5, 1., 1.5, 2.]}

# Define randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(clf, param_distributions=param_dist, n_iter=n_iter_search)

# Run the randomized search
random_search.fit(twenty_train.data, twenty_train.target)

print("Done!")

Done!


In [16]:
# Load dictionary of search results to a Pandas dataframe
import pandas as pd

df_cv_results = pd.DataFrame.from_dict(random_search.cv_results_)
df_cv_results

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_clf__C,param_vect__max_features,param_vect__stop_words,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,0.435259,0.186195,0.834293,0.978734,0.5,2500.0,english,"{u'clf__C': 0.5, u'vect__max_features': 2500, ...",5,0.841965,0.978723,0.828685,0.980718,0.832224,0.97676,0.008963,0.010124,0.005617,0.001616
1,0.433495,0.190499,0.781568,0.899203,0.1,1000.0,,"{u'clf__C': 0.1, u'vect__max_features': 1000, ...",10,0.804781,0.895612,0.772908,0.902926,0.766977,0.89907,0.009889,0.011081,0.016602,0.002987
2,0.460868,0.182298,0.859548,0.982499,1.0,7500.0,,"{u'clf__C': 1.0, u'vect__max_features': 7500, ...",3,0.869854,0.982048,0.847278,0.984043,0.861518,0.981408,0.003911,0.010476,0.009325,0.001122
3,0.441299,0.188977,0.782898,0.974302,1.5,1000.0,english,"{u'clf__C': 1.5, u'vect__max_features': 1000, ...",9,0.796813,0.972739,0.786189,0.975399,0.765646,0.974768,0.008202,0.010658,0.012932,0.001135
4,0.457468,0.189954,0.787328,0.977182,1.5,1000.0,,"{u'clf__C': 1.5, u'vect__max_features': 1000, ...",8,0.802125,0.975399,0.791501,0.978723,0.768309,0.977424,0.006002,0.010357,0.014114,0.001368
5,0.484521,0.18452,0.86132,0.982499,1.5,10000.0,,"{u'clf__C': 1.5, u'vect__max_features': 10000,...",1,0.869854,0.982048,0.849934,0.984043,0.864181,0.981408,0.008559,0.009678,0.008383,0.001122
6,0.454389,0.191902,0.792645,0.970978,1.0,1000.0,,"{u'clf__C': 1.0, u'vect__max_features': 1000, ...",7,0.800797,0.968085,0.799469,0.972074,0.77763,0.972776,0.00612,0.010315,0.010617,0.002066
7,0.458575,0.183262,0.860434,0.98117,0.5,,,"{u'clf__C': 0.5, u'vect__max_features': None, ...",2,0.868526,0.980053,0.856574,0.983378,0.856192,0.98008,0.009978,0.009732,0.005728,0.001561
8,0.450393,0.183927,0.828977,0.951043,0.1,,,"{u'clf__C': 0.1, u'vect__max_features': None, ...",6,0.843293,0.954122,0.816733,0.951463,0.826897,0.947543,0.007579,0.009494,0.010947,0.002702
9,0.490448,0.18391,0.85689,0.982499,1.5,7500.0,,"{u'clf__C': 1.5, u'vect__max_features': 7500, ...",4,0.864542,0.982048,0.844622,0.984043,0.861518,0.981408,0.00976,0.010077,0.008768,0.001122


In [17]:
# Score & evaluate test data using the best estimator

text_clf_svm = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=10000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', svm.LinearSVC(C=1.5)),
                    ])

_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)

predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)            

0.81424766977363516

## Aditional metrics for multiclass classification

In [18]:
from sklearn import metrics

print(metrics.classification_report(twenty_test.target, 
                                    predicted,
                                    target_names=twenty_test.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.76      0.61      0.68       319
         comp.graphics       0.82      0.92      0.87       389
               sci.med       0.88      0.85      0.86       396
soc.religion.christian       0.78      0.84      0.81       398

           avg / total       0.81      0.81      0.81      1502



In [19]:
metrics.confusion_matrix(twenty_test.target, predicted)

array([[196,  22,  24,  77],
       [ 16, 356,  14,   3],
       [ 14,  32, 337,  13],
       [ 32,  22,  10, 334]])