In [153]:
import sys

import numpy as np
from sklearn import metrics
# NOTE(review): later scikit-learn releases moved `sklearn.cross_validation`
# into `sklearn.model_selection` -- update this import when upgrading.
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline

In [2]:
# The four newsgroups used for both the training and the test subset below.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [6]:
# Load the training split for the selected categories; the fixed seed makes
# the shuffled order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)



In [24]:
# Overview of the loaded bunch: available keys and the category names.
print(twenty_train.keys())
print(twenty_train.target_names)

# For the documents and for the labels: a banner, the size, and the first entry.
for section_title, section_values in (('TRAIN', twenty_train.data),
                                      ('TARGET', twenty_train.target)):
    print('{:*^50}'.format(section_title))
    print(len(section_values))
    print(section_values[0])

# Remaining metadata: the dataset description and the first source file path.
print(twenty_train.DESCR)
print(twenty_train.filenames[0])

['target_names', 'data', 'target', 'DESCR', 'filenames']
['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
**********************TRAIN***********************
2257
From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.

Please email any response.

Is this the correct group?

Thanks in advance.  Michael.
-- 
Michael Collier (Programmer)                 The Computer Unit,
Email: M.P.Collier@uk.ac.city                The City University,
Tel: 071 477-8000 x3769                      London,
Fax: 071 477-8565                            EC1V 0HB.

**********************TARGET**********************
2257
1
None
/home/davidheryanto/scikit_learn_data/20news_home/20news-bydate-tra

In [29]:
# First three header lines of the first post.
first_post_lines = twenty_train.data[0].split('\n')
print('\n'.join(first_post_lines[:3]))
print('*' * 50)
# Category names of the first three posts (targets are indices into target_names).
for label_idx in twenty_train.target[:3]:
    print(twenty_train.target_names[label_idx])

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
**************************************************
comp.graphics
comp.graphics
soc.religion.christian


In [30]:
from sklearn.feature_extraction.text import CountVectorizer

In [41]:
# Fit a bag-of-words vectorizer on the training posts and build the
# document-term count matrix in the same call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape)
# Display the instance attributes of the returned matrix object.
vars(X_train_counts)

(2257, 35788)


{'_shape': (2257, 35788),
 'data': array([1, 1, 4, ..., 1, 1, 1]),
 'format': 'csr',
 'indices': array([14887, 29022,  8696, ...,  1298,  2375,  3921], dtype=int32),
 'indptr': array([     0,     73,    175, ..., 364799, 365812, 365886], dtype=int32),
 'maxprint': 50}

In [43]:
# Look up the value stored for the token 'algorithm' in the fitted vocabulary
# mapping (`.get` yields None when the token is absent).
count_vect.vocabulary_.get(u'algorithm')

4690

In [44]:
from sklearn.feature_extraction.text import TfidfTransformer

In [46]:
# Learn the idf weights from the raw counts, then re-weight the same matrix.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print(X_train_tfidf.shape)

(2257, 35788)


In [65]:
# First ten entries of the arrays backing the count matrix, followed by the
# first ten tf-idf weights.
for backing_array in (X_train_counts.indptr,
                      X_train_counts.indices,
                      X_train_counts.data,
                      X_train_tfidf.data):
    print(backing_array[:10])

[   0   73  175  458  520  630  799  950 1236 1325]
[14887 29022  8696  4017 33256 21661  9031 31077  9805 17366]
[1 1 4 2 2 3 3 1 2 1]
[ 0.13487106  0.13487106  0.13487106  0.06666452  0.10960586  0.13487106
  0.10783603  0.24645541  0.25612026  0.08631915]


In [74]:
# Show ten tokens from the fitted vocabulary.
# `list(...)` is used instead of `.keys()[:10]` because on Python 3 `keys()`
# returns a view that cannot be sliced; on Python 2 the result is identical.
print(list(count_vect.vocabulary_)[:10])

[u'3ds2scn', u'schlegel', u'tilton', u'circuitry', u'pantheistic', u'mdbs', u'hanging', u'woody', u'localized', u'sation']


In [75]:
from sklearn.naive_bayes import MultinomialNB

In [79]:
# Shape of the flat `data` array of the tf-idf matrix (one slot per stored entry).
X_train_tfidf.data.shape

(365886,)

In [81]:
# Train a multinomial naive Bayes classifier on the tf-idf features.
clf = MultinomialNB().fit(
    X_train_tfidf,
    twenty_train.target,
)

In [82]:
# Two unseen sentences to classify.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]
# Reuse the already-fitted vectorizer and transformer: `transform` only, so
# the new documents are mapped into the training feature space.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

In [83]:
# Pair every new document with the name of its predicted category.
for doc_idx, doc in enumerate(docs_new):
    category = predicted[doc_idx]
    print(doc, twenty_train.target_names[category])

('God is love', 'soc.religion.christian')
('OpenGL on the GPU is fast', 'comp.graphics')


In [84]:
# Using pipeline: chain vectorizer -> tf-idf -> classifier into one estimator
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [88]:
# Fit the whole pipeline on the raw training text, then classify the new docs.
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_new)
predicted_names = [twenty_train.target_names[label_idx] for label_idx in predicted]
print(predicted_names)

['soc.religion.christian', 'comp.graphics']


In [90]:
# Load the held-out test split with the same categories and seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
predicted = text_clf.predict(twenty_test.data)
# Accuracy: fraction of test posts whose predicted label matches the target.
np.mean(predicted == twenty_test.target)

0.83488681757656458

In [91]:
# Use SVM
from sklearn.linear_model import SGDClassifier

In [94]:
# Same feature extraction as before, with the classifier swapped for an
# SGD-trained linear model using hinge loss and L2 regularisation.
# Floor division (`//`) keeps the iteration count an integer on Python 3 too;
# plain `/` would pass a float there. On Python 2 the value is unchanged.
# NOTE(review): newer scikit-learn releases replace `n_iter` with `max_iter`;
# rename the keyword when upgrading.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          n_iter=10**6 // len(twenty_train.data),
                          random_state=42)),
])

In [95]:
# Train on the training split and classify the test split in one chain
# (`fit` returns the fitted pipeline itself).
predicted = text_clf.fit(twenty_train.data, twenty_train.target).predict(twenty_test.data)
# Accuracy on the test split.
np.mean(predicted == twenty_test.target)

0.9127829560585885

In [97]:
from sklearn import metrics

# Per-category precision / recall / f1 on the test split.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.82      0.88       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.95      0.89      0.92       396
soc.religion.christian       0.89      0.95      0.92       398

           avg / total       0.92      0.91      0.91      1502



In [98]:
# Confusion matrix for the test split. True labels are passed first, predicted
# labels second; each row of the displayed result sums to the per-category
# support reported by the classification report.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[260,  10,  12,  37],
       [  3, 380,   2,   4],
       [  5,  35, 351,   5],
       [  5,  10,   3, 380]])

In [102]:
# First 50 feature names known to the fitted vectorizer.
# NOTE(review): recent scikit-learn releases provide `get_feature_names_out()`
# in place of this method -- confirm against the installed version.
print(count_vect.get_feature_names()[:50])

[u'00', u'000', u'0000', u'0000001200', u'000005102000', u'0001', u'000100255pixel', u'00014', u'000406', u'0007', u'000usd', u'0010', u'001004', u'0010580b', u'001125', u'001200201pixel', u'0014', u'001642', u'00196', u'002', u'0028', u'003258u19250', u'0033', u'0038', u'0039', u'004021809', u'004158', u'004627', u'0049', u'00500', u'005148', u'00630', u'008561', u'0094', u'00am', u'00index', u'00pm', u'01', u'0100', u'010116', u'010702', u'011255', u'011308pxf3', u'011605', u'011720', u'012019', u'012536', u'012946', u'013', u'013034']


In [118]:
# Pair every feature name with its total count over all training posts
# (column sums of the count matrix, flattened to a 1-D array).
# The pairs are materialised into a list because later cells slice `tf` and
# iterate it more than once; on Python 3 a bare `zip` is a one-shot iterator.
tf = list(zip(count_vect.get_feature_names(),
              np.asarray(X_train_counts.sum(axis=0)).ravel()))

In [126]:
from operator import itemgetter

# First 20 (term, count) pairs in their existing order ...
print(tf[:20])
print('*' * 70)
# ... then the 20 pairs with the highest counts.
tf_by_count = sorted(tf, key=itemgetter(1), reverse=True)
print(tf_by_count[:20])

[(u'00', 134), (u'000', 92), (u'0000', 1), (u'0000001200', 2), (u'000005102000', 1), (u'0001', 3), (u'000100255pixel', 1), (u'00014', 1), (u'000406', 1), (u'0007', 1), (u'000usd', 2), (u'0010', 1), (u'001004', 1), (u'0010580b', 3), (u'001125', 1), (u'001200201pixel', 1), (u'0014', 1), (u'001642', 2), (u'00196', 1), (u'002', 3)]
**********************************************************************
[(u'the', 29825), (u'of', 17660), (u'to', 17222), (u'and', 13017), (u'is', 12343), (u'in', 11226), (u'that', 10723), (u'it', 8447), (u'you', 6157), (u'for', 5962), (u'not', 5285), (u'this', 5063), (u'be', 4898), (u'from', 4877), (u'are', 4769), (u'edu', 4675), (u'have', 4218), (u'as', 4015), (u'on', 3709), (u'with', 3655)]


In [142]:
# Grid search
from sklearn.grid_search import GridSearchCV
parameters = {'vect__ngram_range': [(1,1), (1,2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [143]:
# Exhaustive search over `parameters` for the text pipeline; n_jobs=-1 asks
# for all available cores.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
)

In [144]:
# Run the search on the first 500 training posts only.
n_search_docs = 500
gs_clf = gs_clf.fit(
    twenty_train.data[:n_search_docs],
    twenty_train.target[:n_search_docs],
)

In [187]:
# Display the score record of every parameter combination tried by the search.
# NOTE(review): newer scikit-learn releases expose `cv_results_` instead of
# `grid_scores_` -- confirm against the installed version.
gs_clf.grid_scores_

[mean: 0.89800, std: 0.01797, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.01},
 mean: 0.90600, std: 0.01011, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 0.01},
 mean: 0.81200, std: 0.03563, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'clf__alpha': 0.01},
 mean: 0.83200, std: 0.03457, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.01},
 mean: 0.91600, std: 0.00960, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.001},
 mean: 0.91600, std: 0.00826, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__alpha': 0.001},
 mean: 0.86400, std: 0.00713, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': False, 'clf__alpha': 0.001},
 mean: 0.88400, std: 0.01207, params: {'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'clf__alpha': 0.001}]

In [172]:
# Look at the first score record: the whole record, then its first two
# positional fields.
first_result = gs_clf.grid_scores_[0]
print(first_result)
print(first_result[0])
print(first_result[1])

mean: 0.89800, std: 0.01797, params: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.01}
{'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'clf__alpha': 0.01}
0.898


In [181]:
# Field names of a score record (it behaves like a named tuple).
gs_clf.grid_scores_[0]._fields

('parameters', 'mean_validation_score', 'cv_validation_scores')

In [170]:
# Concrete class of a single score record.
type(gs_clf.grid_scores_[0])

sklearn.grid_search._CVScoreTuple

In [184]:
# Select the score record with the highest mean validation score.
# The record's named fields are read directly instead of unpacking into `_`:
# assigning to `_` in an IPython session shadows the automatic "last output"
# variable for the rest of the session.
best_result = max(gs_clf.grid_scores_, key=lambda result: result.mean_validation_score)
best_parameters = best_result.parameters
score = best_result.mean_validation_score

# Report the winning value of every searched parameter, in name order.
for param_name in sorted(parameters):
    print(param_name, best_parameters[param_name])
print('score', score)

('clf__alpha', 0.001)
('tfidf__use_idf', True)
('vect__ngram_range', (1, 1))
('score', 0.91600000000000004)
