In [58]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',shuffle=False)

twenty_train.data[0]

"From: cubbie@garnet.berkeley.edu (                               )\nSubject: Re: Cubs behind Marlins? How?\nArticle-I.D.: agate.1pt592$f9a\nOrganization: University of California, Berkeley\nLines: 12\nNNTP-Posting-Host: garnet.berkeley.edu\n\n\ngajarsky@pilot.njin.net writes:\n\nmorgan and guzman will have era's 1 run higher than last year, and\n the cubs will be idiots and not pitch harkey as much as hibbard.\n castillo won't be good (i think he's a stud pitcher)\n\n       This season so far, Morgan and Guzman helped to lead the Cubs\n       at top in ERA, even better than THE rotation at Atlanta.\n       Cubs ERA at 0.056 while Braves at 0.059. We know it is early\n       in the season, we Cubs fans have learned how to enjoy the\n       short triumph while it is still there.\n"

In [59]:
#efficient version
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([('vect',CountVectorizer()),
                    ('tfidf',TfidfTransformer()),
                    ('clf',MultinomialNB())])



text_clf = text_clf.fit(twenty_train.data,twenty_train.target)

### Construct NLTK stemer

In [60]:
import nltk
# nltk.download()
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)
class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])
stemmed_count_vect = StemmedCountVectorizer(stop_words='english')

## Fully loaded Pipeline

In [61]:
text_clf_svm = Pipeline([('vect',stemmed_count_vect),
                    ('tfidf',TfidfTransformer()),
                    ('clf_svm',SGDClassifier(loss="hinge",max_iter=1000,random_state=42))]) #can also pass MultiNomialNB

text_clf_svm = text_clf_svm.fit(twenty_train.data,twenty_train.target)

### PRedictions before grid search

In [62]:
import numpy as np
twenty_test = fetch_20newsgroups(subset='test',shuffle=True)
predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.8502389803505045

### Refine with GridSearch

In [64]:
from sklearn.model_selection import GridSearchCV
"""
best params: default params
{'clf_svm__alpha': 0.001,
 'clf_svm__early_stopping': False,
 'clf_svm__penalty': 'l2',
 'vect__ngram_range': (1, 2)}
"""
params_svm= {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf':(True,False),
    'clf_svm__penalty':['l2'],
    'clf_svm__alpha':[.001,.0005],
    'clf_svm__early_stopping':(False,)
}

gs_clf = GridSearchCV(text_clf_svm,params_svm,n_jobs=-1,verbose = 2)
gs_clf = gs_clf.fit(twenty_train.data,twenty_train.target)
gs_clf.best_params_

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
gs_clf.best_score_

In [None]:
twenty_test = fetch_20newsgroups(subset='test',shuffle=True)
predicted = gs_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)

0.8311205523101434

In [None]:
import time
import random
count = 1
start = time.time()
for i in range(10**7):
    count+=random.random()
print(time.time()-start)

1.0295655727386475
