In [None]:
import collections

import numpy as np
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.svm import LinearSVC

We load the training subset of the 20 Newsgroups dataset.

In [None]:
# Load the 'train' subset; shuffling uses a fixed seed so the document order is repeatable.
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Inspect the type of the object returned by fetch_20newsgroups.
type(twenty_train)

sklearn.utils.Bunch

The dataset is stored in a scikit-learn `Bunch` (a dictionary-like object). We look at its keys.

In [None]:
# List the fields available on the loaded dataset object.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [None]:
# Category names; the integer labels in `twenty_train.target` are used as indices into this list.
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [None]:
# Show the raw text of the first document (headers included).
twenty_train.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [None]:
# Print only the first ten lines of the first document, one per output line.
print(*twenty_train.data[0].split("\n")[:10], sep="\n")

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 


In [None]:
# Translate the first document's integer label into its category name.
print(twenty_train.target_names[twenty_train.target[0]])

rec.autos


In [None]:
# Category names of the first ten documents, one per line.
print("\n".join(twenty_train.target_names[label] for label in twenty_train.target[:10]))

rec.autos
comp.sys.mac.hardware
comp.sys.mac.hardware
comp.graphics
sci.space
talk.politics.guns
sci.med
comp.sys.ibm.pc.hardware
comp.os.ms-windows.misc
comp.sys.mac.hardware


In [None]:
# Check which container type holds the labels.
type(twenty_train.target)

numpy.ndarray

In [None]:
# Number of training documents per integer label (class balance check).
# `collections` is imported in the notebook's top import cell.
collections.Counter(twenty_train.target)

Counter({0: 480,
         1: 584,
         2: 591,
         3: 590,
         4: 578,
         5: 593,
         6: 585,
         7: 594,
         8: 598,
         9: 597,
         10: 600,
         11: 595,
         12: 591,
         13: 594,
         14: 593,
         15: 599,
         16: 546,
         17: 564,
         18: 465,
         19: 377})

In [None]:
def evaluate_model(model,X,y):
  """Fit ``model`` on an 80/20 split of ``(X, y)`` and print both scores.

  The split comes from ``train_test_split`` with ``random_state=0``, so it is
  the same on every call. ``model`` is fitted in place on the 80% part and
  ``model.score`` is reported for the fitted part and for the remaining 20%.
  Nothing is returned.
  """
  docs_fit,docs_held_out,labels_fit,labels_held_out=train_test_split(
      X, y, train_size=0.8, test_size=0.2, random_state=0)
  model.fit(docs_fit,labels_fit)
  # One (message template, documents, labels) row per report line, in print order.
  reports=(
      ("Accuracy score on training set {}",docs_fit,labels_fit),
      ("Accuracy score on test set {}",docs_held_out,labels_held_out),
  )
  for template,docs,labels in reports:
    print(template.format(model.score(docs,labels)))

def predict_category(model,X_test,target_names=None):
  """Print the predicted category name for every document in ``X_test``.

  Parameters
  ----------
  model : fitted estimator exposing ``predict``.
  X_test : iterable of documents; passed as-is to ``model.predict``.
  target_names : optional sequence indexed by the predicted label. When
    omitted, the notebook-level ``twenty_train.target_names`` is used, which
    keeps existing two-argument calls working.

  Prints one ``<document> => <category name>`` line per document and returns
  nothing.
  """
  if target_names is None:
    # Backward-compatible default: label names of the training dataset.
    target_names=twenty_train.target_names
  predicted=model.predict(X_test)
  for doc,category in zip(X_test,predicted):
    print('{} => {}'.format(doc,target_names[category]))

def evaluation_test(model,X_test,y_test,target_names=None):
  """Print accuracy and a per-class classification report for ``model``.

  Parameters
  ----------
  model : fitted estimator exposing ``predict``.
  X_test : documents to classify.
  y_test : true labels, compared element-wise with the predictions.
  target_names : optional display names for the classes in the report. When
    omitted, the notebook-level ``twenty_test.target_names`` is used, so
    ``twenty_test`` must already be loaded in that case.

  Returns nothing; both results are printed.
  """
  if target_names is None:
    # Backward-compatible default: label names of the test dataset.
    target_names=twenty_test.target_names
  predicted = model.predict(X_test)
  accuracy = np.mean(predicted == y_test)
  # The mean is a fraction in [0, 1]; scale it so the value matches the '%' sign.
  print("Accuracy = {:.2f} %".format(100 * accuracy))
  print(metrics.classification_report(y_test, predicted,
    target_names=target_names))

def model_optimisation(model,grid_params,data=None,target=None):
  """Grid-search ``grid_params`` for ``model`` and print the best result.

  Parameters
  ----------
  model : estimator (e.g. a Pipeline) handed to ``GridSearchCV``.
  grid_params : parameter grid, passed as ``param_grid``.
  data, target : optional documents and labels to search on. Pass both or
    neither; when omitted, the notebook-level ``X`` and ``y`` are used, which
    keeps existing two-argument calls working.

  The search runs 5-fold cross-validation on the 80% part of a
  ``train_test_split`` with ``random_state=0``, the same split settings as
  ``evaluate_model``. The best parameters and best score are printed; nothing
  is returned.
  """
  features=X if data is None else data
  labels=y if target is None else target
  # Only the 80% part is searched; the 20% part of the split is not used here.
  X_train,_,y_train,_=train_test_split(features, labels, train_size=0.8, test_size=0.2,
                                       random_state=0)
  model_gs=GridSearchCV(model,param_grid=grid_params,cv=5,verbose=3,n_jobs=-1)

  model_gs.fit(X_train,y_train)
  print("Best parameters {}".format(model_gs.best_params_))
  print("Best score: {:.3f}".format(model_gs.best_score_))



We store the features (data) in X and the target in y.

In [None]:
# X: raw documents, y: their integer category labels.
X, y = twenty_train.data, twenty_train.target

# Multinomial Naive Bayes Classifier

We use a `CountVectorizer` with `min_df=5`, i.e. terms that appear in fewer than 5 documents are ignored.

In [None]:
# Raw term counts (min_df=5) feeding a Multinomial Naive Bayes classifier.
cv_NB_pipe = Pipeline(steps=[
    ('cvec', CountVectorizer(min_df=5)),
    ('nb', MultinomialNB()),
])
evaluate_model(cv_NB_pipe, X, y)

Accuracy score on training set 0.9173571980996575
Accuracy score on test set 0.8515245249668582


In [None]:
# Two hand-written sentences used as a quick sanity check of each classifier.
docs_new = [
    'God is love',
    'Go and see a doctor',
]
predict_category(cv_NB_pipe, docs_new)

God is love => soc.religion.christian
Go and see a doctor => sci.med


In [None]:
# Same classifier as before, but on TF-IDF weighted features instead of raw counts.
tfidf_NB_pipe = Pipeline(steps=[
    ('tvec', TfidfVectorizer(min_df=5)),
    ('nb', MultinomialNB()),
])
evaluate_model(tfidf_NB_pipe, X, y)

Accuracy score on training set 0.9360291680477295
Accuracy score on test set 0.862129916040654


In [None]:
# Classify the same two sample sentences with the TF-IDF Naive Bayes pipeline.
predict_category(tfidf_NB_pipe,docs_new)

God is love => soc.religion.christian
Go and see a doctor => sci.med


# Linear Support Vector Machine classifier

In [None]:
# Counts -> TF-IDF weighting -> linear SVM.
# random_state is fixed so repeated runs fit the same model; LinearSVC uses it
# when shuffling data for the dual coordinate-descent solver.
SVM_clf=Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=0))
  ])
evaluate_model(SVM_clf,X,y)
predict_category(SVM_clf,docs_new)

Accuracy score on training set 0.9993370898243288
Accuracy score on test set 0.9270879363676535
God is love => soc.religion.christian
Go and see a doctor => sci.med


# Evaluation on a new dataset

We load the held-out test subset of the dataset, which is used to evaluate the models.

In [None]:
# Load the 'test' subset with the same shuffle settings as the training subset.
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

Evaluation of the accuracy for Multinomial Naive Bayes with standard Count Vectorizer.

In [None]:
# Evaluate the count-based Naive Bayes pipeline on the test subset.
# The pipeline was last fitted by evaluate_model, i.e. on the 80% split of the training data.
evaluation_test(cv_NB_pipe,twenty_test.data,twenty_test.target)

Accuracy = 0.781731 %
                          precision    recall  f1-score   support

             alt.atheism       0.73      0.82      0.77       319
           comp.graphics       0.55      0.82      0.66       389
 comp.os.ms-windows.misc       0.20      0.00      0.01       394
comp.sys.ibm.pc.hardware       0.52      0.79      0.63       392
   comp.sys.mac.hardware       0.76      0.81      0.78       385
          comp.windows.x       0.75      0.75      0.75       395
            misc.forsale       0.83      0.82      0.82       390
               rec.autos       0.83      0.91      0.87       396
         rec.motorcycles       0.90      0.93      0.92       398
      rec.sport.baseball       0.93      0.90      0.91       397
        rec.sport.hockey       0.96      0.95      0.95       399
               sci.crypt       0.88      0.89      0.89       396
         sci.electronics       0.75      0.70      0.72       393
                 sci.med       0.90      0.80      0.

Evaluation of the accuracy for Multinomial Naive Bayes with the TF-IDF vectorizer (`TfidfVectorizer`, equivalent to a Count Vectorizer followed by a TF-IDF transformer).

In [None]:
# Evaluate the TF-IDF Naive Bayes pipeline on the test subset.
# The pipeline was last fitted by evaluate_model, i.e. on the 80% split of the training data.
evaluation_test(tfidf_NB_pipe,twenty_test.data,twenty_test.target)

Accuracy = 0.791822 %
                          precision    recall  f1-score   support

             alt.atheism       0.78      0.63      0.70       319
           comp.graphics       0.78      0.68      0.73       389
 comp.os.ms-windows.misc       0.75      0.72      0.73       394
comp.sys.ibm.pc.hardware       0.67      0.77      0.72       392
   comp.sys.mac.hardware       0.83      0.80      0.81       385
          comp.windows.x       0.85      0.76      0.81       395
            misc.forsale       0.89      0.78      0.83       390
               rec.autos       0.84      0.92      0.88       396
         rec.motorcycles       0.96      0.90      0.93       398
      rec.sport.baseball       0.92      0.89      0.91       397
        rec.sport.hockey       0.88      0.97      0.92       399
               sci.crypt       0.74      0.94      0.83       396
         sci.electronics       0.81      0.63      0.70       393
                 sci.med       0.86      0.82      0.

Evaluation of the accuracy for the Linear SVM.

In [None]:
# Evaluate the linear SVM pipeline on the test subset.
# The pipeline was last fitted by evaluate_model, i.e. on the 80% split of the training data.
evaluation_test(SVM_clf,twenty_test.data,twenty_test.target)

Accuracy = 0.846389 %
                          precision    recall  f1-score   support

             alt.atheism       0.79      0.78      0.79       319
           comp.graphics       0.74      0.79      0.77       389
 comp.os.ms-windows.misc       0.76      0.74      0.75       394
comp.sys.ibm.pc.hardware       0.70      0.76      0.73       392
   comp.sys.mac.hardware       0.82      0.84      0.83       385
          comp.windows.x       0.86      0.74      0.80       395
            misc.forsale       0.84      0.90      0.87       390
               rec.autos       0.92      0.89      0.91       396
         rec.motorcycles       0.95      0.95      0.95       398
      rec.sport.baseball       0.92      0.95      0.93       397
        rec.sport.hockey       0.96      0.98      0.97       399
               sci.crypt       0.92      0.93      0.93       396
         sci.electronics       0.81      0.79      0.80       393
                 sci.med       0.90      0.87      0.

# Model optimisation with Grid Search CV

In [None]:
# Grid over the 'cvec' pipeline step: minimum document frequency and n-gram range.
cvec_params = dict(
    cvec__min_df=[1, 5, 10],
    cvec__ngram_range=[(1, 1), (1, 2)],
)
model_optimisation(cv_NB_pipe, cvec_params)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  2.5min finished


Best parameters {'cvec__min_df': 5, 'cvec__ngram_range': (1, 1)}
Best score: 0.842
