In [2]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import text_normalizer_el as tn # greek
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# to prevent the model from overfitting or not generalizing well - parameter remove
data = fetch_20newsgroups(subset='all', shuffle=True,
                          remove=('headers', 'footers', 'quotes'))
data_labels_map = dict(enumerate(data.target_names))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
# We now leverage the helper function from Scikit-Learn to fetch the required data.
# Once we get the data, we transform this data into an easy-to-use dataframe
corpus, target_labels, target_names = (data.data, data.target, 
                                       [data_labels_map[label] for label in data.target])
data_df = pd.DataFrame({'Article': corpus, 'Target Label': target_labels, 'Target Name': target_names})
print(data_df.shape)
data_df.head(10)

(18846, 3)


Unnamed: 0,Article,Target Label,Target Name
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware
5,\n\nBack in high school I worked as a lab assi...,12,sci.electronics
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,4,comp.sys.mac.hardware
7,"\n[stuff deleted]\n\nOk, here's the solution t...",10,rec.sport.hockey
8,"\n\n\nYeah, it's the second one. And I believ...",10,rec.sport.hockey
9,\nIf a Christian means someone who believes in...,19,talk.religion.misc


In [20]:
# empty documents in our dataset and remove them
total_nulls = data_df[data_df.Article.str.strip() == ''].shape[0]
print("Empty documents:", total_nulls)

Empty documents: 0


In [21]:
data_df = data_df[~(data_df.Article.str.strip() == '')]
data_df.shape

(18301, 4)

In [3]:
# import nltk
# stopword_list = nltk.corpus.stopwords.words('greek')
# print (stopword_list)

['αλλα', 'αν', 'αντι', 'απο', 'αυτα', 'αυτεσ', 'αυτη', 'αυτο', 'αυτοι', 'αυτοσ', 'αυτουσ', 'αυτων', 'αἱ', 'αἳ', 'αἵ', 'αὐτόσ', 'αὐτὸς', 'αὖ', 'γάρ', 'γα', 'γα^', 'γε', 'για', 'γοῦν', 'γὰρ', "δ'", 'δέ', 'δή', 'δαί', 'δαίσ', 'δαὶ', 'δαὶς', 'δε', 'δεν', "δι'", 'διά', 'διὰ', 'δὲ', 'δὴ', 'δ’', 'εαν', 'ειμαι', 'ειμαστε', 'ειναι', 'εισαι', 'ειστε', 'εκεινα', 'εκεινεσ', 'εκεινη', 'εκεινο', 'εκεινοι', 'εκεινοσ', 'εκεινουσ', 'εκεινων', 'ενω', 'επ', 'επι', 'εἰ', 'εἰμί', 'εἰμὶ', 'εἰς', 'εἰσ', 'εἴ', 'εἴμι', 'εἴτε', 'η', 'θα', 'ισωσ', 'κ', 'καί', 'καίτοι', 'καθ', 'και', 'κατ', 'κατά', 'κατα', 'κατὰ', 'καὶ', 'κι', 'κἀν', 'κἂν', 'μέν', 'μή', 'μήτε', 'μα', 'με', 'μεθ', 'μετ', 'μετά', 'μετα', 'μετὰ', 'μη', 'μην', 'μἐν', 'μὲν', 'μὴ', 'μὴν', 'να', 'ο', 'οι', 'ομωσ', 'οπωσ', 'οσο', 'οτι', 'οἱ', 'οἳ', 'οἷς', 'οὐ', 'οὐδ', 'οὐδέ', 'οὐδείσ', 'οὐδεὶς', 'οὐδὲ', 'οὐδὲν', 'οὐκ', 'οὐχ', 'οὐχὶ', 'οὓς', 'οὔτε', 'οὕτω', 'οὕτως', 'οὕτωσ', 'οὖν', 'οὗ', 'οὗτος', 'οὗτοσ', 'παρ', 'παρά', 'παρα', 'παρὰ', 'περί', 'περὶ', 'πο

In [22]:
# general text preprocessing or wrangling stage.This involves cleaning, preprocessing, and normalizing text to bring text
# components like sentences, phrases, and words to some standard format
import nltk
stopword_list = nltk.corpus.stopwords.words('greek')
print (stopword_list)
# just to keep negation if any in bi-grams
stopword_list.remove('όχι')
stopword_list.remove('δεν')

# normalize our corpus
norm_corpus = tn.normalize_corpus(corpus=data_df['Article'], html_stripping=True, contraction_expansion=True, 
                                  accented_char_removal=True, text_lower_case=True, text_lemmatization=True, 
                                  text_stemming=False, special_char_removal=True, remove_digits=True,
                                  stopword_removal=True, stopwords=stopword_list)
data_df['Clean Article'] = norm_corpus

In [23]:
# view sample data
data_df = data_df[['Article', 'Clean Article', 'Target Label', 'Target Name']]
data_df.head(10)
# The 20 Newsgroups dataset after text preprocessing

Unnamed: 0,Article,Clean Article,Target Label,Target Name
0,\n\nI am sure some bashers of Pens fans are pr...,sure basher pens fan pretty confused lack kind...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,brother market high performance video card sup...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,finally say dream mediterranean new area great...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,think scsi card dma transfer not disk scsi car...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,old jasmine drive not use new system understan...,4,comp.sys.mac.hardware
5,\n\nBack in high school I worked as a lab assi...,back high school work lab assistant bunch expe...,12,sci.electronics
6,\n\nAE is in Dallas...try 214/241-6060 or 214/...,ae dallas try tech support may line one get start,4,comp.sys.mac.hardware
7,"\n[stuff deleted]\n\nOk, here's the solution t...",stuff delete ok solution problem move canada y...,10,rec.sport.hockey
8,"\n\n\nYeah, it's the second one. And I believ...",yeah second one believe price try get good loo...,10,rec.sport.hockey
9,\nIf a Christian means someone who believes in...,christian mean someone believe divinity jesus ...,19,talk.religion.misc


In [24]:
data_df = data_df.replace(r'^(\s?)+$', np.nan, regex=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18301 entries, 0 to 18300
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Article        18301 non-null  object
 1   Clean Article  18301 non-null  object
 2   Target Label   18301 non-null  int64 
 3   Target Name    18301 non-null  object
dtypes: int64(1), object(3)
memory usage: 714.9+ KB


In [25]:
# We definitely have some null articles after our preprocessing operation
data_df = data_df.dropna().reset_index(drop=True)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18301 entries, 0 to 18300
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Article        18301 non-null  object
 1   Clean Article  18301 non-null  object
 2   Target Label   18301 non-null  int64 
 3   Target Name    18301 non-null  object
dtypes: int64(1), object(3)
memory usage: 572.0+ KB


In [26]:
# store the dataset using the following code if needed so you don’t need to run the
# preprocessing step every time
data_df.to_csv('clean_newsgroups.csv', index=False)

In [27]:
data_df = pd.read_csv('clean_newsgroups.csv')

### Building Train and Test Datasets

In [28]:
# To build a machine learning system, we need to build our models on training data and
# then test and evaluate their performance on test data.
# train dataset : test dataset split of 67%/33% of the total data.
from sklearn.model_selection import train_test_split

train_corpus, test_corpus, train_label_nums, test_label_nums, train_label_names, test_label_names =\
                                 train_test_split(np.array(data_df['Clean Article']), np.array(data_df['Target Label']),
                                                       np.array(data_df['Target Name']), test_size=0.33, random_state=42)

train_corpus.shape, test_corpus.shape

((12261,), (6040,))

In [29]:
# Distribution of train and test articles by the 20 newsgroups
from collections import Counter

trd = dict(Counter(train_label_names))
tsd = dict(Counter(test_label_names))

(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd], 
             columns=['Target Label', 'Train Count', 'Test Count'])
.sort_values(by=['Train Count', 'Test Count'],
             ascending=False))

Unnamed: 0,Target Label,Train Count,Test Count
6,sci.crypt,675,287
8,comp.windows.x,669,311
1,comp.graphics,661,292
5,rec.motorcycles,658,311
2,soc.religion.christian,656,318
0,rec.sport.hockey,655,318
14,sci.electronics,645,311
15,comp.os.ms-windows.misc,642,304
18,sci.med,641,319
7,rec.sport.baseball,641,310


### Feature Engineering Techniques
In machine learning terminology, features are unique measurable attributes or
properties for each observation or data point in a dataset. Features are usually numeric in nature and can be absolute numeric values or categorical features that can be encoded as binary features for each category.

In [30]:
# Bag of Words Features with Classification Models
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

# build BOW features on train articles
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0)
cv_train_features = cv.fit_transform(train_corpus)

In [31]:
# transform test articles into features
cv_test_features = cv.transform(test_corpus)

In [32]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)

BOW model:> Train features shape: (12261, 74354)  Test features shape: (6040, 74354)


## Classification Models (with sklearn...)
Build several classifiers on these features using the training data and test
their performance on the test dataset and then check model accuracies.

### Naive Bayes Classifier

In [33]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB(alpha=1)
mnb.fit(cv_train_features, train_label_names)
mnb_bow_cv_scores = cross_val_score(mnb, cv_train_features, train_label_names, cv=5)
mnb_bow_cv_mean_score = np.mean(mnb_bow_cv_scores)
print('CV Accuracy (5-fold):', mnb_bow_cv_scores)
print('Mean CV Accuracy:', mnb_bow_cv_mean_score)
mnb_bow_test_score = mnb.score(cv_test_features, test_label_names)
print('Test Accuracy:', mnb_bow_test_score)

CV Accuracy (5-fold): [0.67713004 0.64600326 0.67495922 0.67128874 0.66598695]
Mean CV Accuracy: 0.6670736435526229
Test Accuracy: 0.6811258278145695


### Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(cv_train_features, train_label_names)
lr_bow_cv_scores = cross_val_score(lr, cv_train_features, train_label_names, cv=5)
lr_bow_cv_mean_score = np.mean(lr_bow_cv_scores)
print('CV Accuracy (5-fold):', lr_bow_cv_scores)
print('Mean CV Accuracy:', lr_bow_cv_mean_score)
lr_bow_test_score = lr.score(cv_test_features, test_label_names)
print('Test Accuracy:', lr_bow_test_score)

CV Accuracy (5-fold): [0.68202201 0.67128874 0.68515498 0.68760196 0.66965742]
Mean CV Accuracy: 0.6791450226742366
Test Accuracy: 0.6910596026490067


### Support Vector Machines

In [35]:
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(cv_train_features, train_label_names)
svm_bow_cv_scores = cross_val_score(svm, cv_train_features, train_label_names, cv=5)
svm_bow_cv_mean_score = np.mean(svm_bow_cv_scores)
print('CV Accuracy (5-fold):', svm_bow_cv_scores)
print('Mean CV Accuracy:', svm_bow_cv_mean_score)
svm_bow_test_score = svm.score(cv_test_features, test_label_names)
print('Test Accuracy:', svm_bow_test_score)

CV Accuracy (5-fold): [0.63065634 0.637031   0.65375204 0.6411093  0.637031  ]
Mean CV Accuracy: 0.6399159334144228
Test Accuracy: 0.6582781456953642


### SVM with Stochastic Gradient Descent

In [36]:
from sklearn.linear_model import SGDClassifier

svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
svm_sgd.fit(cv_train_features, train_label_names)
svmsgd_bow_cv_scores = cross_val_score(svm_sgd, cv_train_features, train_label_names, cv=5)
svmsgd_bow_cv_mean_score = np.mean(svmsgd_bow_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_bow_cv_scores)
print('Mean CV Accuracy:', svmsgd_bow_cv_mean_score)
svmsgd_bow_test_score = svm_sgd.score(cv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_bow_test_score)

CV Accuracy (5-fold): [0.64492458 0.63907015 0.63295269 0.64355628 0.6451876 ]
Mean CV Accuracy: 0.6411382606376718
Test Accuracy: 0.653476821192053


### Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(cv_train_features, train_label_names)
rfc_bow_cv_scores = cross_val_score(rfc, cv_train_features, train_label_names, cv=5)
rfc_bow_cv_mean_score = np.mean(rfc_bow_cv_scores)
print('CV Accuracy (5-fold):', rfc_bow_cv_scores)
print('Mean CV Accuracy:', rfc_bow_cv_mean_score)
rfc_bow_test_score = rfc.score(cv_test_features, test_label_names)
print('Test Accuracy:', rfc_bow_test_score)

CV Accuracy (5-fold): [0.5108031  0.51957586 0.5175367  0.52895595 0.52528548]
Mean CV Accuracy: 0.5204314189968803
Test Accuracy: 0.5347682119205298


### Gradient Boosting Machines

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(cv_train_features, train_label_names)
gbc_bow_cv_scores = cross_val_score(gbc, cv_train_features, train_label_names, cv=5)
gbc_bow_cv_mean_score = np.mean(gbc_bow_cv_scores)
print('CV Accuracy (5-fold):', gbc_bow_cv_scores)
print('Mean CV Accuracy:', gbc_bow_cv_mean_score)
gbc_bow_test_score = gbc.score(cv_test_features, test_label_names)
print('Test Accuracy:', gbc_bow_test_score)

# It is interesting to see that simpler models like Naοve Bayes and Logistic Regression
# performed much better than the ensemble models!!!

CV Accuracy (5-fold): [0.54830819 0.52732463 0.55628059 0.5632137  0.55872757]
Mean CV Accuracy: 0.5507709373414317
Test Accuracy: 0.5529801324503312


## TF-IDF Features with Classification Models
We use TF-IDF features to train our classification models. Assuming TF-IDF weighs down unimportant features, we might get better performing models. 

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

# build BOW features on train articles
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)
tv_train_features = tv.fit_transform(train_corpus)

In [41]:
# transform test articles into features
tv_test_features = tv.transform(test_corpus)

In [42]:
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

TFIDF model:> Train features shape: (12261, 74354)  Test features shape: (6040, 74354)


#### We now build several classifiers on these features using the training data and test their performance on the test dataset using all the classification models. We also check model accuracies using five-fold cross validation, just like we did earlier.

#### Naive Bayes

In [43]:
mnb = MultinomialNB(alpha=1)
mnb.fit(tv_train_features, train_label_names)
mnb_tfidf_cv_scores = cross_val_score(mnb, tv_train_features, train_label_names, cv=5)
mnb_tfidf_cv_mean_score = np.mean(mnb_tfidf_cv_scores)
print('CV Accuracy (5-fold):', mnb_tfidf_cv_scores)
print('Mean CV Accuracy:', mnb_tfidf_cv_mean_score)
mnb_tfidf_test_score = mnb.score(tv_test_features, test_label_names)
print('Test Accuracy:', mnb_tfidf_test_score)

CV Accuracy (5-fold): [0.71300448 0.68760196 0.7137031  0.71411093 0.71044046]
Mean CV Accuracy: 0.7077721856048691
Test Accuracy: 0.7115894039735099


#### Logistic Regression
(time inefficiency)

In [44]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1, random_state=42)
lr.fit(tv_train_features, train_label_names)
lr_tfidf_cv_scores = cross_val_score(lr, tv_train_features, train_label_names, cv=5)
lr_tfidf_cv_mean_score = np.mean(lr_tfidf_cv_scores)
print('CV Accuracy (5-fold):', lr_tfidf_cv_scores)
print('Mean CV Accuracy:', lr_tfidf_cv_mean_score)
lr_tfidf_test_score = lr.score(tv_test_features, test_label_names)
print('Test Accuracy:', lr_tfidf_test_score)

CV Accuracy (5-fold): [0.74276396 0.71818923 0.75040783 0.74551387 0.73694943]
Mean CV Accuracy: 0.7387648642771211
Test Accuracy: 0.7506622516556292


#### Support Vector Machines

In [49]:
svm = LinearSVC(penalty='l2', C=1, random_state=42)
svm.fit(tv_train_features, train_label_names)
svm_tfidf_cv_scores = cross_val_score(svm, tv_train_features, train_label_names, cv=5)
svm_tfidf_cv_mean_score = np.mean(svm_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svm_tfidf_cv_scores)
print('Mean CV Accuracy:', svm_tfidf_cv_mean_score)
svm_tfidf_test_score = svm.score(tv_test_features, test_label_names)
print('Test Accuracy:', svm_tfidf_test_score)

CV Accuracy (5-fold): [0.75540155 0.73898858 0.7593801  0.76835237 0.75163132]
Mean CV Accuracy: 0.7547507829079019
Test Accuracy: 0.7695364238410596


#### SVM with Stochastic Gradient Descent

In [45]:
svm_sgd = SGDClassifier(loss='hinge', penalty='l2', max_iter=5, random_state=42)
svm_sgd.fit(tv_train_features, train_label_names)
svmsgd_tfidf_cv_scores = cross_val_score(svm_sgd, tv_train_features, train_label_names, cv=5)
svmsgd_tfidf_cv_mean_score = np.mean(svmsgd_tfidf_cv_scores)
print('CV Accuracy (5-fold):', svmsgd_tfidf_cv_scores)
print('Mean CV Accuracy:', svmsgd_tfidf_cv_mean_score)
svmsgd_tfidf_test_score = svm_sgd.score(tv_test_features, test_label_names)
print('Test Accuracy:', svmsgd_tfidf_test_score)

CV Accuracy (5-fold): [0.75662454 0.73735726 0.76101142 0.77324633 0.75652529]
Mean CV Accuracy: 0.7569529670031503
Test Accuracy: 0.7675496688741722


#### Random Forest

In [46]:
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(tv_train_features, train_label_names)
rfc_tfidf_cv_scores = cross_val_score(rfc, tv_train_features, train_label_names, cv=5)
rfc_tfidf_cv_mean_score = np.mean(rfc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', rfc_tfidf_cv_scores)
print('Mean CV Accuracy:', rfc_tfidf_cv_mean_score)
rfc_tfidf_test_score = rfc.score(tv_test_features, test_label_names)
print('Test Accuracy:', rfc_tfidf_test_score)

CV Accuracy (5-fold): [0.52588667 0.51631321 0.52079935 0.52814029 0.52446982]
Mean CV Accuracy: 0.5231218689502949
Test Accuracy: 0.5413907284768212


#### Gradient Boosting

In [47]:
gbc = GradientBoostingClassifier(n_estimators=10, random_state=42)
gbc.fit(tv_train_features, train_label_names)
gbc_tfidf_cv_scores = cross_val_score(gbc, tv_train_features, train_label_names, cv=5)
gbc_tfidf_cv_mean_score = np.mean(gbc_tfidf_cv_scores)
print('CV Accuracy (5-fold):', gbc_tfidf_cv_scores)
print('Mean CV Accuracy:', gbc_tfidf_cv_mean_score)
gbc_tfidf_test_score = gbc.score(tv_test_features, test_label_names)
print('Test Accuracy:', gbc_tfidf_test_score)

CV Accuracy (5-fold): [0.54830819 0.53181077 0.56158238 0.57911909 0.55668842]
Mean CV Accuracy: 0.5555017693153305
Test Accuracy: 0.5556291390728477


### It’s interesting to see that the overall accuracy of several models (with TF-IDF) increases by quite a bit, including logistic regression, Naïve Bayes, and SVM. Interestingly, the ensemble models don’t perform as well. Using more estimators might improve them, but still wouldn’t be as good as the other models and it would take a huge amount of training time.

### Comparative Model Performance Evaluation
We can now do a nice comparison of all the models we have tried so far with the two different feature engineering techniques. We will build a dataframe from our modeling results and compare the results.

In [50]:
pd.DataFrame([['Naive Bayes', mnb_bow_cv_mean_score, mnb_bow_test_score, 
               mnb_tfidf_cv_mean_score, mnb_tfidf_test_score],
              ['Logistic Regression', lr_bow_cv_mean_score, lr_bow_test_score, 
               lr_tfidf_cv_mean_score, lr_tfidf_test_score],
              ['Linear SVM', svm_bow_cv_mean_score, svm_bow_test_score, 
               svm_tfidf_cv_mean_score, svm_tfidf_test_score],
              ['Linear SVM (SGD)', svmsgd_bow_cv_mean_score, svmsgd_bow_test_score, 
               svmsgd_tfidf_cv_mean_score, svmsgd_tfidf_test_score],
              ['Random Forest', rfc_bow_cv_mean_score, rfc_bow_test_score, 
               rfc_tfidf_cv_mean_score, rfc_tfidf_test_score],
              ['Gradient Boosted Machines', gbc_bow_cv_mean_score, gbc_bow_test_score, 
               gbc_tfidf_cv_mean_score, gbc_tfidf_test_score]],
             columns=['Model', 'CV Score (TF)', 'Test Score (TF)', 'CV Score (TF-IDF)', 'Test Score (TF-IDF)'],
             ).T

# Comparative model performance evaluation
# the best performing models were SVM followed by Logistic Regression and Naïve Bayes. 
# Ensemble models didn’t perform as well on this # dataset.

Unnamed: 0,0,1,2,3,4,5
Model,Naive Bayes,Logistic Regression,Linear SVM,Linear SVM (SGD),Random Forest,Gradient Boosted Machines
CV Score (TF),0.667074,0.679145,0.639916,0.641138,0.520431,0.550771
Test Score (TF),0.681126,0.69106,0.658278,0.653477,0.534768,0.55298
CV Score (TF-IDF),0.707772,0.738765,0.754751,0.756953,0.523122,0.555502
Test Score (TF-IDF),0.711589,0.750662,0.769536,0.76755,0.541391,0.555629


## Model Tuning

### Tuning our Multinomial Naive Bayes model

In [51]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer

mnb_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('mnb', MultinomialNB())
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'mnb__alpha': [1e-5, 1e-4, 1e-2, 1e-1, 1]
}

gs_mnb = GridSearchCV(mnb_pipeline, param_grid, cv=5, verbose=2)
gs_mnb = gs_mnb.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   2.2s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.1s remaining:    0.0s


[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   1.7s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   1.7s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   1.7s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 1) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 1), total=   1.7s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 2), total=   7.4s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 2), total=   7.1s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] ...... mnb__alpha=1e-05, tfidf__ngram_range=(1, 2), total=   8.0s
[CV] mnb__alpha=1e-05, tfidf__ngram_range=(1, 2) .....................
[CV] .

[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  4.2min finished


In [53]:
# We can now inspect the hyperparameter values chosen for our best estimator/model
# using the following code.
gs_mnb.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                   dtype=<class 'numpy.float64'>, encoding='utf-8',
                   input='content', lowercase=True, max_df=1.0, max_features=None,
                   min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                   smooth_idf=True, stop_words=None, strip_accents=None,
                   sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                   tokenizer=None, use_idf=True, vocabulary=None)),
  ('mnb', MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True))],
 'verbose': False,
 'tfidf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                 dtype=<class 'numpy.float64'>, encoding='utf-8',
                 input='content', lowercase=True, max_df=1.0, max_features=None,
                 min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                 smooth_idf=True, stop_words=Non

In [55]:
# Now you might be wondering how these hyperparameters specifically were selected
# for the best estimator. Well, it decided this based on the model performance, with those
# hyperparameter values on the five-folds of validation data during cross-validation.
cv_results = gs_mnb.cv_results_
results_df = pd.DataFrame({'rank': cv_results['rank_test_score'],
                           'params': cv_results['params'], 
                           'cv score (mean)': cv_results['mean_test_score'], 
                           'cv score (std)': cv_results['std_test_score']} 
              )
results_df = results_df.sort_values(by=['rank'], ascending=True)
pd.set_option('display.max_colwidth', 100)
results_df

# Model performances across different hyperparameter values in the
# hyperparameter space

# you can see how the best hyperparameters including bi-gram TF-IDF features gave the best cross-validation accuracy.
# Note that we are never tuning our models based on test data scores, because that would end up biasing our
# model toward the test dataset.

Unnamed: 0,rank,params,cv score (mean),cv score (std)
5,1,"{'mnb__alpha': 0.01, 'tfidf__ngram_range': (1, 2)}",0.767066,0.007905
4,2,"{'mnb__alpha': 0.01, 'tfidf__ngram_range': (1, 1)}",0.765271,0.009751
7,3,"{'mnb__alpha': 0.1, 'tfidf__ngram_range': (1, 2)}",0.753853,0.010498
6,4,"{'mnb__alpha': 0.1, 'tfidf__ngram_range': (1, 1)}",0.753608,0.011194
3,5,"{'mnb__alpha': 0.0001, 'tfidf__ngram_range': (1, 2)}",0.751978,0.008194
1,6,"{'mnb__alpha': 1e-05, 'tfidf__ngram_range': (1, 2)}",0.743414,0.00709
2,7,"{'mnb__alpha': 0.0001, 'tfidf__ngram_range': (1, 1)}",0.738194,0.011726
0,8,"{'mnb__alpha': 1e-05, 'tfidf__ngram_range': (1, 1)}",0.727183,0.01306
8,9,"{'mnb__alpha': 1, 'tfidf__ngram_range': (1, 1)}",0.710219,0.010015
9,10,"{'mnb__alpha': 1, 'tfidf__ngram_range': (1, 2)}",0.70239,0.009959


In [57]:
# check our tuned model’s performance on the test data.
best_mnb_test_score = gs_mnb.score(test_corpus, test_label_names)
print('Test Accuracy :', best_mnb_test_score)
# Looks like we have achieved a model accuracy of 77.3%, which is an improvement of
# 6% over the base model!

Test Accuracy : 0.7791390728476821


### Tuning our Logistic Regression model

In [58]:
# Let’s look at how it performs for logistic regression now.
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC

In [59]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

lr_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('lr', LogisticRegression(penalty='l2', max_iter=100, random_state=42))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'lr__C': [1, 5, 10]
}

gs_lr = GridSearchCV(lr_pipeline, param_grid, cv=5, verbose=2)
gs_lr = gs_lr.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=  36.3s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   36.2s remaining:    0.0s


[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=  34.8s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=  36.6s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=  35.9s
[CV] lr__C=1, tfidf__ngram_range=(1, 1) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 1), total=  33.7s
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 2), total= 4.8min
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 2), total= 4.3min
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] ............... lr__C=1, tfidf__ngram_range=(1, 2), total= 3.6min
[CV] lr__C=1, tfidf__ngram_range=(1, 2) ..............................
[CV] .

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed: 94.2min finished


In [98]:
gs_lr.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
 ...alty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [99]:
best_lr_test_score = gs_lr.score(test_corpus, test_label_names)
print('Test Accuracy :', best_lr_test_score)
# We get an overall test accuracy of approximately 77%, which is almost a 2.5%
# improvement from the base logistic regression model.

Test Accuracy : 0.766926005628


### Let’s tune our top two SVM models—the regular Linear SVM model and the SVM with Stochastic Gradient Descent.

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

svm_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('svm', LinearSVC(random_state=42))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'svm__C': [0.01, 0.1, 1, 5]
}

gs_svm = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2)
gs_svm = gs_svm.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] tfidf__ngram_range=(1, 1), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 1), svm__C=0.01, total=   2.7s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    4.0s remaining:    0.0s


[CV] tfidf__ngram_range=(1, 1), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 1), svm__C=0.01, total=   3.1s
[CV] tfidf__ngram_range=(1, 1), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 1), svm__C=0.01, total=   3.0s
[CV] tfidf__ngram_range=(1, 1), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 1), svm__C=0.01, total=   2.9s
[CV] tfidf__ngram_range=(1, 1), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 1), svm__C=0.01, total=   3.3s
[CV] tfidf__ngram_range=(1, 2), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 2), svm__C=0.01, total=  11.6s
[CV] tfidf__ngram_range=(1, 2), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 2), svm__C=0.01, total=  12.5s
[CV] tfidf__ngram_range=(1, 2), svm__C=0.01 ..........................
[CV] ........... tfidf__ngram_range=(1, 2), svm__C=0.01, total=  11.3s
[CV] t

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  8.3min finished


In [84]:
gs_svm.best_estimator_.get_params()

{'memory': None,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
           stop_words=None, strip_accents=None, sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
           vocabulary=None)),
  ('svm', LinearSVC(C=5, class_weight=None, dual=True, fit_intercept=True,
        intercept_scaling=1, loss='squared_hinge', max_iter=1000,
        multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
        verbose=0))],
 'svm': LinearSVC(C=5, class_weight=None, dual=True, fit_intercept=True,
      intercept_scaling=1, loss='squared_hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
      verbose=0),
 'svm__C': 5,
 'svm__class_weight': No

In [85]:
best_svm_test_score = gs_svm.score(test_corpus, test_label_names)
print('Test Accuracy :', best_svm_test_score)
# This is definitely the highest overall accuracy we have obtained so far! However, not a
# huge improvement from the default linear SVM model performance.

Test Accuracy : 0.77685813607


In [67]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

sgd_pipeline = Pipeline([('tfidf', TfidfVectorizer()),
                        ('sgd', SGDClassifier(random_state=42))
                       ])

param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
              'sgd__alpha': [1e-7, 1e-6, 1e-5, 1e-4]
}

gs_sgd = GridSearchCV(sgd_pipeline, param_grid, cv=5, verbose=2)
gs_sgd = gs_sgd.fit(train_corpus, train_label_names)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] tfidf__ngram_range=(1, 1), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 1), sgd__alpha=1e-07, total=   2.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.2s remaining:    0.0s


[CV] tfidf__ngram_range=(1, 1), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 1), sgd__alpha=1e-07, total=   2.3s
[CV] tfidf__ngram_range=(1, 1), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 1), sgd__alpha=1e-07, total=   3.4s
[CV] tfidf__ngram_range=(1, 1), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 1), sgd__alpha=1e-07, total=   3.0s
[CV] tfidf__ngram_range=(1, 1), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 1), sgd__alpha=1e-07, total=   2.3s
[CV] tfidf__ngram_range=(1, 2), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 2), sgd__alpha=1e-07, total=   9.0s
[CV] tfidf__ngram_range=(1, 2), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 2), sgd__alpha=1e-07, total=  10.2s
[CV] tfidf__ngram_range=(1, 2), sgd__alpha=1e-07 .....................
[CV] ...... tfidf__ngram_range=(1, 2), sgd__alpha=1e-07, total=   9.8s
[CV] t

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:  6.6min finished


In [69]:
gs_sgd.best_estimator_.get_params()

{'memory': None,
 'sgd': SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
        learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
        n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
        tol=None, verbose=0, warm_start=False),
 'sgd__alpha': 0.0001,
 'sgd__average': False,
 'sgd__class_weight': None,
 'sgd__epsilon': 0.1,
 'sgd__eta0': 0.0,
 'sgd__fit_intercept': True,
 'sgd__l1_ratio': 0.15,
 'sgd__learning_rate': 'optimal',
 'sgd__loss': 'hinge',
 'sgd__max_iter': None,
 'sgd__n_iter': None,
 'sgd__n_jobs': 1,
 'sgd__penalty': 'l2',
 'sgd__power_t': 0.5,
 'sgd__random_state': 42,
 'sgd__shuffle': True,
 'sgd__tol': None,
 'sgd__verbose': 0,
 'sgd__warm_start': False,
 'steps': [('tfidf',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, m

In [70]:
best_sgd_test_score = gs_sgd.score(test_corpus, test_label_names)
print('Test Accuracy :', best_sgd_test_score)
# The SVM with SGD gives us a tuned model accuracy of 76.8%.

Test Accuracy : 0.768415825195


### Model Performance Evaluation
Choosing the best model for deployment depends on a number of factors, like the model speed, accuracy, ease of use, understanding, and so on.
The Naïve Bayes model is the fastest to train and, even though the SVM model
might be slightly better on the test dataset in terms of accuracy, SVMs are notoriously slow and often hard to scale. Let’s take a detailed performance evaluation of our best, tuned Naïve Bayes model on the test dataset. We use our nifty model_evaluation_utils module for the purpose of model evaluation.

In [25]:
import model_evaluation_utils as meu

In [26]:
mnb_predictions = gs_mnb.predict(test_corpus)
unique_classes = list(set(test_label_names))
meu.get_metrics(true_labels=test_label_names, predicted_labels=mnb_predictions)

Accuracy: 0.7735
Precision: 0.7825
Recall: 0.7735
F1 Score: 0.7696


In [27]:
# It is good to see good consistency with the classification metrics. Besides seeing the
# holistic view of model performance metrics, often a more granular view into per-class
# model performance metrics helps.
meu.display_classification_report(true_labels=test_label_names, 
                                  predicted_labels=mnb_predictions, classes=unique_classes)

                          precision    recall  f1-score   support

 comp.os.ms-windows.misc       0.76      0.72      0.74       315
      talk.politics.misc       0.72      0.68      0.70       244
           comp.graphics       0.64      0.75      0.69       289
          comp.windows.x       0.79      0.84      0.81       287
      talk.religion.misc       0.67      0.21      0.32       199
comp.sys.ibm.pc.hardware       0.69      0.76      0.72       324
   comp.sys.mac.hardware       0.78      0.77      0.77       295
               sci.crypt       0.79      0.85      0.82       302
   talk.politics.mideast       0.85      0.87      0.86       326
            misc.forsale       0.83      0.77      0.80       314
                 sci.med       0.88      0.88      0.88       322
         rec.motorcycles       0.88      0.74      0.80       351
         sci.electronics       0.80      0.72      0.76       307
        rec.sport.hockey       0.88      0.92      0.90       308
      tal

In [35]:
# This gives us a nice overview into the model performance for each newsgroup class
# and interestingly some categories like religion, Christianity, and atheism have slightly
# lower performance. Could it be that the model is getting some of these mixed up? The
# confusion matrix is a great way to test this assumption.
label_data_map = {v:k for k, v in data_labels_map.items()}
label_map_df = pd.DataFrame(list(label_data_map.items()), columns=['Label Name', 'Label Number'])
label_map_df
# Mapping between class label names and numbers

Unnamed: 0,Label Name,Label Number
0,alt.atheism,0
1,comp.graphics,1
2,comp.os.ms-windows.misc,2
3,comp.sys.ibm.pc.hardware,3
4,comp.sys.mac.hardware,4
5,comp.windows.x,5
6,misc.forsale,6
7,rec.autos,7
8,rec.motorcycles,8
9,rec.sport.baseball,9


In [37]:
# We can now build a confusion matrix to show the correct and misclassified instances
# of each class label, which we represent by numbers for display purposes, due to the long names.
unique_class_nums = label_map_df['Label Number'].values
mnb_prediction_class_nums = [label_data_map[item] for item in mnb_predictions]
meu.display_confusion_matrix_pretty(true_labels=test_label_nums, 
                                   predicted_labels=mnb_prediction_class_nums, classes=unique_class_nums)
# The diagonal of our confusion matrix has the meat of the numbers, which indicates
# that most of our predictions match the actual class labels
# Interestingly, class labels 0, 15, and 19 seem to have a lot of misclassifications.

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
Actual:,0,153,1,0,0,0,2,0,3,2,1,3,1,0,1,4,57,10,15,5,10
Actual:,1,2,218,10,11,5,16,4,0,2,2,0,4,2,0,9,2,1,0,1,0
Actual:,2,1,22,226,24,6,15,3,1,0,1,1,3,4,3,3,1,1,0,0,0
Actual:,3,1,15,22,245,20,2,9,3,0,0,0,1,5,0,0,0,1,0,0,0
Actual:,4,0,7,10,22,227,5,5,1,0,0,1,9,4,0,0,2,0,0,1,1
Actual:,5,1,26,10,4,0,241,1,0,0,0,0,1,0,0,1,1,1,0,0,0
Actual:,6,0,2,4,17,13,0,242,7,1,0,1,3,8,1,5,4,3,2,1,0
Actual:,7,2,3,3,3,3,2,9,242,16,1,0,4,9,5,3,4,12,3,3,1
Actual:,8,0,3,1,0,2,3,6,25,260,2,7,2,3,6,4,8,10,1,8,0
Actual:,9,0,3,0,2,2,4,0,0,1,297,12,6,0,1,1,3,3,1,0,0


In [38]:
unique_classes = label_map_df['Label Name'].values
meu.display_confusion_matrix_pretty(true_labels=test_label_names, 
                                    predicted_labels=mnb_predictions, classes=unique_classes)

Unnamed: 0_level_0,Unnamed: 1_level_0,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:,Predicted:
Unnamed: 0_level_1,Unnamed: 1_level_1,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
Actual:,alt.atheism,153,1,0,0,0,2,0,3,2,1,3,1,0,1,4,57,10,15,5,10
Actual:,comp.graphics,2,218,10,11,5,16,4,0,2,2,0,4,2,0,9,2,1,0,1,0
Actual:,comp.os.ms-windows.misc,1,22,226,24,6,15,3,1,0,1,1,3,4,3,3,1,1,0,0,0
Actual:,comp.sys.ibm.pc.hardware,1,15,22,245,20,2,9,3,0,0,0,1,5,0,0,0,1,0,0,0
Actual:,comp.sys.mac.hardware,0,7,10,22,227,5,5,1,0,0,1,9,4,0,0,2,0,0,1,1
Actual:,comp.windows.x,1,26,10,4,0,241,1,0,0,0,0,1,0,0,1,1,1,0,0,0
Actual:,misc.forsale,0,2,4,17,13,0,242,7,1,0,1,3,8,1,5,4,3,2,1,0
Actual:,rec.autos,2,3,3,3,3,2,9,242,16,1,0,4,9,5,3,4,12,3,3,1
Actual:,rec.motorcycles,0,3,1,0,2,3,6,25,260,2,7,2,3,6,4,8,10,1,8,0
Actual:,rec.sport.baseball,0,3,0,2,2,4,0,0,1,297,12,6,0,1,1,3,3,1,0,0


In [39]:
label_map_df[label_map_df['Label Number'].isin([0, 15, 19])]

Unnamed: 0,Label Name,Label Number
0,alt.atheism,0
15,soc.religion.christian,15
19,talk.religion.misc,19


In [59]:
train_idx, test_idx = train_test_split(np.array(range(len(data_df['Article']))), test_size=0.33, random_state=42)
test_idx

array([ 4105, 12650,  7039, ...,  4772,  7803,  9616])

In [61]:
predict_probas = gs_mnb.predict_proba(test_corpus).max(axis=1)
test_df = data_df.iloc[test_idx]
test_df['Predicted Name'] = mnb_predictions
test_df['Predicted Confidence'] = predict_probas
test_df.head()
# Adding additional metadata to our test dataset with model
# predictions and confidence scores

Unnamed: 0,Article,Clean Article,Target Label,Target Name,Predicted Name,Predicted Confidence
4105,Just a little nitpicking. Wasn't it the government that required\r\na standard railway gauge ? D...,little nitpicking not government require standard railway gauge not improve thing please not mis...,11,sci.crypt,sci.crypt,0.975658
12650,\r\nIt means that the EFF's public stance is complicated with issues irrelevant\r\nto the encryp...,mean eff public stance complicate issue irrelevant encryption issue per se may well people care ...,11,sci.crypt,sci.crypt,0.9886
7039,\r\n\r\n\r\n\r\n\r\nSo after I've flashed my lights at the chap in front and he doesn't\r\n'pass...,flash light chap front not pass next major highway lane direction keep extreme right block folk ...,7,rec.autos,rec.motorcycles,0.729504
3310,: I think most of the problems mainly arose from Manager Gene Mauch's\r\n: ineptitude in managin...,think problem mainly arise manager gene mauchs ineptitude manage pitching staff stretch abuse ji...,9,rec.sport.baseball,rec.sport.baseball,0.999942
16360,"OK... quick scenario... you're at home, not bothering anybody... next thing you\r\nknow, somebod...",ok quick scenario home not bother anybody next thing know somebody come crash upstairs window he...,16,talk.politics.guns,talk.politics.guns,0.998837


In [77]:
pd.set_option('display.max_colwidth', 200)
res_df = (test_df[(test_df['Target Name'] == 'talk.religion.misc') & (test_df['Predicted Name'] == 'soc.religion.christian')]
       .sort_values(by=['Predicted Confidence'], ascending=False).head(5))
res_df
# Looking at mode misclassification instances for religion.misc and
# religion.christian

Unnamed: 0,Article,Clean Article,Target Label,Target Name,Predicted Name,Predicted Confidence
8968,"\r\nZoroaster is far older than Daniel. If anything, one could claim that,\r\nin a sense, Daniel is a descendant of Zoroaster; as Daniel, though being\r\nHebrew, has assimilated into Zoroastrianis...",zoroaster far old daniel anything one could claim sense daniel descendant zoroaster daniel though hebrew assimilate zoroastrianism successfully introduce religion tanakh judaism however majority b...,19,talk.religion.misc,soc.religion.christian,0.999557
3299,"There were some recent developments in the dispute about Masonry among\r\nSouthern Baptists. I posted a summary over in bit.listserv.christia, and\r\nI suppose that it might be useful here. Note...",recent development dispute masonry among southern baptists post summary bit listserv christia suppose may useful note not necessarily agree disagree follow present information short summary southe...,19,talk.religion.misc,soc.religion.christian,0.999384
4367,:\r\n (lots of stuff about the Nicene Creed deleted which can be read in the\r\n original basenote. I will also leave it up to other LDS netters to\r\n take Mr. Weiss to task on using Mormon Do...,lot stuff nicene creed delete read original basenote also leave lds netter take mr weiss task use mormon doctrine declare difinitive word lds church teach doctrine hopefully lds netter amiable exp...,19,talk.religion.misc,soc.religion.christian,0.999254
12608,: >: >> Gilligan = Sloth\r\n: >: >> Skipper = Anger\r\n: >: >> Thurston Howell III = Greed\r\n: >: >> Lovey Howell = Gluttony\r\n: >: >> Ginger = Lust\r\n: >: >> Professor = Pride\r\n: >: >> Mary ...,gilligan sloth skipper anger thurston howell iii greed lovey howell gluttony ginger lust professor pride mary ann envy assorted monkeys secular humanism assorted headhunters godless heathen savage...,19,talk.religion.misc,soc.religion.christian,0.998755
16758,"\r\n\r\nThere were many injustices in the middle ages. And this is truely sad.\r\nI would hate to see a day when churches put people to death or torchured\r\nthem for practicing homosexuality, or...",many injustice middle age truely sad would hate see day church put people death torchur practice homosexuality crime church not call take government world may homosexual treat cruelly today not me...,19,talk.religion.misc,soc.religion.christian,0.998243


In [78]:
pd.set_option('display.max_colwidth', 200)
res_df = (test_df[(test_df['Target Name'] == 'talk.religion.misc') & (test_df['Predicted Name'] == 'alt.atheism')]
       .sort_values(by=['Predicted Confidence'], ascending=False).head(5))
res_df

Unnamed: 0,Article,Clean Article,Target Label,Target Name,Predicted Name,Predicted Confidence
914,"\r\n\r\nAtoms are not objective. They aren't even real. What scientists call\r\nan atom is nothing more than a mathematical model that describes \r\ncertain physical, observable properties of ou...",atom not objective not even real scientist call atom nothing mathematical model describe certain physical observable property surrounding subjective objective though approach scientist take discus...,19,talk.religion.misc,alt.atheism,0.996075
2334,In <1ren9a$94q@morrow.stanford.edu> salem@pangea.Stanford.EDU (Bruce Salem) \r\n\r\n\r\n\r\nThis brings up another something I have never understood. I asked this once\r\nbefore and got a few int...,renaqmorrow stanford edu salempangea stanford edu bruce salem bring another something never understand ask get interesting response somehow not seem satisfied would nt not consider good source may...,19,talk.religion.misc,alt.atheism,0.996051
11117,"\r\n\r\n\tUnless God admits that he didn't do it....\r\n\r\n\t=)\r\n\r\n\r\n--- \r\n\r\n "" I'd Cheat on Hillary Too.""",unless god admit not would cheat hillary,19,talk.religion.misc,alt.atheism,0.965725
12386,"\r\nAh, you taking everything as literal quotation. No wonder you're confused.\r\n\r\nFirst, can I ask that we decide on a definition of ""objective""?\r\n\r\n\r\nAnd?\r\n\r\n\r\nI'd guess that it ...",ah take everything literal quotation no wonder confused first ask decide definition objective would guess may may case people unable evaluate complex moral issue rather leave behave immorally may ...,19,talk.religion.misc,alt.atheism,0.772658
9360,"\r\nYes, as a philosophy weak atheism is worthless. This is true in\r\nexactly the same sense that as a philosophy Christians' disbelief in\r\nZeus is worthless. Atheists construct their persona...",yes philosophy weak atheism worthless true exactly sense philosophy christian disbelief zeus worthless atheists construct personal philosophy many different source build non god base idea way chri...,19,talk.religion.misc,alt.atheism,0.757788
