#  Validation of the model

In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import  AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from dataset.dataset import get_twenty_dataset, get_IMDB_dataset
from main_dataset import main

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook
plt.style.use('seaborn')
%matplotlib inline

Get dataset

In [2]:
x_train_20, y_train_20, x_test_20, y_test_20 = get_twenty_dataset()
x_train_imdb, y_train_imdb, x_test_imdb, y_test_imdb = get_IMDB_dataset()

20 news results

In [47]:
model_dict_20=main(x_train_20, y_train_20, x_test_20, y_test_20)

multinomialNB model
0.7320764737121614
logistic regression model
Decision Tree model
0.46999468932554433
SVM model
0.8036378120021243
AdaBoost model
0.5
Random forest model
0.688396176314392
MLPClassifier model
0.7705788635156665


IMDB data

In [4]:
model_dict_imdb=main(x_train_imdb, y_train_imdb, x_test_imdb, y_test_imdb)

multinomialNB model
0.82956
logistic regression model
Decision Tree model
0.70416
SVM model
0.8772
AdaBoost model
0.83
Random forest model
0.8366
MLPClassifier model
0.87668


# Pipeline

In [48]:
# Define classifier
clf_NB = MultinomialNB()
# clf_LR = LogisticRegression()
clf_DT = DecisionTreeClassifier()
clf_SVC = LinearSVC()
clf_ADB = AdaBoostClassifier()
clf_RDF = RandomForestClassifier()
clf_NN = MLPClassifier()

models_dict = {'NB':clf_NB,'DT':clf_DT,'SVC':clf_SVC,'ADB':clf_ADB,'RDF':clf_RDF,'NN':clf_NN}
#models_dict = {'NB':clf_NB,'LR':clf_LR}
models_list = list(models_dict.items())

In [50]:
# Define tunable parameters ranges for different classifiers
para_NB = {'NB__alpha':np.arange(0,2,0.2),'NB__fit_prior':[True,False]}
# para_LR = {'LR__penalty':['l1','l2','elasticnet'],
#           'LR__solver':['newton-cg','lbfgs','liblinear','sag','saga'], 
#           'LR__warm_start':[True,False]}
para_DT = {'DT__criterion':['gini','entropy'],
           'DT__max_features': ['sqrt','log2',None]}
para_SVC = {'SVC__penalty': ['l1','l2'],
            'SVC__loss': ['hinge','squared_hinge']}
para_ADB = {'ADB__n_estimators': [30,50,80,100,120],
            'ADB__learning_rate':np.arange(0,1.2,0.2)}
para_RDF = {'RDF__criterion':['gini','entropy'],
            'RDF__max_features': ['sqrt','log2',None]}
para_NN = {'NN__hidden_layer_sizes':[(5,),(12,),(25,),(50,),(100,)],
           'NN__activation':['identity','logistic','tanh','relu'],
           'NN__solver':['lbfgs','sgd','adam'],
           'NN__learning_rate':['constant','invscaling','adaptive']}
params_list = [para_NB,para_DT,para_SVC,para_ADB,para_RDF,para_NN]

In [52]:
print( len(models_list) == len(params_list) )

True


This is the model specifics which are the same for both datasets

In [None]:
# No preprocessing scalar added
grid_dict_20 = {}
grid_dict_imdb = {}

k = 10 # Magic number
n_jobs =10

for idx,model_item in tqdm_notebook(enumerate(models_list),total=len(models_list)):
    pipline = Pipeline([model_item])
    print('#'*20)
    # For 20 news
    grid = GridSearchCV(pipline,param_grid=params_list[idx],cv=k,
                        error_score=0.0,n_jobs=n_jobs,verbose=1)
    grid.fit(x_train_20, y_train_20)
    print('Model ',model_item[0],' for 20 news score = %3.4f'%(grid.score(x_test_20, y_test_20)))
    print('Model ',model_item[0],' for 20 news best params', grid.best_params_)
    grid_dict_20[model_item[0]] = grid
    print('-'*10)
    # For imdb
    grid = GridSearchCV(pipline,param_grid=params_list[idx],cv=k,
                        error_score=0.0,n_jobs=n_jobs,verbose=1)
    grid.fit(x_train_imdb, y_train_imdb)
    print('Model ',model_item[0],' for IMDB score = %3.4f'%(grid.score(x_test_imdb, y_test_imdb)))
    print('Model ',model_item[0],' for IMDB best params', grid.best_params_)
    grid_dict_imdb[model_item[0]] = grid
    
    


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

####################
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    1.6s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    6.1s
[Parallel(n_jobs=10)]: Done 200 out of 200 | elapsed:    6.6s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


Model  NB  for 20 news score = 0.7463
Model  NB  for 20 news best params {'NB__alpha': 0.0, 'NB__fit_prior': True}
----------
Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=10)]: Done  60 tasks      | elapsed:    1.1s
[Parallel(n_jobs=10)]: Done 200 out of 200 | elapsed:    2.4s finished
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.


Model  NB  for IMDB score = 0.8317
Model  NB  for IMDB best params {'NB__alpha': 1.8, 'NB__fit_prior': True}
####################
Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   21.6s
[Parallel(n_jobs=10)]: Done  60 out of  60 | elapsed:   57.7s finished


Model  DT  for 20 news score = 0.4810
Model  DT  for 20 news best params {'DT__criterion': 'gini', 'DT__max_features': None}
----------
Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   35.0s
[Parallel(n_jobs=10)]: Done  60 out of  60 | elapsed:  1.3min finished


Model  DT  for IMDB score = 0.7037
Model  DT  for IMDB best params {'DT__criterion': 'gini', 'DT__max_features': None}
####################
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 out of  40 | elapsed:   24.4s finished


Model  SVC  for 20 news score = 0.8036
Model  SVC  for 20 news best params {'SVC__loss': 'squared_hinge', 'SVC__penalty': 'l2'}
----------
Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  21 out of  40 | elapsed:    1.5s remaining:    1.4s
[Parallel(n_jobs=10)]: Done  40 out of  40 | elapsed:    3.9s finished


Model  SVC  for IMDB score = 0.8826
Model  SVC  for IMDB best params {'SVC__loss': 'hinge', 'SVC__penalty': 'l2'}
####################
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done 300 out of 300 | elapsed: 49.0min finished


Model  ADB  for 20 news score = 0.5046
Model  ADB  for 20 news best params {'ADB__learning_rate': 0.6000000000000001, 'ADB__n_estimators': 100}
----------
Fitting 10 folds for each of 30 candidates, totalling 300 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done 155 tasks      | elapsed: 20.9min
[Parallel(n_jobs=10)]: Done 300 out of 300 | elapsed: 41.8min finished


Model  ADB  for IMDB score = 0.8382
Model  ADB  for IMDB best params {'ADB__learning_rate': 0.8, 'ADB__n_estimators': 120}
####################
Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  2.5min
[Parallel(n_jobs=10)]: Done  60 out of  60 | elapsed:  6.1min finished


Model  RDF  for 20 news score = 0.5586
Model  RDF  for 20 news best params {'RDF__criterion': 'gini', 'RDF__max_features': None}
----------
Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:  4.3min
[Parallel(n_jobs=10)]: Done  60 out of  60 | elapsed:  9.4min finished


Model  RDF  for IMDB score = 0.7762
Model  RDF  for IMDB best params {'RDF__criterion': 'gini', 'RDF__max_features': None}
####################
Fitting 10 folds for each of 180 candidates, totalling 1800 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed: 21.2min
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed: 158.0min


In [27]:
grid.fit()

GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('scaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('NB',
                                        MultinomialNB(alpha=1.0,
                                                      class_prior=None,
                                                      fit_prior=True))],
                                verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'NB__alpha': array([0. , 0.2, 0.4, 0.6, 0.8, 1. , 1.2, 1.4, 1.6, 1.8]),
                         'NB__fit_prior': [True, False]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)