#  Validation of the model

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import  AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from dataset.dataset import get_twenty_dataset, get_IMDB_dataset
from main_dataset import main

from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')
from sklearn.externals import joblib

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from tqdm import tqdm_notebook
plt.style.use('seaborn')
%matplotlib inline

Get dataset

In [10]:
# Dataset variants compared in this notebook, keyed by integer:
#   0     tf-idf
#   1     tf-idf + remove_stopping_word
#   2..6  tf-idf + remove_stopping_word + PCA, with n_components equal to the key
# Each value is the loader's return value converted to a list. The unpacking
# formerly used in this cell named the four items
# (x_train, y_train, x_test, y_test), in that order.
twenty_dataset_dict = {}
IMDB_dataset_dict = {}

# Variants 0 and 1: plain tf-idf, then tf-idf with stop words removed.
for variant_key, loader_kwargs in enumerate([{}, {'remove_stop_word': True}]):
    twenty_dataset_dict[variant_key] = list(get_twenty_dataset(**loader_kwargs))
    IMDB_dataset_dict[variant_key] = list(get_IMDB_dataset(**loader_kwargs))

# Variants 2..6: stop words removed, then PCA with i components.
pca_kwargs = {'remove_stop_word': True, 'preprocessing_trick': 'PCA'}
for i in range(2, 7):
    print(i)  # progress marker: one line per PCA size
    twenty_dataset_dict[i] = list(get_twenty_dataset(n_components=i, **pca_kwargs))
    IMDB_dataset_dict[i] = list(get_IMDB_dataset(n_components=i, **pca_kwargs))

2
3
4
5
6


In [20]:
# Display label for each dataset variant, indexed by the same integer keys
# (0..6) used for the dataset dictionaries.
dataset_list = ['tf-idf', 'tf-idf, remove stopping word'] + [
    'PCA-%d' % n_components for n_components in range(2, 7)
]

20 news results

In [47]:
# Baseline run on the plain tf-idf variant (key 0) of the 20 news data.
# The split is unpacked here so the cell works on a fresh kernel; previously
# these four names were only assigned in commented-out code.
# Assumes the loader yields (x_train, y_train, x_test, y_test) in this order,
# as in that commented-out unpacking -- TODO confirm against dataset.dataset.
x_train_20, y_train_20, x_test_20, y_test_20 = twenty_dataset_dict[0]
model_dict_20 = main(x_train_20, y_train_20, x_test_20, y_test_20)

multinomialNB model
0.7320764737121614
logistic regression model
Decision Tree model
0.46999468932554433
SVM model
0.8036378120021243
AdaBoost model
0.5
Random forest model
0.688396176314392
MLPClassifier model
0.7705788635156665


IMDB data

In [4]:
# Baseline run on the plain tf-idf variant (key 0) of the IMDB data.
# The split is unpacked here so the cell works on a fresh kernel; previously
# these four names were only assigned in commented-out code.
# Assumes the loader yields (x_train, y_train, x_test, y_test) in this order,
# as in that commented-out unpacking -- TODO confirm against dataset.dataset.
x_train_imdb, y_train_imdb, x_test_imdb, y_test_imdb = IMDB_dataset_dict[0]
model_dict_imdb = main(x_train_imdb, y_train_imdb, x_test_imdb, y_test_imdb)

multinomialNB model
0.82956
logistic regression model
Decision Tree model
0.70416
SVM model
0.8772
AdaBoost model
0.83
Random forest model
0.8366
MLPClassifier model
0.87668


# Pipeline

In [25]:
# Candidate classifiers, each built with library defaults unless noted.
clf_NB = MultinomialNB()
clf_LR = LogisticRegression()
clf_DT = DecisionTreeClassifier()
clf_SVC = LinearSVC()
clf_ADB = AdaBoostClassifier()
clf_RDF = RandomForestClassifier()
# Computationally heavy to tune NN, so it is pinned to a single configuration.
# NOTE(review): max_iter=1 permits only one optimizer iteration -- this looks
# like a smoke-test setting rather than a real training budget; confirm.
clf_NN = MLPClassifier(
    solver='lbfgs',
    hidden_layer_sizes=(30, 1024),
    max_iter=1,
)
clf_XG = XGBClassifier()

# The dictionary keys become the pipeline step names, and therefore the
# prefixes of the parameter-grid keys. Only the MLP is currently enabled;
# the full roster is kept below for reference.
#models_dict = {'NB':clf_NB,'LR': clf_LR,'DT':clf_DT,'SVC':clf_SVC,'ADB':clf_ADB,'RDF':clf_RDF,'NN':clf_NN,'XG':clf_XG}
models_dict = dict(NN=clf_NN)
models_list = [(step_name, estimator) for step_name, estimator in models_dict.items()]

In [33]:
# Tunable parameter ranges for each classifier. Every key is prefixed with
# '<step name>__', where the step name is the model's key in models_dict.
para_NB = {'NB__alpha': np.arange(0, 2, 0.2), 'NB__fit_prior': [True, False]}

# NOTE(review): 'elasticnet' (and, depending on the scikit-learn version, 'l1')
# is rejected by LogisticRegression's default solver; those candidates are
# recorded as score 0.0 via error_score=0.0 in the search cell. Supporting
# them would need 'LR__solver' (and 'LR__l1_ratio') in this grid -- confirm
# the intent before enabling LR.
para_LR = {'LR__penalty': ['l1', 'l2', 'elasticnet'],
           'LR__warm_start': [True, False]}

para_DT = {'DT__criterion': ['gini', 'entropy'],
           'DT__max_features': ['sqrt', 'log2', None]}

# NOTE(review): LinearSVC does not support penalty='l1' with loss='hinge';
# that candidate also relies on error_score=0.0 in the search cell.
para_SVC = {'SVC__penalty': ['l1', 'l2'],
            'SVC__loss': ['hinge', 'squared_hinge']}

# learning_rate must be strictly positive for AdaBoost. The previous grid,
# np.arange(0, 1.2, 0.2), started at 0, so one sixth of the candidates could
# never be fitted and were silently scored 0.0.
para_ADB = {'ADB__n_estimators': [30, 50, 80, 100, 120],
            'ADB__learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0]}

para_RDF = {'RDF__criterion': ['gini', 'entropy'],
            'RDF__max_features': ['sqrt', 'log2', None]}

# Single-point grid: repeats the settings clf_NN was constructed with.
para_NN = {'NN__solver': ['lbfgs'], 'NN__hidden_layer_sizes': [(30, 1024)], 'NN__max_iter': [1]}
para_XG = {'XG__learning_rate': [0.1]}

# Full MLP grid, kept for reference (disabled as too expensive to search):
#para_NN = {'NN__hidden_layer_sizes':[(5,),(12,),(25,),(50,),(100,)],
#           'NN__activation':['identity','logistic','tanh','relu'],
#           'NN__solver':['lbfgs','sgd','adam'],
#           'NN__learning_rate':['constant','invscaling','adaptive']}

# params_list must stay in the same order as models_list: the search cell
# pairs them by position.
#params_list = [para_NB,para_LR,para_DT,para_SVC,para_ADB,para_RDF,para_NN,para_XG]
params_list = [para_NN]

In [27]:
# Sanity check: the search cell pairs models_list[i] with params_list[i] by
# position, so the two lists must have the same length. The result is still
# printed, and a mismatch now stops execution here instead of surfacing
# inside the long-running search.
lengths_match = len(models_list) == len(params_list)
print(lengths_match)
assert lengths_match, (
    'models_list has %d entries but params_list has %d'
    % (len(models_list), len(params_list))
)

True


These are the model specifics, which are the same for both datasets.

In [None]:
# Grid search for every enabled model on every dataset variant of both
# corpora. Each pipeline holds only the classifier step (no preprocessing
# scaler is added in front of it).
k = 5       # value passed as cv= to GridSearchCV
n_jobs = 5  # value passed as n_jobs= to GridSearchCV

# scores[c, m, d]: score of model m on dataset variant d, evaluated on items
# 2 and 3 of that variant's list; c = 0 for 20 news, c = 1 for IMDB.
scores = np.zeros((2, len(models_list), len(dataset_list)))
grid_dict_20, grid_dict_imdb = {}, {}

for idx, model_item in tqdm_notebook(enumerate(models_list), total=len(models_list)):
    model_name = model_item[0]
    pipline = Pipeline([model_item])
    print('#' * 20)

    # Both corpora use the same search configuration; the parameter grid is
    # matched to the model by its position in params_list. error_score=0.0
    # records a failing parameter combination as score 0 instead of raising.
    search_kwargs = dict(param_grid=params_list[idx], cv=k,
                         error_score=0.0, n_jobs=n_jobs)
    grid_20 = GridSearchCV(pipline, **search_kwargs)    # For 20 news
    grid_imdb = GridSearchCV(pipline, **search_kwargs)  # For imdb

    for dataset_idx in range(len(twenty_dataset_dict)):
        # 20 news: search on items 0/1 of the variant, score on items 2/3.
        news_split = twenty_dataset_dict[dataset_idx]
        grid_20.fit(news_split[0], news_split[1])
        scores[0, idx, dataset_idx] = grid_20.score(news_split[2], news_split[3])
        variant_label = dataset_list[dataset_idx]
        print('Model ', model_name, ' for 20 news ', variant_label,
              'score = %3.4f' % (scores[0, idx, dataset_idx]))
        print('Model ', model_name, ' for 20 news  ', variant_label,
              'best params', grid_20.best_params_)
        # The same search object is refit for every variant and stored under
        # the model name only, so after the loop this entry reflects the last
        # variant that was fitted.
        grid_dict_20[model_name] = grid_20

        # IMDB: same procedure on the matching variant.
        imdb_split = IMDB_dataset_dict[dataset_idx]
        grid_imdb.fit(imdb_split[0], imdb_split[1])
        scores[1, idx, dataset_idx] = grid_imdb.score(imdb_split[2], imdb_split[3])
        print('Model ', model_name, ' for IMDB ', variant_label,
              ' score = %3.4f' % (scores[1, idx, dataset_idx]))
        print('Model ', model_name, ' for IMDB ', variant_label,
              ' best params', grid_imdb.best_params_)
        grid_dict_imdb[model_name] = grid_imdb
        print('-' * 10)

# Persist the fitted searches and the score table next to the notebook.
joblib.dump(grid_dict_20, 'grid_dict_20.asv')
joblib.dump(grid_dict_imdb, 'grid_dict_imdb.asv')
joblib.dump(scores, 'scores.asv')

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

####################
Model  NN  for 20 news  tf-idf score = 0.0539
Model  NN  for 20 news   tf-idf best params {'NN__hidden_layer_sizes': (30, 1024), 'NN__max_iter': 1, 'NN__solver': 'lbfgs'}
----------
Model  NN  for IMDB  tf-idf  score = 0.5044
Model  NN  for IMDB  tf-idf  best params {'NN__hidden_layer_sizes': (30, 1024), 'NN__max_iter': 1, 'NN__solver': 'lbfgs'}
Model  NN  for 20 news  tf-idf, remove stopping word score = 0.0524
Model  NN  for 20 news   tf-idf, remove stopping word best params {'NN__hidden_layer_sizes': (30, 1024), 'NN__max_iter': 1, 'NN__solver': 'lbfgs'}
----------
Model  NN  for IMDB  tf-idf, remove stopping word  score = 0.5000
Model  NN  for IMDB  tf-idf, remove stopping word  best params {'NN__hidden_layer_sizes': (30, 1024), 'NN__max_iter': 1, 'NN__solver': 'lbfgs'}
Model  NN  for 20 news  PCA-2 score = 0.0526
Model  NN  for 20 news   PCA-2 best params {'NN__hidden_layer_sizes': (30, 1024), 'NN__max_iter': 1, 'NN__solver': 'lbfgs'}
----------
Model  NN  for 