# Model building and evaluation


This notebook trains and examins different ML classifiers on training data dataset. 

* Logistic regression
* Decision tree
* K Nearest neighbours
* SVM
* AdaBoost classifier
* Gradient Boost classifier
* Random forest
* Extra tree classifier
* XG Boost
* ANN

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier

import pickle

from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight

## Training, validation and test set

In [2]:
df=pd.read_csv('./data/wdbc_final.csv')


In [3]:
X=df[['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_stdEr', 'texture_stdEr', 'perimeter_stdEr', 'area_stdEr',
       'smoothness_stdEr', 'compactness_stdEr', 'concavity_stdEr',
       'concave_points_stdEr', 'symmetry_stdEr', 'fractal_dimension_stdEr',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst']]
y=df['isMalignant']

In [4]:
X_train_temp, X_test, y_train_temp, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train, X_cv, y_train, y_cv = train_test_split(X_train_temp,y_train_temp,test_size=0.25,random_state=44)

In [5]:
X_test.shape

(114, 30)

## Normalize

In [6]:
scaler=MinMaxScaler()
scaler.fit(X_train)

MinMaxScaler()

In [7]:
scaler.transform(X_cv).shape

(114, 30)

In [8]:
def normalize(df,normalizer):
    df[['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
       'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_stdEr', 'texture_stdEr', 'perimeter_stdEr', 'area_stdEr',
       'smoothness_stdEr', 'compactness_stdEr', 'concavity_stdEr',
       'concave_points_stdEr', 'symmetry_stdEr', 'fractal_dimension_stdEr',
       'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
       'smoothness_worst', 'compactness_worst', 'concavity_worst',
       'concave_points_worst', 'symmetry_worst', 'fractal_dimension_worst']]=normalizer.transform(df)
    return df

In [9]:
X_train=normalize(X_train,scaler)
X_cv=normalize(X_cv,scaler)
X_test=normalize(X_test,scaler)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

## Model building

In [10]:
def bestModel(model, X_train, X_cv, y_train, y_cv, grid):
    model_cv = GridSearchCV(model, grid, cv=4)
    model_cv.fit(X_train, y_train)
    print("tuned hpyerparameters :(best parameters) ", model_cv.best_params_)
    print("accuracy :", model_cv.best_score_)
    mod_best= model_cv.best_params_
    return mod_best


def testModel(model, X_train, X_cv, y_train, y_cv):
    model.fit(X_train,y_train)
    y_pred_tr = model.predict(X_train)
    y_pred_cv = model.predict(X_cv)
    precision_tr, recall_tr, fscore_tr, support_tr = precision_recall_fscore_support(
        y_train, y_pred_tr, average='macro')
    precision_cv, recall_cv, fscore_cv, support_cv = precision_recall_fscore_support(
        y_cv, y_pred_cv, average='macro')
    report= np.array([precision_tr, recall_tr, fscore_tr, precision_cv, recall_cv, fscore_cv])
    return report

def getModel():
    modelList=[]    
    gridList=[]
    
    modelList.append(('LogReg',LogisticRegression()))
    gridList.append(('LogReg',{'C': np.logspace(-5, 0.1, 20),'solver': ['newton-cg', 'lbfgs', 'saga'],'class_weight':['auto'], 'max_iter':[150,200,250]}))
    
    modelList.append(('DecTree',DecisionTreeClassifier()))
    gridList.append(('DecTree',{'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random'],'max_features':['auto', 'sqrt', 'log2']}))
    
    modelList.append(('KNN',KNeighborsClassifier()))
    gridList.append(('KNN',{'n_neighbors': np.int0(np.linspace(3, 22, 20)),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}))
    
    modelList.append(('LinDiscrAn',LinearDiscriminantAnalysis()))
    gridList.append(('LinDiscrAn',{'solver' : ['svd', 'lsqr', 'eigen']}))
    
    modelList.append(('GNB',GaussianNB()))
    gridList.append(('GNB',{}))
    
    modelList.append(('SVM',SVC()))
    gridList.append(('SVM',{'C': np.logspace(-5, 2, 20),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}))
    
    modelList.append(('AdBst',AdaBoostClassifier()))
    gridList.append(('AdBst',{'n_estimators' :np.int0(np.linspace(10,110,11)),'algorithm': ['SAMME', 'SAMME.R']}))
    
    modelList.append(('GrBst',GradientBoostingClassifier()))
    gridList.append(('GrBst',{'loss' :['deviance', 'exponential'],'n_estimators' :np.int0(np.linspace(50,150,11)),'max_features':['auto', 'sqrt', 'log2']}))
    
    modelList.append(('RnFrst',RandomForestClassifier()))
    gridList.append(('RnFrst',{'n_estimators': np.int0(np.linspace(50,150,11)),'max_features':['auto', 'sqrt', 'log2'],'criterion':['gini', 'entropy']}))
    
    modelList.append(('XTreeClsfr',ExtraTreesClassifier()))
    gridList.append(('XTreeClsfr',{'n_estimators': np.int0(np.linspace(50,150,11)),'max_features':['auto', 'sqrt', 'log2'],
    'criterion': ['gini', 'entropy']}))
    
    modelList.append(('XGBst',XGBClassifier()))
    gridList.append(('XGBst',{'n_estimators': np.int0(np.linspace(50, 70, 100))}))

    return modelList,gridList    

In [11]:
modList,gridList=getModel()

In [12]:
bestparams=[]
for i in range(len(modList)):
    bestparams.append((modList[i][0],bestModel(modList[i][1],X_train, X_cv, y_train, y_cv, gridList[i][1])))

tuned hpyerparameters :(best parameters)  {'C': 1.2589254117941673, 'class_weight': 'auto', 'max_iter': 150, 'solver': 'newton-cg'}
accuracy : 0.9706224350205199
tuned hpyerparameters :(best parameters)  {'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'best'}
accuracy : 0.9354309165526675
tuned hpyerparameters :(best parameters)  {'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'distance'}
accuracy : 0.967749658002736
tuned hpyerparameters :(best parameters)  {'solver': 'svd'}
accuracy : 0.9530437756497949
tuned hpyerparameters :(best parameters)  {}
accuracy : 0.9354993160054719
tuned hpyerparameters :(best parameters)  {'C': 3.359818286283781, 'kernel': 'linear'}
accuracy : 0.985328317373461
tuned hpyerparameters :(best parameters)  {'algorithm': 'SAMME', 'n_estimators': 80}
accuracy : 0.9823871409028728
tuned hpyerparameters :(best parameters)  {'loss': 'deviance', 'max_features': 'auto', 'n_estimators': 130}
accuracy : 0.9823529411764707
tuned hpyerparameters :(best par



























































































































































































































































































































































































































tuned hpyerparameters :(best parameters)  {'n_estimators': 50}
accuracy : 0.9706224350205199


In [13]:
bestparams

[('LogReg',
  {'C': 1.2589254117941673,
   'class_weight': 'auto',
   'max_iter': 150,
   'solver': 'newton-cg'}),
 ('DecTree',
  {'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'best'}),
 ('KNN', {'algorithm': 'auto', 'n_neighbors': 6, 'weights': 'distance'}),
 ('LinDiscrAn', {'solver': 'svd'}),
 ('GNB', {}),
 ('SVM', {'C': 3.359818286283781, 'kernel': 'linear'}),
 ('AdBst', {'algorithm': 'SAMME', 'n_estimators': 80}),
 ('GrBst', {'loss': 'deviance', 'max_features': 'auto', 'n_estimators': 130}),
 ('RnFrst',
  {'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 110}),
 ('XTreeClsfr',
  {'criterion': 'gini', 'max_features': 'sqrt', 'n_estimators': 50}),
 ('XGBst', {'n_estimators': 50})]