# Model Training

In this section we will be training various models using different classifiers. Out of them all, we will be choosing the best classifier to give us the most accurate prediction.

In [None]:
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

#To check CODE-Run Time of the Block
%%time 

KFold_Score = pd.DataFrame()
classifiers = ['Linear SVM', 'Radial SVM', 'LogisticRegression', 
               'RandomForestClassifier', 'AdaBoostClassifier', 
               'XGBoostClassifier', 'KNeighborsClassifier','GradientBoostingClassifier']
models = [svm.SVC(kernel='linear'),
          svm.SVC(kernel='rbf'),
          LogisticRegression(max_iter = 1000),
          RandomForestClassifier(n_estimators=200, random_state=0),
          AdaBoostClassifier(random_state = 0),
          xgb.XGBClassifier(n_estimators=100),
          KNeighborsClassifier(),
          GradientBoostingClassifier(random_state=0)
         ]
j = 0
for i in models:
    model = i
    cv = KFold(n_splits=5, random_state=0, shuffle=True)
    KFold_Score[classifiers[j]] = (cross_val_score(model, #Training_Data, np.ravel(#Training_Labels)
                                                   ,scoring = 'accuracy', cv=cv))
    j = j+1

#### Using K-Fold cross-validation to evaluate the performance of our models

In [None]:
# Append a 'Mean' row (average accuracy across folds) to KFold_Score and show
# the classifiers ranked from best to worst mean accuracy.
#
# The cell is written to be re-runnable: a 'Mean' row left over from a previous
# execution is dropped first, so the average is always taken over the fold
# rows only and the index never ends up with a duplicated summary row.
per_fold = KFold_Score.drop(index='Mean', errors='ignore')

# Label the folds from the actual row count instead of assuming exactly five.
per_fold.index = [f'Fold {k}' for k in range(1, len(per_fold) + 1)]

# Series of column means -> one-row frame labelled 'Mean' -> stacked under the folds.
KFold_Score = pd.concat([per_fold, per_fold.mean().to_frame('Mean').T])

# One row per classifier, best mean accuracy first.
KFold_Score.T.sort_values(by='Mean', ascending=False)

# Hyperparameter Tuning

This is the most important section of this project. Here, the ultimate goal is to find the combination of hyperparameters that optimizes a predefined objective (in this notebook, the mean cross-validated score of the model) to give better results.

In [None]:
# Random forest with a fixed seed and otherwise default settings; it is the
# `estimator` handed to GridSearchCV in the tuning cell.
rfc = RandomForestClassifier(random_state=0)

In [None]:
# Hyperparameter grid for the random forest search: every combination of the
# values below is evaluated (2 * 2 * 3 * 2 = 24 candidates).
#
# 'auto' is intentionally not listed for max_features: for classifiers it was
# just an alias of 'sqrt' (so the grid searched the same setting twice), it was
# deprecated in scikit-learn 1.1 and it is rejected from 1.3 onwards.
param_grid = {
    'n_estimators': [200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [6, 7, 8],
    'criterion': ['gini', 'entropy'],
}

In [None]:
#To check CODE-Run Time of the Block
%%time 
from sklearn.model_selection import GridSearchCV
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(#Training_Data,#Training_Labels)
CV_rfc.best_params_

#### Note: Perform numerous permutations with various hyperparameters and select the ones giving the best result.

In [None]:
rfc1=RandomForestClassifier(random_state=0, n_estimators= 200, criterion = 'gini',max_features = 'auto',max_depth = 8)
rfc1.fit(#Training_Data,#Training_Labels)