In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score

import warnings
warnings.simplefilter('ignore')
classification_results = pd.DataFrame(index=None, columns=['model','train roc_auc','test roc_auc'])
models_after_pca_results = pd.DataFrame(index=None, columns=['model','train roc_auc','test roc_auc'])

In [2]:
df = pd.read_csv('Twitter-Absolute-Sigma-500.data', header = None)

In [3]:
df.columns = ['NCD_0', 'NCD_1', 'NCD_2', 'NCD_3', 'NCD_4', 'NCD_5', 'NCD_6', 'AI_0', 'AI_1', 'AI_2', 'AI_3', 'AI_4', 'AI_5', 'AI_6', 
'AS(NA)_0','AS(NA)_1', 'AS(NA)_2', 'AS(NA)_3', 'AS(NA)_4', 'AS(NA)_5', 'AS(NA)_6', 'BL_0', 'BL_1', 'BL_2', 'BL_3', 'BL_4', 'BL_5', 'BL_6', 
'NAC_0', 'NAC_1', 'NAC_2', 'NAC_3', 'NAC_4', 'NAC_5', 'NAC_6', 'AS(NAC)_0', 'AS(NAC)_1', 'AS(NAC)_2', 'AS(NAC)_3', 'AS(NAC)_4', 'AS(NAC)_5', 'AS(NAC)_6', 
'CS_0', 'CS_1', 'CS_2', 'CS_3', 'CS_4', 'CS_5', 'CS_6', 'AT_0', 'AT_1', 'AT_2', 'AT_3', 'AT_4', 'AT_5', 'AT_6', 'NA_0', 'NA_1', 'NA_2', 'NA_3', 'NA_4', 'NA_5', 'NA_6',
'ADL_0', 'ADL_1', 'ADL_2', 'ADL_3', 'ADL_4', 'ADL_5', 'ADL_6', 'NAD_0', 'NAD_1', 'NAD_2', 'NAD_3', 'NAD_4', 'NAD_5', 'NAD_6','buzz']

In [4]:
X = df.drop('buzz', axis = 1)
y = df['buzz']

In [5]:
_, sample_data, _, sample_target = train_test_split(X, y, shuffle = True, test_size = 0.1)

In [6]:
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(sample_data, sample_target, test_size = 0.3, random_state = 101)

In [7]:
MinMax = MinMaxScaler(feature_range= (0,1))
X_train = MinMax.fit_transform(X_train_unscaled)
X_test = MinMax.transform(X_test_unscaled)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

knn = KNeighborsClassifier()
log = LogisticRegression()
rbfsvc = SVC(kernel = 'rbf', probability=True)
dtc = DecisionTreeClassifier()
rfc = RandomForestClassifier()

## Hard Voting

In [9]:
hard_voting = VotingClassifier(estimators=[('KNN',knn),('LR',log)], voting = 'hard')
hard_voting.fit(X_train,y_train)
train_roc_auc = roc_auc_score(y_train, hard_voting.predict(X_train))
test_roc_auc = roc_auc_score(y_test, hard_voting.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Hard Voting', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916


## Soft Voting

In [11]:
soft_voting = VotingClassifier(estimators=[('RBFSVC',rbfsvc),('DT',dtc)], voting = 'soft')
soft_voting.fit(X_train,y_train)
train_roc_auc = roc_auc_score(y_train, soft_voting.predict(X_train))
test_roc_auc = roc_auc_score(y_test, soft_voting.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Soft Voting', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272


## Bagging - Decision Tree

In [12]:
from sklearn.ensemble import BaggingClassifier

param_grid = {'n_estimators' : [10, 50, 100, 500], 'max_samples' : [10, 20, 30, 40, 50]}
grid_search = GridSearchCV(BaggingClassifier(dtc,bootstrap = True), param_grid, cv = 5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            ...n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100, 500], 'max_samples': [10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [13]:
grid_search.cv_results_

{'mean_fit_time': array([0.12977915, 0.68290462, 1.36062841, 6.45355821, 0.15335751,
        0.64009137, 1.35727935, 6.6448173 , 0.15804353, 0.70188279,
        1.3774148 , 6.8383522 , 0.16266026, 0.70656681, 1.38899384,
        6.58144236, 0.16011744, 0.69368019, 1.19927182, 6.05659833]),
 'std_fit_time': array([0.02377996, 0.01210767, 0.01401412, 0.12620255, 0.01286651,
        0.01591603, 0.02031439, 0.23949006, 0.00120876, 0.00448182,
        0.00822794, 0.02150069, 0.00972758, 0.00510917, 0.01045145,
        0.27693736, 0.02063746, 0.05553047, 0.06152371, 0.32605943]),
 'mean_score_time': array([0.02168117, 0.11023703, 0.20655451, 1.00772748, 0.01970139,
        0.09251308, 0.19836617, 1.02021332, 0.02499576, 0.10466914,
        0.20936322, 1.03726406, 0.02209492, 0.10476847, 0.21651783,
        0.95087872, 0.0260582 , 0.08671231, 0.18505392, 0.91446848]),
 'std_score_time': array([0.0078644 , 0.01587337, 0.00356222, 0.11882401, 0.00453022,
        0.01224312, 0.01044737, 0.023842

In [14]:
grid_search.best_score_

0.9636511320946289

In [15]:
grid_search.best_params_

{'max_samples': 30, 'n_estimators': 500}

In [17]:
bag = BaggingClassifier(dtc, n_estimators = 500, max_samples = 50, bootstrap= True, n_jobs= -1)
bag.fit(X_train, y_train)
train_roc_auc = roc_auc_score(y_train, bag.predict(X_train))
test_roc_auc = roc_auc_score(y_test, bag.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Bagging - Decision Tree', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272
2,Bagging - Decision Tree,0.935932,0.936783


## Bagging - K Neighbour Classifier

In [21]:
from sklearn.ensemble import BaggingClassifier

param_grid = {'n_estimators' : [10, 50, 100, 500], 'max_samples' : [10, 20, 30, 40, 50]}
grid_search = GridSearchCV(BaggingClassifier(knn,bootstrap = True), param_grid, cv = 5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=BaggingClassifier(base_estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100, 500], 'max_samples': [10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [22]:
grid_search.cv_results_

{'mean_fit_time': array([0.01975231, 0.04350533, 0.06560397, 0.27274318, 0.02247286,
        0.0397408 , 0.09316487, 0.54583955, 0.02256503, 0.04936924,
        0.08118753, 0.34476233, 0.0228497 , 0.04839196, 0.08238091,
        0.38117895, 0.01973615, 0.04935002, 0.09095731, 0.49260592]),
 'std_fit_time': array([0.00674533, 0.00882529, 0.00487219, 0.03687276, 0.00515786,
        0.00726704, 0.03751283, 0.42987636, 0.00120142, 0.00440701,
        0.00271361, 0.00164112, 0.00249589, 0.00062971, 0.00099864,
        0.05090446, 0.00401978, 0.0066053 , 0.00745868, 0.2038035 ]),
 'mean_score_time': array([0.02865629, 0.13213592, 0.24436564, 1.37808585, 0.09304419,
        0.44310431, 0.85491896, 5.22882142, 0.10340004, 0.56337752,
        1.0271863 , 5.17931848, 0.1253531 , 0.59900184, 1.19684691,
        6.07639918, 0.12872667, 0.7171349 , 1.39110308, 7.10195699]),
 'std_score_time': array([0.0071742 , 0.01668649, 0.01241856, 0.09315952, 0.02612465,
        0.07378793, 0.10126983, 2.313187

In [23]:
grid_search.best_score_

0.9479134937557112

In [24]:
grid_search.best_params_

{'max_samples': 50, 'n_estimators': 100}

In [25]:
bag = BaggingClassifier(knn, n_estimators= 10, max_samples= 50, bootstrap= True, n_jobs= -1)
bag.fit(X_train, y_train)
train_roc_auc = roc_auc_score(y_train, bag.predict(X_train))
test_roc_auc = roc_auc_score(y_test, bag.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Bagging - K Neighbour Classifier', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272
2,Bagging - Decision Tree,0.935932,0.936783
3,Bagging - K Neighbour Classifier,0.878873,0.881044


## Pasting - Random Forest

In [26]:
from sklearn.ensemble import BaggingClassifier

param_grid = {'n_estimators' : [10, 50, 100, 500], 'max_samples' : [10, 20, 30, 40, 50]}
grid_search = GridSearchCV(BaggingClassifier(rfc,bootstrap = False), param_grid, cv = 5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_spl...n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100, 500], 'max_samples': [10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [27]:
grid_search.cv_results_

{'mean_fit_time': array([0.22484093, 1.03568273, 2.14000378, 9.93242502, 0.21130538,
        0.98438058, 1.94523435, 9.72066269, 0.2089242 , 0.983814  ,
        1.96589742, 9.79284344, 0.21132388, 1.05140224, 1.96695709,
        9.75696678, 0.20628552, 0.99451623, 2.14086199, 9.90314355]),
 'std_fit_time': array([0.02383353, 0.03456638, 0.18403414, 0.36008711, 0.00700549,
        0.0096613 , 0.01287173, 0.04385713, 0.0062345 , 0.0019082 ,
        0.02646807, 0.08831521, 0.00693892, 0.08426867, 0.01750595,
        0.04252588, 0.00619172, 0.00887946, 0.27799905, 0.26238042]),
 'mean_score_time': array([0.02380209, 0.12686505, 0.23293943, 1.12112412, 0.01562228,
        0.11096625, 0.2251102 , 1.17911205, 0.02666097, 0.11521535,
        0.23714347, 1.15465603, 0.02833486, 0.1154006 , 0.23081145,
        1.13944325, 0.01562543, 0.11433682, 0.26694846, 1.14764771]),
 'std_score_time': array([7.60822468e-03, 2.21538063e-03, 1.06576218e-02, 4.92882315e-03,
        2.33170933e-05, 3.23392804e-

In [28]:
grid_search.best_score_

0.9631434663417606

In [29]:
grid_search.best_params_

{'max_samples': 50, 'n_estimators': 500}

In [30]:
bag = BaggingClassifier(rfc, n_estimators= 500, max_samples= 50, bootstrap= False, n_jobs= -1)
bag.fit(X_train, y_train)
train_roc_auc = roc_auc_score(y_train, bag.predict(X_train))
test_roc_auc = roc_auc_score(y_test, bag.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Pasting - Random Forest', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272
2,Bagging - Decision Tree,0.935932,0.936783
3,Bagging - K Neighbour Classifier,0.878873,0.881044
4,Pasting - Random Forest,0.932693,0.930335


## Pasting - Logistic Regression

In [31]:
from sklearn.ensemble import BaggingClassifier

param_grid = {'n_estimators' : [10, 50, 100, 500], 'max_samples' : [10, 20, 30, 40, 50]}
grid_search = GridSearchCV(BaggingClassifier(log,bootstrap = False), param_grid, cv = 5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=BaggingClassifier(base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start...n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 50, 100, 500], 'max_samples': [10, 20, 30, 40, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [32]:
grid_search.cv_results_

{'mean_fit_time': array([ 0.48509312,  2.35701814,  4.48525267, 22.40768738,  0.50841246,
         2.50289922,  4.97013874, 26.30314426,  0.55486646,  2.6876574 ,
         5.42155786, 27.43700919,  0.60473251,  2.97045894,  5.75983944,
        29.77450995,  0.5839715 ,  2.98622737,  5.86596637, 30.1448153 ]),
 'std_fit_time': array([0.03002651, 0.06217139, 0.05560955, 0.14011707, 0.01437611,
        0.06545011, 0.0312365 , 1.2973628 , 0.0195018 , 0.05776777,
        0.10215334, 0.68074005, 0.02157809, 0.05138634, 0.1486553 ,
        0.57306788, 0.02530829, 0.03059622, 0.08512506, 0.65821614]),
 'mean_score_time': array([0.01514764, 0.04686103, 0.1055397 , 0.5056601 , 0.01152034,
        0.05386739, 0.10838084, 0.52765846, 0.00874519, 0.05933928,
        0.10619526, 0.53082361, 0.00903964, 0.05241399, 0.10832486,
        0.51938615, 0.01459403, 0.06069617, 0.10954452, 0.52473855]),
 'std_score_time': array([0.00092189, 0.00987873, 0.00841578, 0.01270995, 0.00605067,
        0.00868483, 

In [33]:
grid_search.best_score_

0.8174433952685551

In [34]:
grid_search.best_params_

{'max_samples': 50, 'n_estimators': 10}

In [35]:
bag = BaggingClassifier(log, n_estimators= 50, max_samples= 50, bootstrap= False, n_jobs= -1)
bag.fit(X_train, y_train)
train_roc_auc = roc_auc_score(y_train, bag.predict(X_train))
test_roc_auc = roc_auc_score(y_test, bag.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Pasting - Logistic Regression', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272
2,Bagging - Decision Tree,0.935932,0.936783
3,Bagging - K Neighbour Classifier,0.878873,0.881044
4,Pasting - Random Forest,0.932693,0.930335
5,Pasting - Logistic Regression,0.530124,0.525749


## Adaboost Boosting - Decision Tree

In [36]:
from sklearn.ensemble import AdaBoostClassifier

param_grid = {'learning_rate':[0.0001,0.001,0.01,0.1,1], 'n_estimators' : [10, 20, 50, 100, 500]}
grid_search = GridSearchCV(AdaBoostClassifier(dtc,random_state = 0), param_grid, cv=5,return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=50, random_state=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1], 'n_estimators': [10, 20, 50, 100, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [37]:
grid_search.cv_results_

{'mean_fit_time': array([0.2413208 , 0.24292331, 0.23545971, 0.23826222, 0.23910599,
        0.23536663, 0.23421841, 0.23175397, 0.23930402, 0.23385649,
        0.23969669, 0.23195128, 0.23902621, 0.23887172, 0.2674562 ,
        0.27437224, 0.23484135, 0.23123741, 0.22868781, 0.22730031,
        0.22640886, 0.23399601, 0.22048979, 0.22828321, 0.23038478]),
 'std_fit_time': array([0.0212578 , 0.01725528, 0.01144751, 0.01872353, 0.01488463,
        0.00962046, 0.01407418, 0.01201197, 0.01447986, 0.01419632,
        0.02581867, 0.01397204, 0.01094062, 0.01950227, 0.03410138,
        0.05262286, 0.01785451, 0.0182902 , 0.01874397, 0.01144955,
        0.0145785 , 0.01711909, 0.01809192, 0.01274191, 0.0143567 ]),
 'mean_score_time': array([0.00099101, 0.00039873, 0.00039287, 0.00352325, 0.00099802,
        0.00039835, 0.0007977 , 0.00039873, 0.00059729, 0.00040469,
        0.00079746, 0.00352917, 0.00099821, 0.00120869, 0.00159402,
        0.00392218, 0.00018678, 0.        , 0.00040493, 0.00

In [38]:
grid_search.best_score_

0.9473042948522693

In [39]:
grid_search.best_params_

{'learning_rate': 0.0001, 'n_estimators': 10}

In [40]:
ada = AdaBoostClassifier(dtc, learning_rate= 0.0001, n_estimators = 10, random_state= 0)
ada.fit(X_train, y_train)
train_roc_auc = roc_auc_score(y_train, ada.predict(X_train))
test_roc_auc = roc_auc_score(y_test, ada.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Adaboost - Decision Tree', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272
2,Bagging - Decision Tree,0.935932,0.936783
3,Bagging - K Neighbour Classifier,0.878873,0.881044
4,Pasting - Random Forest,0.932693,0.930335
5,Pasting - Logistic Regression,0.530124,0.525749
6,Adaboost - Decision Tree,1.0,0.919821


## Adaboost Boosting - Random Forest

In [43]:
from sklearn.ensemble import AdaBoostClassifier

param_grid = {'learning_rate':[0.0001,0.001,0.01,0.1,1], 'n_estimators' : [10, 20, 50, 100, 500]}
grid_search = GridSearchCV(AdaBoostClassifier(base_estimator = rfc,random_state = 0), param_grid, cv=5,return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min...bose=0,
            warm_start=False),
          learning_rate=1.0, n_estimators=50, random_state=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1], 'n_estimators': [10, 20, 50, 100, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [44]:
grid_search.cv_results_

{'mean_fit_time': array([  1.670153  ,   3.06756763,   7.57516127,  15.19484153,
         72.85951509,   1.53450732,   3.01384211,   7.24910703,
         14.08495355,  68.54751196,   1.42685785,   2.61154995,
          7.03852715,  22.06063738, 156.16618981,   1.77788935,
          4.72213435,  14.27071953,  28.16727247,  86.04754596,
          1.81622958,   4.00489521,   7.83665228,  12.47291369,
         46.86594896]),
 'std_fit_time': array([0.05386648, 0.02573118, 0.07037394, 0.14696135, 0.57088863,
        0.01930856, 0.03386031, 0.08596216, 0.39313509, 1.07410509,
        0.02046261, 0.0102131 , 0.30395709, 1.16949776, 3.91384771,
        0.04693239, 0.21067864, 0.78571824, 0.99901189, 4.39413578,
        0.08107483, 0.14543227, 0.35769585, 0.5230324 , 2.05577232]),
 'mean_score_time': array([0.02385569, 0.04609866, 0.11944399, 0.24818769, 1.18882957,
        0.02686357, 0.05019593, 0.12326336, 0.24856029, 1.20692444,
        0.02812386, 0.0385705 , 0.1179635 , 0.29522405, 1.5987

In [45]:
grid_search.best_score_

0.9640572646969235

In [46]:
grid_search.best_params_

{'learning_rate': 0.01, 'n_estimators': 100}

In [47]:
ada = AdaBoostClassifier(rfc, learning_rate= 0.001, n_estimators= 100, random_state= 0)
ada.fit(X_train, y_train)
train_roc_auc = roc_auc_score(y_train, ada.predict(X_train))
test_roc_auc = roc_auc_score(y_test, ada.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Adaboost - Random Forest', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272
2,Bagging - Decision Tree,0.935932,0.936783
3,Bagging - K Neighbour Classifier,0.878873,0.881044
4,Pasting - Random Forest,0.932693,0.930335
5,Pasting - Logistic Regression,0.530124,0.525749
6,Adaboost - Decision Tree,1.0,0.919821
7,Adaboost - Random Forest,1.0,0.937242


## Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=10, n_estimators= 500)
param_grid = {'max_features':['auto', 'log2'], 'learning_rate' : [0.01,0.1], 'max_depth':[5,10,15,30,50]}
grid_search = GridSearchCV(gb, param_grid, cv=5, return_train_score = True)
grid_search.fit(X_train, y_train)

In [51]:
grid_search.cv_results_

{'mean_fit_time': array([ 43.3860775 ,   5.69939017, 142.8126152 ,  25.79972849,
        135.08218708,  28.13151908,  73.59621277,  11.37023578,
         70.11884913,  11.56008286,  22.35116339,   3.11227937,
         16.1486165 ,   3.20535011,  10.40681796,   2.50421867,
         10.13273392,   2.06711731,  10.32526836,   2.07825036]),
 'std_fit_time': array([2.24302265, 0.0998958 , 3.29282462, 1.02261689, 8.56219158,
        1.34511403, 4.73677862, 0.44128721, 1.59290964, 0.51134915,
        0.42843245, 0.01917664, 1.1086524 , 0.20278538, 0.37740434,
        0.25198292, 0.2863303 , 0.13736855, 0.58492585, 0.04368617]),
 'mean_score_time': array([0.03033991, 0.03887081, 0.06420059, 0.06724553, 0.07340651,
        0.08659844, 0.0355124 , 0.04307427, 0.03590174, 0.0442821 ,
        0.01852918, 0.02016401, 0.01117792, 0.01236629, 0.00777974,
        0.01096287, 0.00797925, 0.00816879, 0.00797787, 0.00950379]),
 'std_score_time': array([0.00857074, 0.00628658, 0.01566469, 0.01101909, 0.01

In [52]:
grid_search.best_score_

0.9674078586658544

In [53]:
grid_search.best_params_

{'learning_rate': 0.01, 'max_depth': 5, 'max_features': 'log2'}

In [49]:
gb = GradientBoostingClassifier(max_features = 'log2', learning_rate = 0.01 , max_depth = 5, n_estimators = 500, random_state = 10)
gb.fit(X_train, y_train)
train_roc_auc = roc_auc_score(y_train, gb.predict(X_train))
test_roc_auc = roc_auc_score(y_test, gb.predict(X_test))
classification_results = classification_results.append(pd.Series({'model':'Gradient Boosting', 'train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True )
classification_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Hard Voting,0.888833,0.884916
1,Soft Voting,1.0,0.920272
2,Bagging - Decision Tree,0.935932,0.936783
3,Bagging - K Neighbour Classifier,0.878873,0.881044
4,Pasting - Random Forest,0.932693,0.930335
5,Pasting - Logistic Regression,0.530124,0.525749
6,Adaboost - Decision Tree,1.0,0.919821
7,Adaboost - Random Forest,1.0,0.937242
8,Gradient Boosting,0.97695,0.939785


## Dimensionality Reduction using PCA

In [50]:
from sklearn.decomposition import PCA

In [51]:
pca = PCA(n_components=0.99)
pca.fit(sample_data)

X_pca = pca.transform(sample_data)

In [52]:
X_pca.shape

(14071, 7)

In [53]:
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(X_pca, sample_target, test_size = 0.3, random_state = 0)

In [54]:
MinMax = MinMaxScaler(feature_range= (0,1))
X_train = MinMax.fit_transform(X_train_unscaled)
X_test = MinMax.transform(X_test_unscaled)

## Logistic Regression

In [55]:
from sklearn.model_selection import GridSearchCV

param_grid = {'penalty': ['l1', 'l2']}
grid_search = GridSearchCV(log, param_grid, cv = 5, return_train_score= True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2']}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [56]:
grid_search.cv_results_

{'mean_fit_time': array([0.08664641, 0.00936646]),
 'std_fit_time': array([0.03876081, 0.00764769]),
 'mean_score_time': array([0., 0.]),
 'std_score_time': array([0., 0.]),
 'param_penalty': masked_array(data=['l1', 'l2'],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'params': [{'penalty': 'l1'}, {'penalty': 'l2'}],
 'split0_test_score': array([0.95583756, 0.86700508]),
 'split1_test_score': array([0.96192893, 0.85228426]),
 'split2_test_score': array([0.96345178, 0.86903553]),
 'split3_test_score': array([0.96345178, 0.86852792]),
 'split4_test_score': array([0.96140173, 0.85830371]),
 'mean_test_score': array([0.96121434, 0.86303178]),
 'std_test_score': array([0.00280969, 0.00663196]),
 'rank_test_score': array([1, 2]),
 'split0_train_score': array([0.96281254, 0.86216525]),
 'split1_train_score': array([0.96141642, 0.86533824]),
 'split2_train_score': array([0.96103566, 0.86203833]),
 'split3_train_score': array([0.96116258, 0.86178449]),

In [57]:
grid_search.best_params_

{'penalty': 'l1'}

In [58]:
grid_search.best_score_

0.961214336480861

In [61]:
log = LogisticRegression(penalty = 'l1')
log.fit(X_train,y_train)
predict_test = log.predict(X_test)
predict_train = log.predict(X_train)
train_roc_auc = roc_auc_score(y_train, predict_train)
test_roc_auc = roc_auc_score(y_test, predict_test)
models_after_pca_results = models_after_pca_results.append(pd.Series({'model':'Logistic Regression','train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True)
models_after_pca_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Logistic Regression,0.924776,0.919709


## K Neighbors Classifier

In [62]:
param_grid = {'n_neighbors' : [3, 5, 7]}
grid_search = GridSearchCV(knn, param_grid, cv = 5, return_train_score= True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_neighbors': [3, 5, 7]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [63]:
grid_search.cv_results_

{'mean_fit_time': array([0.004918  , 0.00618916, 0.00624995]),
 'std_fit_time': array([0.00637909, 0.00311908, 0.0076546 ]),
 'mean_score_time': array([0.04723759, 0.05177913, 0.06119866]),
 'std_score_time': array([0.00996338, 0.00590932, 0.00572104]),
 'param_n_neighbors': masked_array(data=[3, 5, 7],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'n_neighbors': 3}, {'n_neighbors': 5}, {'n_neighbors': 7}],
 'split0_test_score': array([0.95736041, 0.95786802, 0.95786802]),
 'split1_test_score': array([0.94974619, 0.95329949, 0.95380711]),
 'split2_test_score': array([0.96345178, 0.95989848, 0.96395939]),
 'split3_test_score': array([0.95888325, 0.96040609, 0.96294416]),
 'split4_test_score': array([0.95581513, 0.95530726, 0.95784662]),
 'mean_test_score': array([0.95705148, 0.95735608, 0.95928521]),
 'std_test_score': array([0.0044569 , 0.00270864, 0.00372369]),
 'rank_test_score': array([3, 2, 1]),
 'split0_train_score': arr

In [67]:
grid_search.best_score_

0.9584729414153721

In [64]:
grid_search.best_params_

{'n_neighbors': 7}

In [65]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
predict_test = knn.predict(X_test)
predict_train = knn.predict(X_train)
train_roc_auc = roc_auc_score(y_train, predict_train)
test_roc_auc = roc_auc_score(y_test, predict_test)

models_after_pca_results = models_after_pca_results.append(pd.Series({'model':'K Neighbors Classifier','train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True)
models_after_pca_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Logistic Regression,0.924776,0.919709
1,K Neighbors Classifier,0.938259,0.91734


## Decision Tree Classifier

In [66]:
param_grid = {'max_depth': [5, 10, 20, 50, 100]}

grid_search = GridSearchCV(dtc, param_grid, cv = 5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [5, 10, 20, 50, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [67]:
grid_search.best_params_

{'max_depth': 5}

In [68]:
grid_search.cv_results_

{'mean_fit_time': array([0.02795916, 0.04056458, 0.03969178, 0.03836284, 0.03675027]),
 'std_fit_time': array([0.00139834, 0.00898075, 0.00625183, 0.00649589, 0.00883464]),
 'mean_score_time': array([0.00039825, 0.        , 0.00040975, 0.        , 0.0020256 ]),
 'std_score_time': array([0.00048776, 0.        , 0.00081949, 0.        , 0.00312783]),
 'param_max_depth': masked_array(data=[5, 10, 20, 50, 100],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 5},
  {'max_depth': 10},
  {'max_depth': 20},
  {'max_depth': 50},
  {'max_depth': 100}],
 'split0_test_score': array([0.95837563, 0.95380711, 0.94923858, 0.94771574, 0.94873096]),
 'split1_test_score': array([0.95736041, 0.95025381, 0.94568528, 0.94670051, 0.95025381]),
 'split2_test_score': array([0.96192893, 0.95329949, 0.95025381, 0.94923858, 0.95076142]),
 'split3_test_score': array([0.96142132, 0.95583756, 0.95126904, 0.95431472, 0.94923858]),
 '

In [69]:
grid_search.best_score_

0.959691339222256

In [70]:
dt = DecisionTreeClassifier(max_depth= 5, random_state= 10)
dt.fit(X_train, y_train)
predict_test = dt.predict(X_test)
predict_train = dt.predict(X_train)
train_roc_auc = roc_auc_score(y_train, predict_train)
test_roc_auc = roc_auc_score(y_test, predict_test)

models_after_pca_results = models_after_pca_results.append(pd.Series({'model':'Decision Tree Classifier','train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True)
models_after_pca_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Logistic Regression,0.924776,0.919709
1,K Neighbors Classifier,0.938259,0.91734
2,Decision Tree Classifier,0.947853,0.935755


## Linear SVC

In [71]:
param_grid = {'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000], 'max_iter':[1000,10000] }

grid_search = GridSearchCV(SVC(random_state=0,kernel='linear'), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000], 'max_iter': [1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [72]:
grid_search.cv_results_

{'mean_fit_time': array([0.19976215, 0.33441319, 0.18255801, 0.33037748, 0.1796473 ,
        0.34166055, 0.17531443, 0.29364214, 0.19254031, 0.25624352,
        0.13503461, 0.12847724, 0.09686346, 0.09794197, 0.09887023,
        0.09536171, 0.09274654, 0.09579458]),
 'std_fit_time': array([0.01276717, 0.00399962, 0.00634824, 0.00680219, 0.00989382,
        0.01167818, 0.00755489, 0.00622513, 0.00570726, 0.00699311,
        0.01202528, 0.00435739, 0.00624374, 0.01003101, 0.0132265 ,
        0.00327852, 0.01042843, 0.01054329]),
 'mean_score_time': array([0.03048377, 0.04991732, 0.0281291 , 0.05056438, 0.03077693,
        0.04927216, 0.0281302 , 0.04061112, 0.0303401 , 0.03437634,
        0.02812772, 0.02104006, 0.0187346 , 0.01562381, 0.01774921,
        0.01217103, 0.01023712, 0.015487  ]),
 'std_score_time': array([3.29905672e-03, 4.49464278e-03, 6.23811282e-03, 6.04278535e-03,
        9.10489699e-04, 4.78635074e-03, 6.23937996e-03, 7.65221670e-03,
        1.27438208e-03, 6.24136799e-

In [73]:
grid_search.best_score_

0.9639557315463498

In [74]:
grid_search.best_params_

{'C': 1000, 'max_iter': 1000}

In [75]:
svc = SVC(C = 1000, max_iter = 10000, kernel = 'linear', random_state = 0)
svc.fit(X_train, y_train)
predict_test = svc.predict(X_test)
predict_train = svc.predict(X_train)
train_roc_auc = roc_auc_score(y_train, predict_train)
test_roc_auc = roc_auc_score(y_test, predict_test)

models_after_pca_results = models_after_pca_results.append(pd.Series({'model':'Linear SVC','train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True)
models_after_pca_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Logistic Regression,0.924776,0.919709
1,K Neighbors Classifier,0.938259,0.91734
2,Decision Tree Classifier,0.947853,0.935755
3,Linear SVC,0.934417,0.935448


## Polynomial SVC

In [76]:
param_grid = {'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000], 'max_iter':[1000,10000], 'gamma':[0.001, 0.01, 0.1, 0.5, 1, 10] }

grid_search = GridSearchCV(SVC(random_state=0,kernel='poly'), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000], 'max_iter': [1000, 10000], 'gamma': [0.001, 0.01, 0.1, 0.5, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [77]:
grid_search.cv_results_

{'mean_fit_time': array([0.24786978, 0.4168632 , 0.23278327, 0.40611963, 0.22777781,
        0.406037  , 0.22987013, 0.40433683, 0.23049984, 0.4812839 ,
        0.22897997, 0.26783714, 0.2353447 , 0.42025337, 0.22946625,
        0.44740067, 0.23194904, 0.4120172 , 0.22688274, 0.47588067,
        0.23519678, 0.48581185, 0.14424663, 0.15184865, 0.25045366,
        0.41910028, 0.22702293, 0.4054894 , 0.22752304, 0.40654335,
        0.21868806, 0.47790208, 0.23248882, 0.39676533, 0.11516409,
        0.10953484, 0.22527881, 0.41402936, 0.23271179, 0.40689569,
        0.22409029, 0.4361165 , 0.2262805 , 0.43256001, 0.22484312,
        0.30885348, 0.11036444, 0.12578578, 0.22623458, 0.40631123,
        0.22639651, 0.40833669, 0.22864943, 0.47667656, 0.22758236,
        0.38915386, 0.22091188, 0.25695133, 0.10871096, 0.16148863,
        0.22328091, 0.4050693 , 0.22804775, 0.39961839, 0.22550302,
        0.48556108, 0.22493553, 0.22137508, 0.14455738, 0.13902106,
        0.05107594, 0.39407563,

In [78]:
grid_search.best_score_

0.9647679967509392

In [79]:
grid_search.best_params_

{'C': 1, 'gamma': 10, 'max_iter': 10000}

In [80]:
svc = SVC(C = 1000, max_iter = 1000, gamma = 0.5, kernel = 'poly', random_state = 0)
svc.fit(X_train, y_train)
predict_test = svc.predict(X_test)
predict_train = svc.predict(X_train)
train_roc_auc = roc_auc_score(y_train, predict_train)
test_roc_auc = roc_auc_score(y_test, predict_test)

models_after_pca_results = models_after_pca_results.append(pd.Series({'model':'Polynomial SVC','train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True)
models_after_pca_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Logistic Regression,0.924776,0.919709
1,K Neighbors Classifier,0.938259,0.91734
2,Decision Tree Classifier,0.947853,0.935755
3,Linear SVC,0.934417,0.935448
4,Polynomial SVC,0.933561,0.932625


## Radial SVC

In [81]:
param_grid = {'C':[0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000], 'max_iter':[1000,10000], 'gamma':[0.001, 0.01, 0.1, 0.5, 1, 10]}

grid_search = GridSearchCV(SVC(random_state=0,kernel='rbf'), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 50, 100, 1000], 'max_iter': [1000, 10000], 'gamma': [0.001, 0.01, 0.1, 0.5, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [82]:
grid_search.cv_results_

{'mean_fit_time': array([0.39905863, 0.63788795, 0.38721762, 0.64894514, 0.37655778,
        0.65067644, 0.39604416, 0.64979467, 0.38621979, 0.67525468,
        0.37665095, 0.70127506, 0.42380085, 0.67527366, 0.37988067,
        0.78539443, 0.36969705, 0.64388905, 0.37115917, 0.65215278,
        0.38516235, 0.63452754, 0.3728188 , 0.6254581 , 0.37999368,
        0.71247821, 0.37458334, 0.64188366, 0.37638745, 0.64067297,
        0.37436342, 0.63707495, 0.37296743, 0.61974277, 0.37377996,
        0.41548653, 0.37716365, 0.64426394, 0.37379436, 0.64606609,
        0.46016865, 1.02249455, 0.47491102, 0.6370882 , 0.42742248,
        0.5874167 , 2.04351816, 0.89558244, 0.87614946, 1.29780822,
        0.69663534, 1.29247146, 0.76871171, 1.11738472, 0.68728185,
        0.94673014, 0.83897533, 0.89917994, 0.5004148 , 0.50665717,
        0.76612787, 1.34813428, 0.76829443, 1.24861889, 0.64090743,
        0.70318213, 0.46196795, 0.46495757, 0.40440736, 0.40250735,
        0.31156573, 0.31042366,

In [83]:
grid_search.best_params_

{'C': 1000, 'gamma': 1, 'max_iter': 10000}

In [84]:
grid_search.best_score_

0.9648695299015129

In [85]:
svc = SVC(C = 1000, max_iter = 1000, gamma = 0.1, kernel = 'rbf', random_state = 0)
svc.fit(X_train, y_train)
predict_test = svc.predict(X_test)
predict_train = svc.predict(X_train)
train_roc_auc = roc_auc_score(y_train, predict_train)
test_roc_auc = roc_auc_score(y_test, predict_test)

models_after_pca_results = models_after_pca_results.append(pd.Series({'model':'Radial SVM','train roc_auc':train_roc_auc,'test roc_auc':test_roc_auc}),ignore_index=True)
models_after_pca_results

Unnamed: 0,model,train roc_auc,test roc_auc
0,Logistic Regression,0.924776,0.919709
1,K Neighbors Classifier,0.938259,0.91734
2,Decision Tree Classifier,0.947853,0.935755
3,Linear SVC,0.934417,0.935448
4,Polynomial SVC,0.933561,0.932625
5,Radial SVM,0.933102,0.93114


	##old results
    
    model           	       train roc_auc	test roc_auc	train accuracy	test accuracy
    0	Logistic Regression	    0.926068	0.920349	0.964666	0.959972
    1	K Neighbors Classifier	0.946538	0.929003	0.972383	0.959972
    2	Linear SVM	            0.949529	0.940198	0.969845	0.964709
    3	Polynomial SVM       	0.945089	0.932248	0.970048	0.963761
    4	Radial SVM	            0.939443	0.928746	0.968626	0.961630
    5  Decision Tree Classifier 0.951153	0.923784	0.975023	0.958550

The accuracy is good for the models after PCA and was good before also but computation time decreased drastically after applying PCA

## Deep Learning Model

In [86]:
import keras
from keras.models import Sequential
from keras.layers import Dense
import numpy

Using TensorFlow backend.


In [94]:
def create_model():
    
    model = Sequential()
    model.add(Dense(12, input_dim=7, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [95]:
from sklearn.metrics import roc_auc_score

In [96]:
from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn = create_model , verbose = 0)
param_grid = {'epochs':[50, 100, 200] , 'batch_size':[20, 50, 100]}
grid_search = GridSearchCV(model , param_grid , cv =5)

grid_search.fit(X_train, y_train)
print('Best parameters {}'.format(grid_search.best_params_))

print('The Train ROC AUC score is',roc_auc_score(y_train, grid_search.predict(X_train)))
print('The Test ROC AUC score is',roc_auc_score(y_test, grid_search.predict(X_test)))

Best parameters {'batch_size': 20, 'epochs': 50}
The Train ROC AUC score is 0.9342328284876374
The Test ROC AUC score is 0.9292143951425881
