In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib as mplt
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv 
from sklearn.model_selection import HalvingGridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adamax

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier


from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings raised through
# the `warnings` module (e.g. convergence or deprecation warnings), so a model
# that stops at max_iter without converging will not be reported.
warnings.filterwarnings('ignore')

In [2]:
# Read the training table (point the path at your own copy of the data) and
# drop the 'id' column in the same step, so re-running the cell is idempotent.
data_train = pd.read_csv("./data/train.csv", sep=",").drop(columns=['id'])
data_train.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,4.883,0.085222,0.032396,0.116092,-0.001689,-0.520069,2.14112,0.124464,0.148209,0
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


In [3]:
# Separate the feature columns from the label column.
X = data_train.drop(columns='target')
y = data_train['target']

# Hold out 25% of the rows for validation (fixed seed for a repeatable split).
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.25, random_state=0)

# Discard the shuffled row labels on features AND labels: the scaled frames
# built below carry a fresh 0..n-1 index, so every piece must share it for
# index-aligned operations (joins, assign, concat) to line up correctly.
X_train = X_train.reset_index(drop=True)
X_valid = X_valid.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

# Backward-compatible alias: the validation labels used to be called `y_test`.
y_test = y_valid

znormalizer = StandardScaler()
robust_scaler = RobustScaler()
num_cols = X.select_dtypes(['integer', 'float']).columns

# Learn the scaling statistics from the training split only, then apply them
# to both splits so no validation information leaks into the transform.
znormalizer.fit(X_train[num_cols])
robust_scaler.fit(X_train[num_cols])

X_train_norm = pd.DataFrame(znormalizer.transform(X_train[num_cols]), columns=num_cols)
X_valid_norm = pd.DataFrame(znormalizer.transform(X_valid[num_cols]), columns=num_cols)

X_train_robust = pd.DataFrame(robust_scaler.transform(X_train[num_cols]), columns=num_cols)
X_valid_robust = pd.DataFrame(robust_scaler.transform(X_valid[num_cols]), columns=num_cols)

# Backward-compatible alias for the original, misspelled name (double underscore).
X__train_robust = X_train_robust

In [4]:

# Refit both scalers on the complete training table and scale every row.
# This overwrites the statistics previously learned from X_train alone, so
# from here on `znormalizer` / `robust_scaler` describe ALL the data.
all_numeric = X[num_cols]

X_norm = pd.DataFrame(znormalizer.fit_transform(all_numeric), columns=num_cols)
X_robust = pd.DataFrame(robust_scaler.fit_transform(all_numeric), columns=num_cols)


In [6]:
## Best Logistic Regression (Kaggle Score 0.74554)

# L2-penalised logistic regression without intercept, fitted on the
# z-normalised features. The 'saga' solver shuffles the data internally, so the
# seed is pinned (as for the other models in this notebook) to make the fitted
# coefficients, and therefore the score, repeatable across runs.
best_LR = LogisticRegression(C=0.0002,
                             solver='saga',
                             penalty='l2',
                             fit_intercept=False,
                             max_iter=400,
                             random_state=0
                             )

best_LR.fit(X_norm, y)
#best_LR.fit(X_train_norm, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_hat_ALL_logit_proba = best_LR.predict_proba(X_norm)[:, 1]
#y_hat_train_logit_proba = best_LR.predict_proba(X_train_norm)[:, 1]
#y_hat_valid_logit_proba = best_LR.predict_proba(X_valid_norm)[:, 1]

# In-sample score: the model is evaluated on the same rows it was fitted on.
auc_score_ALL_logit = roc_auc_score(y, y_hat_ALL_logit_proba) * 100
# Hold-out variant: the training score must be computed against y_train.
#auc_score_train_logit = roc_auc_score(y_train, y_hat_train_logit_proba) * 100
#auc_score_valid_logit = roc_auc_score(y_test, y_hat_valid_logit_proba) * 100

print("ROC_AUC Score = {:.2f}%  of Logistic Regression Model on ALL the data.".format(auc_score_ALL_logit))
#print("ROC_AUC Score = {:.2f}%  of Logistic Regression Model on the training data.".format(auc_score_train_logit))
#print("ROC_AUC Score = {:.2f}%  of Logistic Regression Model on the validation data.".format(auc_score_valid_logit))

ROC_AUC Score = 74.92%  of Logistic Regression Model on ALL the data.


In [7]:
### Best LinearSVC (Kaggle Score 0.74582)

# Linear SVM with hinge loss and no intercept on the z-normalised features.
# The seed is pinned (as for the other models in this notebook) so repeated
# runs produce the same coefficients.
best_LinearSVC = LinearSVC(penalty='l2',
                           loss='hinge',
                           #tol=0.0001,
                           C=0.001,
                           fit_intercept=False,
                           #verbose=0,
                           random_state=0,
                           max_iter=1500)

best_LinearSVC.fit(X_norm, y)
#best_LinearSVC.fit(X_train_norm, y_train)

# LinearSVC has no predict_proba, so wrap the already-fitted model in a sigmoid
# calibrator to obtain probabilities. The estimator is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases; the positional form works with either name.
# NOTE(review): with cv='prefit' the scikit-learn docs recommend calibrating on
# data disjoint from the data used to fit the estimator; here both use X_norm.
cclf = CalibratedClassifierCV(best_LinearSVC, method='sigmoid', cv='prefit')

cclf.fit(X_norm, y)
#cclf.fit(X_train_norm, y_train)

# Column 1 of predict_proba is the probability of the positive class.
y_hat_ALL_cclf_proba = cclf.predict_proba(X_norm)[:, 1]
#y_hat_train_cclf_proba = cclf.predict_proba(X_train_norm)[:, 1]
#y_hat_valid_cclf_proba = cclf.predict_proba(X_valid_norm)[:, 1]

# In-sample score: evaluated on the same rows used for fitting and calibration.
auc_score_ALL_cclf = roc_auc_score(y, y_hat_ALL_cclf_proba) * 100
# Hold-out variant: the training score must be computed against y_train.
#auc_score_train_cclf = roc_auc_score(y_train, y_hat_train_cclf_proba) * 100
#auc_score_valid_cclf = roc_auc_score(y_test, y_hat_valid_cclf_proba) * 100

print("ROC_AUC Score = {:.2f}%  of LinearSVC Model on ALL the data.".format(auc_score_ALL_cclf))
#print("ROC_AUC Score = {:.2f}%  of LinearSVC Model on the training data.".format(auc_score_train_cclf))
#print("ROC_AUC Score = {:.2f}%  of LinearSVC Model on the validation data.".format(auc_score_valid_cclf))

ROC_AUC Score = 74.93%  of LinearSVC Model on ALL the data.


In [8]:
### Best XGBClassifier (Kaggle Score 0.74013)

# Hyper-parameters of the gradient-boosted tree model, grouped by role.
xgbc_params = dict(
    # learning task
    objective='binary:logistic',
    eval_metric='aucpr',
    booster='gbtree',
    # ensemble size and step
    n_estimators=1500,
    learning_rate=0.05,
    # tree shape
    max_depth=3,
    min_child_weight=1,
    gamma=0.05,
    # sampling and regularisation
    subsample=0.6,
    reg_alpha=0.3,
    reg_lambda=0.1,
    random_state=0,
)
best_XGBC = XGBClassifier(**xgbc_params)

# Fit on every labelled row of the z-normalised table.
best_XGBC.fit(X_norm, y)

# Positive-class probability (second column of predict_proba) and its
# in-sample ROC AUC, expressed as a percentage.
y_hat_ALL_xgbc_proba = best_XGBC.predict_proba(X_norm)[:, 1]
auc_score_ALL_xgbc = 100 * roc_auc_score(y, y_hat_ALL_xgbc_proba)

print("ROC_AUC Score = {:.2f}%  of XGBoost Classifier Model on ALL the data.".format(auc_score_ALL_xgbc))

# Hold-out variant (fit on the training split, score both splits):
#best_XGBC.fit(X_train_norm, y_train)
#y_hat_train_xgbc_proba = best_XGBC.predict_proba(X_train_norm)[:, 1]
#y_hat_valid_xgbc_proba = best_XGBC.predict_proba(X_valid_norm)[:, 1]
#auc_score_train_xgbc = roc_auc_score(y_train, y_hat_train_xgbc_proba) * 100
#auc_score_valid_xgbc = roc_auc_score(y_test, y_hat_valid_xgbc_proba) * 100
#print("ROC_AUC Score = {:.2f}%  of XGBoost Classifier Model on the training data.".format(auc_score_train_xgbc))
#print("ROC_AUC Score = {:.2f}%  of XGBoost Classifier Model on the validation data.".format(auc_score_valid_xgbc))

ROC_AUC Score = 76.05%  of XGBoost Classifier Model on ALL the data.


In [9]:
### Best MLPClassifier (Kaggle Score 0.74800)

# Configuration of the small two-hidden-layer network (4 and 2 units).
mlpc_params = dict(
    # architecture
    hidden_layer_sizes=(4, 2),
    activation='relu',
    # optimiser
    solver='adam',
    learning_rate='adaptive',
    learning_rate_init=0.00005,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    momentum=0.9,
    # regularisation and batching
    alpha=0.00005,
    batch_size=4096,
    shuffle=True,
    # stopping
    max_iter=2000,
    tol=0.00001,
    n_iter_no_change=10,
    early_stopping=False,
    validation_fraction=0.1,
    # misc
    verbose=False,
    random_state=0,
)
best_mlpc = MLPClassifier(**mlpc_params)

# Fit on every labelled row of the z-normalised table.
best_mlpc.fit(X_norm, y)

# Positive-class probability (second column of predict_proba) and its
# in-sample ROC AUC, expressed as a percentage.
y_hat_ALL_mlpc_proba = best_mlpc.predict_proba(X_norm)[:, 1]
auc_score_ALL_mlpc = 100 * roc_auc_score(y, y_hat_ALL_mlpc_proba)

print("ROC_AUC Score = {:.2f}%  of NN Model on ALL the data.".format(auc_score_ALL_mlpc))

# Hold-out variant (fit on the training split, score both splits):
#best_mlpc.fit(X_train_norm, y_train)
#y_hat_train_mlpc_proba = best_mlpc.predict_proba(X_train_norm)[:, 1]
#y_hat_valid_mlpc_proba = best_mlpc.predict_proba(X_valid_norm)[:, 1]
#auc_score_train_mlpc = roc_auc_score(y_train, y_hat_train_mlpc_proba) * 100
#auc_score_valid_mlpc = roc_auc_score(y_test, y_hat_valid_mlpc_proba) * 100
#print("ROC_AUC Score = {:.2f}%  of NN Model on the training data.".format(auc_score_train_mlpc))
#print("ROC_AUC Score = {:.2f}%  of NN Model on the validation data.".format(auc_score_valid_mlpc))


ROC_AUC Score = 75.36%  of NN Model on ALL the data.


In [10]:
# Level-1 table for stacking: one column of positive-class probabilities per
# base model (computed on ALL labelled rows), followed by the true label.
stack_columns = {
    model_name: np.ravel(model_proba)
    for model_name, model_proba in (
        ('LogReg', y_hat_ALL_logit_proba),
        ('LinearSVC', y_hat_ALL_cclf_proba),
        ('XGBClassifier', y_hat_ALL_xgbc_proba),
        ('MLPClassifier', y_hat_ALL_mlpc_proba),
    )
}
stack_columns['target'] = y
base_predictions_ALL = pd.DataFrame(stack_columns)

# Hold-out variants follow the same layout, built from the y_hat_train_* /
# y_hat_valid_* arrays with y_train / y_test as the label column:
#base_predictions_train = pd.DataFrame({'LogReg': y_hat_train_logit_proba.ravel(),
#     'LinearSVC': y_hat_train_cclf_proba.ravel(),
#     'XGBClassifier': y_hat_train_xgbc_proba.ravel(),
#     'MLPClassifier': y_hat_train_mlpc_proba.ravel(),
#     'target': y_train
#    })
#base_predictions_valid = pd.DataFrame({'LogReg': y_hat_valid_logit_proba.ravel(),
#     'LinearSVC': y_hat_valid_cclf_proba.ravel(),
#     'XGBClassifier': y_hat_valid_xgbc_proba.ravel(),
#     'MLPClassifier': y_hat_valid_mlpc_proba.ravel(),
#     'target': y_test
#    })

base_predictions_ALL.head()

Unnamed: 0,LogReg,LinearSVC,XGBClassifier,MLPClassifier,target
0,0.143061,0.101987,0.190536,0.265471,0
1,0.211016,0.170361,0.157204,0.241808,0
2,0.299475,0.266636,0.292053,0.247996,0
3,0.745033,0.778167,0.681992,0.749023,0
4,0.686405,0.729042,0.661039,0.749023,1


In [61]:
# Stacked model predictions -- Kaggle scores observed per configuration:
#   C=0.001                      -> 0.74718
#   C=0.005                      -> 0.74820
#   C=0.01                       -> 0.74825
#   C=0.1                        -> 0.74770
#   C=0.01, minus XGBC           -> 0.74760
#   C=1,    minus XGBC           -> 0.74873
#   C=0.5,  minus XGBC           -> 0.74866
#   C=1,    minus XGBC & LogReg  -> 0.74837
#   C=1,    minus XGBC & linearSVC -> 0.74834

# Meta-features: every base-model column except XGBClassifier (and the label).
# NOTE(review): these are in-sample base-model predictions, i.e. the
# meta-learner is trained on outputs for rows the base models were fitted on.
stack_features_ALL = base_predictions_ALL.drop(columns=['XGBClassifier', 'target'])
stack_target_ALL = base_predictions_ALL['target']

final_stack_LR_ALL = LogisticRegression(
    penalty='l2',
    C=1,
    solver='lbfgs',
    fit_intercept=True,
    max_iter=200,
    random_state=0,
)
final_stack_LR_ALL.fit(stack_features_ALL, stack_target_ALL)

# Positive-class probability of the meta-learner and its in-sample ROC AUC (%).
y_hat_ALL_stack_LR_proba = final_stack_LR_ALL.predict_proba(stack_features_ALL)[:, 1]
auc_score_ALL_stack_LR = 100 * roc_auc_score(y, y_hat_ALL_stack_LR_proba)

print("ROC_AUC Score = {:.2f}%  of Stacked LogReg Model on ALL the data.".format(auc_score_ALL_stack_LR))

# Hold-out variant (meta-learner fitted on the training-split predictions):
#final_stack_LR_train.fit(base_predictions_train.drop(columns = ['target']), base_predictions_train['target'])
#y_hat_train_stack_LR_proba=final_stack_LR_ALL.predict_proba(base_predictions_train.drop(columns = ['target']))[:, 1]
#y_hat_valid_stack_LR_proba=final_stack_LR_ALL.predict_proba(base_predictions_valid.drop(columns = ['target']))[:, 1]
#auc_score_train_stack_LR = roc_auc_score(y_train, y_hat_train_stack_LR_proba) * 100
#auc_score_valid_stack_LR = roc_auc_score(y_test, y_hat_valid_stack_LR_proba) * 100
#print("ROC_AUC Score = {:.2f}%  of Stacked LogReg Model on the training data.".format(auc_score_train_stack_LR))
#print("ROC_AUC Score = {:.2f}%  of Stacked LogReg Model on the validation data.".format(auc_score_valid_stack_LR))


ROC_AUC Score = 75.55%  of Stacked LogReg Model on ALL the data.


In [62]:
# Side-by-side recap of the in-sample ROC AUC (%) of each base model and of
# the stacked model, printed in the order the models were fitted.
score_report = [
    ("ROC_AUC Score = {:.2f}%  of Logistic Regression Model on ALL the data.", auc_score_ALL_logit),
    ("ROC_AUC Score = {:.2f}%  of LinearSVC Model on ALL the data.", auc_score_ALL_cclf),
    ("ROC_AUC Score = {:.2f}%  of XGBoost Classifier Model on ALL the data.", auc_score_ALL_xgbc),
    ("ROC_AUC Score = {:.2f}%  of NN Model on ALL the data.", auc_score_ALL_mlpc),
    ("ROC_AUC Score = {:.2f}%  of Stacked LogReg Model on ALL the data.", auc_score_ALL_stack_LR),
]
for report_template, report_score in score_report:
    print(report_template.format(report_score))

ROC_AUC Score = 74.92%  of Logistic Regression Model on ALL the data.
ROC_AUC Score = 74.93%  of LinearSVC Model on ALL the data.
ROC_AUC Score = 76.05%  of XGBoost Classifier Model on ALL the data.
ROC_AUC Score = 75.36%  of NN Model on ALL the data.
ROC_AUC Score = 75.55%  of Stacked LogReg Model on ALL the data.


In [63]:
# Prediction of Stacked Model for the Kaggle Test Dataset

data_test = pd.read_csv("./data/test.csv", sep=",")

# Scale the test features with statistics of the full labelled table. The
# scaler is refitted here so this cell does not depend on which fit ran last.
data_test_norm = pd.DataFrame(
    znormalizer.fit(X[num_cols]).transform(data_test[num_cols]),
    columns=num_cols,
)

# Positive-class probabilities of the four base models on the test rows.
y_hat_test_logit_proba = best_LR.predict_proba(data_test_norm)[:, 1]
y_hat_test_cclf_proba = cclf.predict_proba(data_test_norm)[:, 1]
y_hat_test_xgbc_proba = best_XGBC.predict_proba(data_test_norm)[:, 1]
y_hat_test_mlpc_proba = best_mlpc.predict_proba(data_test_norm)[:, 1]

# Same column layout as the level-1 training table (without the label).
base_predictions_test = pd.DataFrame({
    test_model_name: np.ravel(test_model_proba)
    for test_model_name, test_model_proba in (
        ('LogReg', y_hat_test_logit_proba),
        ('LinearSVC', y_hat_test_cclf_proba),
        ('XGBClassifier', y_hat_test_xgbc_proba),
        ('MLPClassifier', y_hat_test_mlpc_proba),
    )
})

# The meta-learner was fitted without the XGBClassifier column, so drop it here too.
stack_features_test = base_predictions_test.drop(columns=['XGBClassifier'])
test_predict = final_stack_LR_ALL.predict_proba(stack_features_test)[:, 1].astype(float)
array = test_predict.tolist()

# Submission file: integer id plus predicted probability, no index column.
df_stack_LogReg = pd.DataFrame({
    'id': data_test['id'].astype(int),
    'target': np.array(array),
})
df_stack_LogReg.to_csv('Tab-Nov-2021_stacked_LogReg_final.csv', sep=',', encoding='utf-8', index=False)

In [26]:
# Prediction of MLPClassifier (fitted above and used in the stacked model) for the Kaggle Test Dataset
# Relies on `data_test` and `data_test_norm` created in the stacked-model
# prediction cell, so that cell has to be run first.

test_predict = best_mlpc.predict_proba(data_test_norm)[:, 1].astype(float)
array = test_predict.tolist()

# Submission file: integer id plus predicted probability, no index column.
df_mlpc = pd.DataFrame({
    'id': data_test['id'].astype(int),
    'target': np.array(array),
})
df_mlpc.to_csv('Tab-Nov-2021_MLPClassifier_final.csv', sep=',', encoding='utf-8', index=False)