In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib as mplt
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

from sklearn.ensemble import StackingClassifier

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adamax

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier

from xgboost import XGBClassifier 

import warnings
warnings.filterwarnings('ignore')
 


In [2]:
# Load the training set (point the path at your own copy of the data) and
# discard the 'id' column so it is not carried along as a feature.
data_train = pd.read_csv("./data/train.csv", sep=",").drop(columns=["id"])
data_train.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209,0
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


In [3]:
# Hold out 25% of the rows for validation.
# random_state pins the shuffle so a Restart & Run All reproduces the same
# partition (and therefore the same scores); stratify keeps the target class
# balance identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    data_train.drop(columns='target'),
    data_train['target'],
    test_size=0.25,
    stratify=data_train['target'],
    random_state=0,
)

# Give features AND labels the same fresh 0..n-1 index so they stay row-aligned
# with each other and with any frame rebuilt from the features.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [4]:
# Full feature matrix / label vector of the loaded data set.
X = data_train.drop(columns='target')
y = data_train['target']

# Only the numeric columns are scaled.
num_cols = X_train.select_dtypes(['integer', 'float']).columns

# Both scalers learn their statistics from the training split only; the same
# fitted transform is then applied to the training and validation splits.
znormalizer = StandardScaler().fit(X_train[num_cols])
robust_scaler = RobustScaler().fit(X_train[num_cols])

# z-score scaled copies of each split.
X_train_norm, X_test_norm = (
    pd.DataFrame(znormalizer.transform(split[num_cols]), columns=num_cols)
    for split in (X_train, X_test)
)

# Robust-scaled copies of each split.
X_train_robust, X_test_robust = (
    pd.DataFrame(robust_scaler.transform(split[num_cols]), columns=num_cols)
    for split in (X_train, X_test)
)



In [5]:
## Best Logistic Regression
# L2-penalised logistic regression with a very small C (strong regularisation)
# and no intercept, trained on the z-scored features.
# random_state is pinned because the 'saga' solver shuffles the data; without a
# seed the fitted coefficients (and the scores printed below) change between runs.
best_LR = LogisticRegression(
    C=0.0002,
    solver='saga',
    penalty='l2',
    fit_intercept=False,
    max_iter=400,
    random_state=0,
)

best_LR.fit(X_train_norm, y_train)

# Column 1 of predict_proba is the probability of the second class in
# best_LR.classes_ (target == 1 for a 0/1 target).
y_hat_train_logit_proba = best_LR.predict_proba(X_train_norm)[:, 1]
y_hat_test_logit_proba = best_LR.predict_proba(X_test_norm)[:, 1]

# ROC AUC expressed as a percentage.
auc_score_train_logit = roc_auc_score(y_train, y_hat_train_logit_proba) * 100
auc_score_test_logit = roc_auc_score(y_test, y_hat_test_logit_proba) * 100

print("ROC_AUC Score = {:.8f}%  of Logistic Regression Model on the training data.".format(auc_score_train_logit))
print("ROC_AUC Score = {:.8f}%  of Logistic Regression Model on the validation data.".format(auc_score_test_logit))


ROC_AUC Score = 74.92015638%  of Logistic Regression Model on the training data.
ROC_AUC Score = 74.92631427%  of Logistic Regression Model on the validation data.


In [6]:
### Best XGBClassifier
# Gradient-boosted trees with the selected hyper-parameters (shallow trees,
# small learning rate, many boosting rounds), trained on the z-scored features.
best_XGBC = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='aucpr',
    n_estimators=1500,
    learning_rate=0.05,
    max_depth=3,
    min_child_weight=1,
    gamma=0.05,
    subsample=0.6,
    reg_alpha=0.3,
    reg_lambda=0.1,
    random_state=0,
)
best_XGBC.fit(X_train_norm, y_train)

# Keep only column 1 of predict_proba (the second class in best_XGBC.classes_).
y_hat_train_xgbc_proba = best_XGBC.predict_proba(X_train_norm)[:, 1]
y_hat_test_xgbc_proba = best_XGBC.predict_proba(X_test_norm)[:, 1]

# ROC AUC expressed as a percentage.
auc_score_train_xgbc = 100 * roc_auc_score(y_train, y_hat_train_xgbc_proba)
auc_score_test_xgbc = 100 * roc_auc_score(y_test, y_hat_test_xgbc_proba)

print("ROC_AUC Score = {:.8f}%  of XGBoost Classifier Model on the training data.".format(auc_score_train_xgbc))
print("ROC_AUC Score = {:.8f}%  of XGBoost Classifier Model on the validation data.".format(auc_score_test_xgbc))


In [None]:
### Best LinearSVC
# Linear SVM (hinge loss, L2 penalty, no intercept) trained on the z-scored
# features. random_state is pinned because the solver used for hinge loss
# shuffles the data, which otherwise makes the fit vary between runs.
best_LinearSVC = LinearSVC(
    penalty='l2',
    loss='hinge',
    C=0.001,
    fit_intercept=False,
    max_iter=1500,
    random_state=0,
)

best_LinearSVC.fit(X_train_norm, y_train)

# LinearSVC has no predict_proba, so wrap the already-fitted model in a sigmoid
# calibrator to obtain probabilities.
# The estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both.
# NOTE(review): the calibrator is fitted on the same rows the SVC was trained
# on, whereas scikit-learn recommends disjoint data for cv='prefit'. The sigmoid
# map is monotonic, so the ROC AUC below should not be affected, but the
# probability values themselves may be over-confident - confirm before relying
# on them as calibrated probabilities.
cclf = CalibratedClassifierCV(best_LinearSVC, method='sigmoid', cv='prefit')

cclf.fit(X_train_norm, y_train)

# Column 1 of predict_proba is the probability of the second class in cclf.classes_.
y_hat_train_cclf_proba = cclf.predict_proba(X_train_norm)[:, 1]
y_hat_test_cclf_proba = cclf.predict_proba(X_test_norm)[:, 1]

# ROC AUC expressed as a percentage.
auc_score_train_cclf = roc_auc_score(y_train, y_hat_train_cclf_proba) * 100
auc_score_test_cclf = roc_auc_score(y_test, y_hat_test_cclf_proba) * 100

print("ROC_AUC Score = {:.8f}%  of LinearSVC Model on the training data.".format(auc_score_train_cclf))
print("ROC_AUC Score = {:.8f}%  of LinearSVC Model on the validation data.".format(auc_score_test_cclf))

In [None]:
###Best Keras Classifier
# create model

def create_keras_sequential_model(optimizer='adam', init='glorot_uniform', input_dim=100):
    """Build and compile the feed-forward binary classifier.

    Architecture: Dense 128 -> 64 -> 32 -> 16 (ReLU), each followed by
    Dropout(0.2), then a single sigmoid output unit.

    Parameters
    ----------
    optimizer : str or keras optimizer instance
        Optimizer handed to ``model.compile``. The default ``'adam'`` resolves
        to Adam with Keras' default learning rate (0.001), i.e. the same
        setting that used to be hard-coded here.
    init : str
        Kernel initializer used by every Dense layer.
    input_dim : int
        Number of input features (100 matches columns f0..f99 of the data).

    Returns
    -------
    A compiled ``Sequential`` model using binary cross-entropy loss and
    reporting accuracy.
    """
    model = Sequential()

    # First hidden layer also declares the input width.
    model.add(Dense(128, input_dim=input_dim, kernel_initializer=init, activation='relu'))
    model.add(Dropout(0.2))

    # Remaining hidden layers: progressively narrower, each with dropout.
    for units in (64, 32, 16):
        model.add(Dense(units, kernel_initializer=init, activation='relu'))
        model.add(Dropout(0.2))

    model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))

    # Honour the `optimizer` argument instead of silently ignoring it; passing
    # it through also avoids mixing a tensorflow.keras optimizer object into a
    # model built from the standalone keras package.
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    return model

# https://github.com/keras-team/keras/issues/13669
# http://rasbt.github.io/mlxtend/
# Wrap the network builder in the scikit-learn adapter; the keyword arguments
# are either forwarded to the builder (optimizer, init) or are training
# settings (epochs, batch_size, verbose).
best_keras = KerasClassifier(
    build_fn=create_keras_sequential_model,
    optimizer='adam',
    init='glorot_uniform',
    epochs=140,
    batch_size=2048,
    verbose=0,
)
# Tag the wrapper as a classifier by hand (see the issue linked above);
# presumably needed so scikit-learn meta-estimators accept it - TODO confirm.
best_keras._estimator_type = 'classifier'

best_keras.fit(X_train_norm, y_train)

# Keep only column 1 of the wrapper's predict_proba output.
y_hat_train_keras_proba = best_keras.predict_proba(X_train_norm)[:, 1]
y_hat_test_keras_proba = best_keras.predict_proba(X_test_norm)[:, 1]

# ROC AUC expressed as a percentage.
auc_score_train_keras = 100 * roc_auc_score(y_train, y_hat_train_keras_proba)
auc_score_test_keras = 100 * roc_auc_score(y_test, y_hat_test_keras_proba)

print("ROC_AUC Score = {:.8f}%  of NN Model on the training data.".format(auc_score_train_keras))
print("ROC_AUC Score = {:.8f}%  of NN Model on the validation data.".format(auc_score_test_keras))


In [None]:
####Add Ensemble Model Code here



## Ensemble - StackingClassifier

In [None]:
# Base learners for the stack: the four tuned models defined in the cells above.
# A plain LogisticRegression combines their outputs.
estimators = [('LR', best_LR), ('XGBC', best_XGBC), ('LinearSVC', best_LinearSVC), ('keras', best_keras)]

clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())

# Fit and evaluate on the z-scored features. Every base model above was tuned
# and assessed on X_train_norm / X_test_norm; feeding the raw, unscaled frames
# here would give the strongly regularised linear models and the neural network
# inputs on a different scale from the one they were selected for.
clf.fit(X_train_norm, y_train)

# Mean accuracy on the validation split.
print("Stacking model score: %.8f" % clf.score(X_test_norm, y_test))

# Also report ROC AUC (as a percentage) so the stack is directly comparable
# with the individual models, which are all scored with ROC AUC.
y_hat_train_stack_proba = clf.predict_proba(X_train_norm)[:, 1]
y_hat_test_stack_proba = clf.predict_proba(X_test_norm)[:, 1]

auc_score_train_stack = roc_auc_score(y_train, y_hat_train_stack_proba) * 100
auc_score_test_stack = roc_auc_score(y_test, y_hat_test_stack_proba) * 100

print("ROC_AUC Score = {:.8f}%  of Stacking Classifier Model on the training data.".format(auc_score_train_stack))
print("ROC_AUC Score = {:.8f}%  of Stacking Classifier Model on the validation data.".format(auc_score_test_stack))