In [59]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib as mplt
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.pipeline import Pipeline

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf

from tensorflow.keras.optimizers import Adam


In [60]:
np.random.seed(42)
tf.random.set_seed(42)

In [79]:
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')
submission_file = pd.read_csv('Data/sample_submission.csv')

In [62]:
train_data.drop(columns=['id'], axis=1, inplace=True)

In [63]:
print(train_data.shape)
print(test_data.shape)

(600000, 101)
(540000, 101)


In [64]:
X, y = train_data.drop(columns = ['target']), train_data['target']


In [65]:
dtypes = train_data.dtypes
dtypes = dtypes[dtypes != 'object']
features = list(set(dtypes.index) - set(['target']))

len(features)

100

In [66]:
train_data.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,target
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496,0
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719,0
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209,0
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873,0
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798,1


In [67]:
X.describe()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
count,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,...,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0,600000.0
mean,0.306508,2.49759,306.644536,2.647901,0.17785,2.556832,2.69965,2.571593,2.538273,0.13437,...,0.071252,2.444471,0.15526,0.059407,0.144932,0.106419,2.547853,2.590159,0.158881,0.123048
std,0.52245,1.554018,551.743893,1.544529,0.417488,1.562527,1.564,1.549361,1.532988,0.421892,...,0.112654,1.542509,0.548397,0.119426,0.462015,0.209128,1.558427,1.525091,0.43619,0.264896
min,-3.79745,-1.22396,-1842.53,-1.36856,-3.20621,-1.16977,-1.05931,-1.28197,-1.24202,-2.57784,...,-3.67699,-1.2177,-9.76177,-4.66624,-3.1015,-1.27654,-1.58474,-1.25473,-3.9935,-2.78338
25%,0.026222,1.186237,43.5734,1.442028,0.019709,1.261038,1.38582,1.333848,1.292163,0.019563,...,0.020496,1.214177,0.018904,0.024483,0.017055,0.025461,1.247888,1.348078,0.013536,0.018105
50%,0.097788,2.5165,133.626,2.63413,0.061586,2.590425,2.801255,2.557985,2.47588,0.058752,...,0.054546,2.386845,0.068906,0.056649,0.063439,0.062151,2.60194,2.68209,0.058058,0.058471
75%,0.397184,3.78763,302.26225,3.90764,0.112712,3.813662,3.996913,3.82345,3.80436,0.101046,...,0.091619,3.693872,0.125165,0.088162,0.113114,0.102016,3.820665,3.83952,0.110718,0.104872
max,8.7815,6.22672,6119.28,6.52115,8.26547,6.51507,6.58678,6.25877,6.38967,7.07846,...,6.48294,6.57389,18.4128,10.2118,8.62327,3.65722,6.25436,6.1453,10.767,5.98811


In [68]:
y.describe()

count    600000.000000
mean          0.506010
std           0.499964
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: target, dtype: float64

In [69]:
X = X.astype(float)
y = y.astype(int)

X.fillna(X.mean(), inplace=True)
y.fillna(y.mean(), inplace=True)

print(f'check for null value in X: {X.isnull().sum().sum()}')
print(f'check for null value in y: {y.isnull().sum().sum()}')

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

In [75]:
scaler = StandardScaler()

num_cols = X_train.select_dtypes(['integer', 'float']).columns

X_train = pd.DataFrame(scaler.fit_transform(X_train[num_cols]), columns=num_cols)
X_test = pd.DataFrame(scaler.fit_transform(X_test[num_cols]), columns=num_cols)

In [77]:
# Function to create model, required for KerasClassifier
def create_model(optimizer='rmsprop', init='glorot_uniform'):

	# create model
	model = Sequential()
	model.add(Dense(128, input_dim=100, kernel_initializer=init, activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(64, kernel_initializer=init, activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(32, kernel_initializer=init, activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(16, kernel_initializer=init, activation='relu'))
	model.add(Dropout(0.2))	
	model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))

	# create model
	#model = Sequential()
	#model.add(Dense(32, input_dim=100, kernel_initializer=init, activation='relu'))
	#model.add(Dense(16, kernel_initializer=init, activation='relu'))
	#model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))	
	
	# create model
	#model = Sequential()
	#model.add(Dense(128, input_dim=100, kernel_initializer=init, activation='relu'))
	#model.add(Dense(64, kernel_initializer=init, activation='relu'))
	#model.add(Dense(32, kernel_initializer=init, activation='relu'))
	#model.add(Dense(16, kernel_initializer=init, activation='relu'))
	#model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))

	# Compile model
	model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=0.001), metrics=['accuracy'])
	return model

# create model
model = KerasClassifier(build_fn=create_model, verbose=0)

# grid search epochs, batch size and optimizer
#optimizers = ['rmsprop', 'adam']
#init = ['glorot_uniform', 'normal', 'uniform']
#epochs = [100, 150]
#batches = [1024, 2048]

optimizers = ['adam']
init = ['glorot_uniform']
epochs = [100] #[1000]
batches = [1024]

param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=8)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
	print("%f (%f) with: %r" % (mean, stdev, param))

y_hat_train = grid.predict(X_train)
y_hat_test = grid.predict(X_test)

train_score = accuracy_score(y_train, y_hat_train, normalize=False)
print(f'trian score: {train_score / y_train.shape[0]}')

test_score = accuracy_score(y_test, y_hat_test, normalize=False)
print(f'test score: {test_score / y_test.shape[0]}')

precision_train_logit_grid = precision_score(y_train, y_hat_train) * 100
precision_test_logit_grid = precision_score(y_test, y_hat_test) * 100

recall_train_logit_grid = recall_score(y_train, y_hat_train) * 100
recall_test_logit_grid = recall_score(y_test, y_hat_test) * 100

f1_score_train_logit=f1_score(y_train, y_hat_train) * 100
f1_score_test_logit=f1_score(y_test, y_hat_test) * 100

auc_score_train_logit_grid = roc_auc_score(y_train, y_hat_train) * 100
auc_score_test_logit_grid = roc_auc_score(y_test, y_hat_test) * 100

print("Precision = {:.2f}% , recall = {:.2f}% and f1_score={:.2f}% of the Logistic Regression Model on the training data.".format(precision_train_logit_grid, recall_train_logit_grid, f1_score_train_logit))
print("Precision = {:.2f}% , recall = {:.2f}% and f1_score={:.2f}% of the Logistic Regression Model on the validation data.".format(precision_test_logit_grid, recall_test_logit_grid, f1_score_test_logit_grid))
print("ROC_AUC Score = {:.2f}%  of Logistic Regression Model on the training data.".format(auc_score_train_logit_grid))
print("ROC_AUC Score = {:.2f}%  of Logistic Regression Model on the validation data.".format(auc_score_test_logit_grid))

  model = KerasClassifier(build_fn=create_model, verbose=0)
2021-11-25 10:54:02.492529: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-25 10:54:02.492974: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-25 10:54:02.493914: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To ena

Best: 0.708082 using {'batch_size': 2048, 'epochs': 1000, 'init': 'glorot_uniform', 'optimizer': 'adam'}
0.708082 (0.002871) with: {'batch_size': 2048, 'epochs': 1000, 'init': 'glorot_uniform', 'optimizer': 'adam'}
trian score: 0.7860022222222223
test score: 0.71152
Precision = 78.26% , recall = 79.89% and f1_score=79.07% of the Logistic Regression Model on the training data.
Precision = 71.13% , recall = 72.42% and f1_score=75.00% of the Logistic Regression Model on the validation data.
ROC_AUC Score = 78.58%  of Logistic Regression Model on the training data.
ROC_AUC Score = 71.14%  of Logistic Regression Model on the validation data.


In [80]:
data_test_norm = pd.DataFrame(scaler.transform(test_data[num_cols]), columns = num_cols)

test_predict = grid.predict_proba(data_test_norm)[::,1]
test_predict=test_predict.astype(float)
array=np.array(test_predict).tolist()
df=pd.DataFrame(test_data['id'])
df['id'] = df['id'].astype(int)
df['target'] = np.array(array)
df.to_csv('results/keras_results.csv', sep=',', encoding='utf-8', index=False)