In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib as mplt
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score, f1_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.pipeline import Pipeline

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier

import tensorflow as tf

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import Adamax


In [3]:
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv('Data/test.csv')
submission_file = pd.read_csv('Data/sample_submission.csv')

In [5]:
train_data.drop(columns=['id'], axis=1, inplace=True)

In [6]:
print(train_data.shape)
print(test_data.shape)

(600000, 101)
(540000, 101)


In [7]:
X, y = train_data.drop(columns = ['target']), train_data['target']


In [8]:
dtypes = train_data.dtypes
dtypes = dtypes[dtypes != 'object']
features = list(set(dtypes.index) - set(['target']))

len(features)

100

In [9]:
X.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f90,f91,f92,f93,f94,f95,f96,f97,f98,f99
0,0.106643,3.59437,132.804,3.18428,0.081971,1.18859,3.73238,2.26627,2.09959,0.01233,...,0.010739,1.09862,0.013331,-0.011715,0.052759,0.0654,4.21125,1.97877,0.085974,0.240496
1,0.125021,1.67336,76.5336,3.37825,0.0994,5.09366,1.27562,-0.471318,4.54594,0.037706,...,0.135838,3.46017,0.017054,0.124863,0.154064,0.606848,-0.267928,2.57786,-0.020877,0.024719
2,0.03633,1.49747,233.546,2.19435,0.026914,3.12694,5.05687,3.84946,1.80187,0.056995,...,0.11731,4.883,0.085222,0.032396,0.116092,-0.001688,-0.520069,2.14112,0.124464,0.148209
3,-0.014077,0.246,779.967,1.89064,0.006948,1.53112,2.698,4.51733,4.50332,0.123494,...,-0.015347,3.47439,-0.017103,-0.0081,0.062013,0.041193,0.511657,1.9686,0.040017,0.044873
4,-0.003259,3.71542,156.128,2.14772,0.018284,2.09859,4.15492,-0.038236,3.37145,0.034166,...,0.013781,1.91059,-0.042943,0.105616,0.125072,0.037509,1.04379,1.07481,-0.012819,0.072798


In [10]:
y.describe()

count    600000.000000
mean          0.506010
std           0.499964
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: target, dtype: float64

In [11]:
X = X.astype(float)
y = y.astype(int)

X.fillna(X.mean(), inplace=True)
y.fillna(y.mean(), inplace=True)

print(f'check for null value in X: {X.isnull().sum().sum()}')
print(f'check for null value in y: {y.isnull().sum().sum()}')

check for null value in X: 0
check for null value in y: 0


## Define Common Functions

In [12]:
def report_results(model_name, y_test, y_train, grid_search_model, grid_search_results):

	# summarize results
	print("Best: %f using %s" % (grid_search_results.best_score_, grid_search_results.best_params_))
	means = grid_search_results.cv_results_['mean_test_score']
	stds = grid_search_results.cv_results_['std_test_score']
	params = grid_search_results.cv_results_['params']

	for mean, stdev, param in zip(means, stds, params):
		print("%f (%f) with: %r" % (mean, stdev, param))

	y_hat_train = grid_search_model.predict(X_train)
	y_hat_test = grid_search_model.predict(X_test)

	train_score = accuracy_score(y_train, y_hat_train, normalize=False)
	print(f'trian score: {train_score / y_train.shape[0]}')

	test_score = accuracy_score(y_test, y_hat_test, normalize=False)
	print(f'test score: {test_score / y_test.shape[0]}')

	precision_train_score = precision_score(y_train, y_hat_train) * 100
	precision_test_score = precision_score(y_test, y_hat_test) * 100

	recall_train_score = recall_score(y_train, y_hat_train) * 100
	recall_test_score = recall_score(y_test, y_hat_test) * 100

	f1_train_score = f1_score(y_train, y_hat_train) * 100
	f1_test_score = f1_score(y_test, y_hat_test) * 100

	auc_train_score = roc_auc_score(y_train, y_hat_train) * 100
	auc_test_score = roc_auc_score(y_test, y_hat_test) * 100

	print("Precision = {:.2f}% , recall = {:.2f}% and f1_score={:.2f}% of the % model on the training data.".format(precision_train_score, recall_train_score, f1_train_score, model_name))
	print("Precision = {:.2f}% , recall = {:.2f}% and f1_score={:.2f}% of the % model on the validation data.".format(precision_test_score, recall_test_score, f1_test_score, model_name))
	print("ROC_AUC Score = {:.2f}%  of the % model on the training data.".format(auc_train_score, model_name))
	print("ROC_AUC Score = {:.2f}%  of the % model on the validation data.".format(auc_test_score, model_name))


## Initial Baseline Implementation using KerasClassifier

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

scaler = StandardScaler()

num_cols = X_train.select_dtypes(['integer', 'float']).columns

X_train = pd.DataFrame(scaler.fit_transform(X_train[num_cols]), columns=num_cols)
X_test = pd.DataFrame(scaler.fit_transform(X_test[num_cols]), columns=num_cols)

def create_model(optimizer='adam', init='glorot_uniform', learning_rate=0.001):

	# create model
	model = Sequential()
	model.add(Dense(128, input_dim=100, kernel_initializer=init, activation='relu'))
	model.add(Dropout(0.2))
	model.add(Dense(64, kernel_initializer=init, activation='relu'))
	model.add(Dropout(0.2))
	#model.add(Dense(32, kernel_initializer=init, activation='relu'))
	#model.add(Dropout(0.2))
	model.add(Dense(16, kernel_initializer=init, activation='relu'))
	model.add(Dropout(0.2))	
	#model.add(Dense(8, kernel_initializer=init, activation='relu'))
	#model.add(Dropout(0.2))	
	model.add(Dense(1, kernel_initializer=init, activation='sigmoid'))

	# Compile model
	model.compile(loss='binary_crossentropy', optimizer=Adam(learning_rate=learning_rate), metrics=['accuracy'])
	return model

# create model
# KerasClassifier is deprecated, use scikeras instead
# https://github.com/adriangb/scikeras
# https://www.adriangb.com/scikeras/stable/quickstart.html#training-a-model
model = KerasClassifier(build_fn=create_model, verbose=0)

# grid search epochs, batch size and optimizer
optimizers = ['adam']
init = ['glorot_uniform']
epochs = [140]
batches = [2048]
learning_rate = [0.001]

param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init, learning_rate=learning_rate)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=4)
grid_result = grid.fit(X_train, y_train)

report_results('KerasClassifier', y_test, y_train, grid, grid_result)

  model = KerasClassifier(build_fn=create_model, verbose=0)
2021-11-29 13:46:57.925483: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-29 13:46:57.931056: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-29 13:46:57.931055: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To ena

Best: 0.743164 using {'batch_size': 2048, 'epochs': 140, 'init': 'glorot_uniform', 'learning_rate': 0.001, 'optimizer': 'adam'}
0.743164 (0.002188) with: {'batch_size': 2048, 'epochs': 140, 'init': 'glorot_uniform', 'learning_rate': 0.001, 'optimizer': 'adam'}
trian score: 0.7501955555555555
test score: 0.7453933333333334
Precision = 74.87% , recall = 76.19% and f1_score=75.53% of the % model on the training data.
Precision = 74.43% , recall = 75.73% and f1_score=75.07% of the % model on the validation data.
ROC_AUC Score = 75.01%  of the % model on the training data.
ROC_AUC Score = 74.52%  of the % model on the validation data.


In [14]:
# re-train with best parameter

X_train, y_train = X, y

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

scaler = StandardScaler()

num_cols = X_train.select_dtypes(['integer', 'float']).columns

X_train = pd.DataFrame(scaler.fit_transform(X_train[num_cols]), columns=num_cols)

# create model
model = KerasClassifier(build_fn=create_model, verbose=0)

optimizers = ['adam']
init = ['glorot_uniform']
epochs = [140]
batches = [2048]
learning_rate = [0.001]

param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# public score: 0.74687 (epoch=120) => 0.74720 (epoch=140)

  model = KerasClassifier(build_fn=create_model, verbose=0)
2021-11-29 13:58:12.805557: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-29 13:58:12.806410: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-29 13:58:12.807252: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To ena

In [22]:
# re-train with best parameter

X_train, y_train = X, y

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

scaler = StandardScaler()

num_cols = X_train.select_dtypes(['integer', 'float']).columns

X_train = pd.DataFrame(scaler.fit_transform(X_train[num_cols]), columns=num_cols)

# create model
model = KerasClassifier(build_fn=create_model, verbose=1)

optimizers = ['adam']
init = ['glorot_uniform']
epochs = [130]
batches = [2048]
learning_rate = [0.001]

param_grid = dict(optimizer=optimizers, epochs=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

report_results('KerasClassifier', y_test, y_train, grid, grid_result)

# public score: 0.74712

  model = KerasClassifier(build_fn=create_model, verbose=1)
2021-11-29 14:40:05.663738: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-29 14:40:05.663755: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-11-29 14:40:05.664624: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To ena

Epoch 1/130
Epoch 1/130
Epoch 1/130
Epoch 1/130
Epoch 1/130
Epoch 2/130
  1/235 [..............................] - ETA: 4s - loss: 0.6186 - accuracy: 0.6924Epoch 2/130
Epoch 2/130
Epoch 2/130
Epoch 2/130
Epoch 3/130
Epoch 3/130
Epoch 3/130
  1/235 [..............................] - ETA: 2s - loss: 0.5909 - accuracy: 0.7256Epoch 3/130
Epoch 3/130
Epoch 4/130
Epoch 4/130
Epoch 4/130
Epoch 4/130
Epoch 5/130
Epoch 5/130
Epoch 5/130
 21/235 [=>............................] - ETA: 2s - loss: 0.5775 - accuracy: 0.7380Epoch 5/130
Epoch 6/130
Epoch 6/130
 16/235 [=>............................] - ETA: 3s - loss: 0.5727 - accuracy: 0.7415Epoch 6/130
Epoch 6/130
Epoch 6/130
Epoch 7/130
Epoch 7/130
Epoch 7/130
Epoch 7/130
Epoch 7/130
Epoch 8/130
Epoch 8/130
Epoch 8/130
Epoch 8/130
Epoch 8/130
Epoch 9/130
Epoch 9/130
Epoch 9/130
Epoch 9/130
Epoch 9/130
Epoch 10/130
Epoch 10/130
Epoch 10/130
Epoch 10/130
Epoch 10/130
Epoch 11/130
Epoch 11/130
Epoch 11/130
Epoch 11/130
Epoch 11/130
Epoch 12/130
Epoch

## Kaggle Submission

In [23]:
data_test_norm = pd.DataFrame(scaler.transform(test_data[num_cols]), columns = num_cols)

test_predict = grid.predict_proba(data_test_norm)[::,1]
test_predict = test_predict.astype(float)
array = np.array(test_predict).tolist()
df = pd.DataFrame(test_data['id'])
df['id'] = df['id'].astype(int)
df['target'] = np.array(array)
df.to_csv('results/keras_results.csv', sep=',', encoding='utf-8', index=False)

In [24]:
%cd results/

!kaggle  competitions  submit -c tabular-playground-series-nov-2021 -f keras_results.csv -m "keras implementation"

%cd ..

/Users/kyle/Documents/github-data-research-team/kaggle-competitions/tabular-playground-series-nov-2021/results
100%|███████████████████████████████████████| 13.6M/13.6M [00:20<00:00, 700kB/s]
Successfully submitted to Tabular Playground Series - Nov 2021/Users/kyle/Documents/github-data-research-team/kaggle-competitions/tabular-playground-series-nov-2021


In [25]:
!kaggle competitions submissions -c tabular-playground-series-nov-2021 -q

fileName           date                 description                         status    publicScore  privateScore  
-----------------  -------------------  ----------------------------------  --------  -----------  ------------  
keras_results.csv  2021-11-30 00:08:30  keras implementation                complete  0.74712      None          
keras_results.csv  2021-11-29 22:37:15  keras implementation                complete  0.74526      None          
keras_results.csv  2021-11-29 22:19:13  keras implementation                complete  0.74720      None          
keras_results.csv  2021-11-28 23:45:05  keras implementation                complete  0.74584      None          
keras_results.csv  2021-11-28 23:07:15  keras implementation                complete  0.74687      None          
keras_results.csv  2021-11-28 05:52:28  keras implementation                complete  0.74629      None          
keras_results.csv  2021-11-28 03:51:50  keras implementation                complete  0.

## 