In [None]:
import keras_tuner
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from tensorflow.keras import layers, metrics, models

In [None]:
# Load the raw training table and separate the 'Exited' target from the features.
dataset = pd.read_csv('../data/raw/train.csv')

y = dataset['Exited']
X = dataset.drop(columns=['Exited'])

# 70% of the rows go to training; the remaining 30% is halved into test and
# validation sets. Both splits are stratified on the target.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
    stratify=y,
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

X_train

### Data preprocessing pipeline

In [None]:
# Column groups routed to each transformer.
scaled_columns = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
untouched_columns = ['HasCrCard', 'IsActiveMember']

# One-hot encode Geography, ordinal-encode Gender, standardise the numeric
# columns and forward the two binary flags unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('geography_encoder', OneHotEncoder(), ['Geography']),
        ('gender_encoder', OrdinalEncoder(), ['Gender']),
        ('std_encoder', StandardScaler(), scaled_columns),
        ('passthrough', 'passthrough', untouched_columns),
    ]
)

In [None]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the validation and test splits.
X_train_clean = preprocessor.fit_transform(X_train)
X_val_clean, X_test_clean = (
    preprocessor.transform(split) for split in (X_val, X_test)
)

### Model creation

In [None]:
def build_model(hp):
    """Build and compile one candidate binary classifier for the tuner.

    The network is a stack of six Dense hidden layers followed by a single
    sigmoid output unit. Each hidden layer's width (``units_<i>``) and
    activation (``activation_<i>``) are hyperparameters; the layer at index 3
    additionally tunes its kernel regularizer (``kernel_regularizer_3``).
    The optimizer (``optimizer``) is tuned as well.

    Args:
        hp: Hyperparameter container supplied by the tuner; only its
            ``Choice`` method is used here.

    Returns:
        The compiled ``Sequential`` model, trained with binary cross-entropy
        and reporting AUC under the metric name ``auc``.
    """
    unit_options = [16, 32, 64, 128, 256]
    activation_options = ['relu', 'tanh', 'sigmoid']
    n_hidden_layers = 6
    regularized_layer = 3

    hidden_layers = []
    for index in range(n_hidden_layers):
        # Hyperparameters are registered in the order units -> activation
        # (-> regularizer) for each layer.
        dense_kwargs = {
            'units': hp.Choice(f'units_{index}', values=unit_options),
            'activation': hp.Choice(f'activation_{index}', values=activation_options),
        }
        if index == regularized_layer:
            dense_kwargs['kernel_regularizer'] = hp.Choice(
                f'kernel_regularizer_{index}', values=['l1', 'l2', 'l1_l2']
            )
        hidden_layers.append(layers.Dense(**dense_kwargs))

    model = models.Sequential(
        layers=hidden_layers + [layers.Dense(1, activation='sigmoid')]
    )

    model.compile(
        optimizer=hp.Choice('optimizer', values=['adam', 'sgd']),
        loss='binary_crossentropy',
        # `metrics` is passed as a list, and the metric is named explicitly so
        # the logged keys are always 'auc' / 'val_auc', which the tuner
        # objective looks up.
        metrics=[metrics.AUC(name='auc')],
    )

    return model

In [None]:
# Hyperband search over build_model; trials are ranked by validation AUC
# (higher is better).
auc_objective = keras_tuner.Objective('val_auc', direction='max')

tuner = keras_tuner.tuners.Hyperband(
    hypermodel=build_model,
    objective=auc_objective,
    max_epochs=50,
    seed=42,
    directory='../models/',
    project_name='bank_churn_binary_classification',
    overwrite=True,
)

In [None]:
# Run the search: each trial is fitted on the training split and evaluated on
# the validation split.
validation_split = (X_val_clean, y_val)
tuner.search(
    X_train_clean,
    y_train,
    batch_size=512,
    validation_data=validation_split,
)

### Model validation

In [None]:
# Retrieve the top-ranked model from the search and score the validation split.
top_models = tuner.get_best_models(1)
best_model = top_models[0]
y_val_predicted = best_model.predict(X_val_clean)

In [None]:
# ROC points on the validation split; the chosen cut-off is the one that
# maximises TPR * (1 - FPR).
fpr, tpr, thresholds = roc_curve(y_val, y_val_predicted)
best_index = np.argmax(tpr * (1 - fpr))
threshold = thresholds[best_index]

In [None]:
# Error rates as a function of the decision threshold, with the selected
# threshold marked. The threshold goes on the x-axis so it matches the axis
# label and the vertical marker line. The first roc_curve threshold is a
# sentinel above every score (infinite in recent scikit-learn), so it is left
# out of the plot.
fig, ax = plt.subplots()
ax.plot(thresholds[1:], fpr[1:], label='FPR')
ax.plot(thresholds[1:], 1 - tpr[1:], label='1 - TPR')
ax.plot([threshold, threshold], [0, 1], label='Optimal threshold')
ax.set_xlabel('Threshold')
ax.set_ylabel('Rate')
ax.set_title('FPR and FNR vs. decision threshold')
ax.legend()
plt.show()

In [None]:
# ROC AUC is a ranking metric, so it is computed on the predicted probabilities.
# Scoring the thresholded 0/1 labels would reduce the curve to a single
# operating point and understate the model's AUC.
y_test_scores = best_model.predict(X_test_clean)
y_test_predicted = (y_test_scores > threshold).astype(int)
auc_score_test = roc_auc_score(y_test, y_test_scores.ravel())
print(f'ROC AUC Score: {auc_score_test}')

### Export solution

In [None]:
# Score the held-out submission file with the fitted preprocessor and the tuned
# model, then write the predictions into the sample-submission layout.
submission_dataset = pd.read_csv('../data/raw/sample_submission.csv')
X_submission = pd.read_csv('../data/raw/test.csv')
X_submission_clean = preprocessor.transform(X_submission)

# The single-unit output layer makes predict() return an (n, 1) array; flatten
# it so a plain 1-D vector is assigned to the column.
y_submission_predicted = (
    best_model.predict(X_submission_clean).ravel() > threshold
).astype(int)

# NOTE(review): hard 0/1 labels are written here. If the submission is scored
# on ROC AUC, the raw probabilities would be the appropriate output — confirm.
submission_dataset['Exited'] = y_submission_predicted

file_name = 'submission'
submission_dataset.to_csv(f'../data/processed/{file_name}.csv', index=False)

print(f'File {file_name} saved')