<a href="https://colab.research.google.com/github/devyulbae/Kaggle/blob/main/Kag_BinClass_for_Bank_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# readme
# for Kaggle competition
# name: Binary Classification with a Bank Churn Dataset
# period: 02.Jan.2024 ~ 01.Feb.2024
#
#
#

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

## Explore Dataset

In [None]:
# Runtime switch: set to False when running outside Google Colab.
use_colab = True
assert use_colab in [True, False]

# Mount Google Drive only when running on Colab. The `google.colab` package is
# a Colab-specific module, so importing it unconditionally would make the
# switch above pointless (the cell would fail before the flag is ever read).
if use_colab:
    from google.colab import drive
    drive.mount("/content/drive/")

In [None]:
# The save point: directory where model checkpoints are written.
if use_colab:
    # Absolute path under the Drive mount point ("/content/drive/"), so the
    # location does not depend on the notebook's current working directory.
    checkpoint_dir = '/content/drive/MyDrive/train_ckpt/bankChurn/exp1'
else:
    checkpoint_dir = 'bankChurn/exp1'

# Create the directory in both modes; exist_ok keeps the cell safe to re-run.
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
# Load the train and test CSV files from the mounted Drive folder.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/datas/bank_churn_train.csv",
        "/content/drive/MyDrive/datas/bank_churn_test.csv",
    )
)

# Preview the first five training rows.
train.head(5)

### The dataset provides the following columns:
  id: A serial number assigned to each row

  CustomerId: A unique identifier for each customer

  Surname: The customer's surname or last name

  CreditScore: A numerical value representing the customer's credit score

  Geography: The country where the customer resides (France, Spain or Germany)

  Gender: The customer's gender (Male or Female)

  Age: The customer's age.

  Tenure: The number of years the customer has been with the bank

  Balance: The customer's account balance

  NumOfProducts: The number of bank products the customer uses (e.g., savings account, credit card)

  HasCrCard: Whether the customer has a credit card (1 = yes, 0 = no)

  IsActiveMember: Whether the customer is an active member (1 = yes, 0 = no)

  EstimatedSalary: The estimated salary of the customer

  Exited: Whether the customer has churned (1 = yes, 0 = no)

In [None]:
# Report the training frame's dimensions, then its per-column dtypes.
print(f"Shape:  {train.shape}")
print(train.dtypes)

In [None]:
# Report the test frame's dimensions, then its per-column dtypes.
print(f"Shape:  {test.shape}")
print(test.dtypes)

## Casting Numeric Columns to Integers

In [None]:
# Cast Age and HasCrCard to 64-bit integers in both frames.
int_casts = {'Age': 'int64', 'HasCrCard': 'int64'}
train = train.astype(int_casts)
test = test.astype(int_casts)

## Data Visualization

In [None]:
# use sweetviz
!pip install sweetviz

In [None]:
import sweetviz as sv
# Analyze the training frame as it stands at this point and write the
# resulting report to 'sweetviz_report.html'.
sweetviz_report = sv.analyze(train)
sweetviz_report.show_html('sweetviz_report.html')

In [None]:
# Count rows whose Balance is exactly zero versus rows where it is not.
is_zero_balance = train['Balance'] == 0
zeroval = is_zero_balance.sum()
print("zeroval =", zeroval)
nonzeroval = (~is_zero_balance).sum()
print("nonzeroval", nonzeroval)

In [None]:
# Handling the Gender column: start by checking how many distinct values it holds.
train['Gender'].nunique()

In [None]:
# First three Gender values, shown before the column is label-encoded.
train['Gender'].head(3)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training column only and reuse that exact mapping for
# the test column. Re-fitting on test would derive a separate mapping from
# whatever values happen to appear there; transform() instead raises if test
# contains a value that was never seen in train.
label_encoder = LabelEncoder()
train['Gender'] = label_encoder.fit_transform(train['Gender'])   # female -> 0, male -> 1
test['Gender'] = label_encoder.transform(test['Gender'])         # same mapping as train

In [None]:
# First three Gender values again, now after label encoding.
train['Gender'].head(3)

In [None]:
# Per-column dtypes of the training frame after encoding Gender.
train.dtypes

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Geography is converted from object to int in the next cell; first look at
# how often each value occurs.
train['Geography'].value_counts()

In [None]:
# Fit the Geography encoder on the training column only and apply the same
# mapping to the test column. Re-fitting on test would assign codes based on
# the values present in test alone, which can disagree with the train codes.
label_encoder = LabelEncoder()
train['Geography'] = label_encoder.fit_transform(train['Geography'])
test['Geography'] = label_encoder.transform(test['Geography'])

# France : 0 / Germany : 1 / Spain : 2
sns.countplot(data=train, x='Geography')
plt.show()

In [None]:
# Cast IsActiveMember to int, round EstimatedSalary and Balance to whole
# numbers, then drop the Surname, id and CustomerId columns.
train = train.assign(
    IsActiveMember=train['IsActiveMember'].astype(int),
    EstimatedSalary=train['EstimatedSalary'].round().astype(int),
    Balance=train['Balance'].round().astype(int),
).drop(columns=['Surname', 'id', 'CustomerId'])

In [None]:
# Same casts and rounding as for the training frame. Only Surname and
# CustomerId are dropped here; 'id' stays in the test frame.
test = test.assign(
    IsActiveMember=test['IsActiveMember'].astype(int),
    EstimatedSalary=test['EstimatedSalary'].round().astype(int),
    Balance=test['Balance'].round().astype(int),
).drop(columns=['Surname', 'CustomerId'])

In [None]:
# Per-column dtypes of the training frame after the casts and column drops.
train.dtypes

In [None]:
# Re-run the sweetviz analysis on the training frame as it stands now.
# NOTE(review): this writes to the same 'sweetviz_report.html' path used by the
# earlier report cell, so that first report file is overwritten — confirm this
# is intended.
changed_rp = sv.analyze(train)
changed_rp.show_html('sweetviz_report.html')

In [None]:
# First five training rows before the min-max scaling applied in the next cell.
train.head(5)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the three continuous columns. Each column's scaler is fitted
# on the training data only and then applied unchanged to the test data, so
# both frames share one scale. Fitting a second time on test would map the
# same raw value to different scaled values in train and test.
# Test values outside the training range will fall outside [0, 1]; that is
# expected and preferable to re-fitting.
for column in ('CreditScore', 'Balance', 'EstimatedSalary'):
    scaler = MinMaxScaler()
    train[column] = scaler.fit_transform(train[[column]]).ravel()
    test[column] = scaler.transform(test[[column]]).ravel()

In [None]:
# First five training rows after min-max scaling.
train.head(5)

## Building Our Models

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
from sklearn.model_selection import train_test_split

# Target is the 'Exited' column; the features are every other column.
y = train['Exited']
X = train.drop(columns='Exited')

# Hold out 10% of the labelled rows, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Show feature/label shapes for the training part, then the held-out part.
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(split_X.shape, split_y.shape)

In [None]:
batch_size = 16

# Training pipeline. The shuffle buffer spans the whole split: a fixed buffer
# of 10000 only shuffles uniformly when the split has at most 10000 rows.
# repeat() makes the stream endless, so model.fit needs steps_per_epoch.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_dataset = (
    train_dataset
    .shuffle(buffer_size=len(X_train))
    .repeat()
    .batch(batch_size=batch_size, drop_remainder=True)
)

# Held-out pipeline. Row order does not affect evaluation metrics, so it is
# not shuffled. repeat() is kept because model.fit is called with an explicit
# validation_steps and must not run out of batches.
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_dataset = test_dataset.repeat().batch(batch_size=batch_size, drop_remainder=True)



In [None]:
def mish(x):
    """Mish activation: ``x * tanh(softplus(x))``."""
    softplus_x = keras.backend.softplus(x)
    return x * keras.backend.tanh(softplus_x)


# Fully connected stack: every hidden Dense layer is followed by a Mish
# activation. The final single-unit Dense layer has no activation.
model = keras.Sequential()
for hidden_units in (10, 16, 32, 64, 64, 128, 128):
    model.add(layers.Dense(hidden_units))
    model.add(layers.Activation(mish))

model.add(layers.Dense(1))

In [None]:
def accuracy(y_true, y_pred):
    """Binary accuracy for a model that outputs logits.

    The string shortcut 'accuracy' thresholds the raw model output at 0.5,
    which is only correct for probabilities. The model's last layer is a bare
    Dense(1) trained with from_logits=True, so the output is first squashed
    through a sigmoid. The function is named `accuracy` so the history keys
    stay 'accuracy' / 'val_accuracy'.
    """
    return tf.keras.metrics.binary_accuracy(y_true, tf.sigmoid(y_pred))


model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=[accuracy])

In [None]:
# Save the best weights to a file *inside* checkpoint_dir. Passing the
# directory itself as the file path is treated as a checkpoint prefix (files
# land beside the directory), and newer Keras releases require a weights path
# ending in '.weights.h5'.
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, 'best.weights.h5')

# Overwrite the checkpoint only when val_loss improves.
cp_callback = tf.keras.callbacks.ModelCheckpoint(checkpoint_path,
                                                 save_weights_only=True,
                                                 monitor='val_loss',
                                                 mode='auto',
                                                 save_best_only=True,
                                                 verbose=1)

# Stop after 3 epochs without val_loss improvement and roll back to the best
# weights seen so far.
early_stop = tf.keras.callbacks.EarlyStopping(patience=3,
                                              monitor='val_loss',
                                              restore_best_weights=True,
                                              verbose=1)

In [None]:
max_epochs = 15

# Both tf.data pipelines repeat endlessly, so these step counts define what
# "one pass" means. They must come from the splits that feed the pipelines
# (X_train / X_test). `train` is the full labelled frame and `test` is the
# unlabelled competition frame, so using their lengths makes each epoch and
# each validation run cover more than one pass over the data.
history = model.fit(train_dataset, epochs=max_epochs,
                    steps_per_epoch=len(X_train) // batch_size,
                    validation_data=test_dataset,
                    validation_steps=len(X_test) // batch_size,
                    callbacks=[cp_callback, early_stop])

In [None]:
# Evaluate on the held-out split produced by train_test_split (this is not the
# submission step). Index 1 of the result is the metric passed to compile();
# index 0 is the loss.
eval_result = model.evaluate(X_test, y_test)
print("Test Accuracy:", eval_result[1])

In [None]:
import matplotlib.pyplot as plt


# Plot training and validation accuracy per epoch. The curves come from the
# 'accuracy' / 'val_accuracy' history entries, so the y-axis is accuracy,
# not loss.
plt.plot(history.history['accuracy'], label='training accuracy')
plt.plot(history.history['val_accuracy'], label='Validation accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

## Submission

In [None]:
# Set the row ids aside for the submission file and remove them from the
# feature frame in one step (pop mutates `test` in place).
ids = test.pop('id')

In [None]:
# The model ends in a bare Dense(1) and is trained with from_logits=True, so
# predict() returns raw logits. Convert them to probabilities in [0, 1] and
# flatten the (n, 1) output to one value per row.
test_logits = model.predict(test)
test_predictions = tf.sigmoid(test_logits).numpy().ravel()

In [None]:
# Assemble the submission table: row ids first, then the model's predictions.
submission = pd.DataFrame({'id': ids})
submission['Exited'] = test_predictions

In [None]:
# Preview the first five submission rows before writing the file.
submission.head(5)

In [None]:
# Write the submission to 'submission.csv'; index=False leaves the DataFrame
# index out of the file.
submission.to_csv('submission.csv', index = False)