<a href="https://colab.research.google.com/github/fabriziobasso/kaggle/blob/main/Models_dnn_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **S4E1 BANK CHURN**

##### About Dataset
The bank customer churn dataset is a commonly used dataset for predicting customer churn in the banking industry. It contains information on bank customers who either left the bank or continue to be a customer. The dataset includes the following attributes:

* Customer ID: A unique identifier for each customer
* Surname: The customer's surname or last name
* Credit Score: A numerical value representing the customer's credit score
* Geography: The country where the customer resides (France, Spain or Germany)
* Gender: The customer's gender (Male or Female)
* Age: The customer's age.
* Tenure: The number of years the customer has been with the bank
* Balance: The customer's account balance
* NumOfProducts: The number of bank products the customer uses (e.g., savings account, credit card)
* HasCrCard: Whether the customer has a credit card (1 = yes, 0 = no)
* IsActiveMember: Whether the customer is an active member (1 = yes, 0 = no)
* EstimatedSalary: The estimated salary of the customer
* Exited: Whether the customer has churned (1 = yes, 0 = no)

##### Evaluation
Submissions are evaluated on area under the ROC curve between the predicted probability and the observed target.


The submitted probabilities for a given row are not required to sum to one because they are rescaled prior to being scored (each row is divided by the row sum). In order to avoid the extremes of the log function, predicted probabilities are replaced with $\max(\min(p,\ 1 - 10^{-15}),\ 10^{-15})$.

#### **Files:**
* train.csv - the training dataset; Exited is the binary target
* test.csv - the test dataset; your objective is to predict the probability of Exited
* sample_submission.csv - a sample submission file in the correct format
* Churn_Modelling.csv - Original Dataset

## 1.0 Workbook Set-up and Libraries:

#### 1.0 Libraries

In [1]:
%%capture
!pip install tensorflow-addons
#!pip install shap
#!pip install eli5
#!pip install tf-nightly
#!pip install -U scikit-learn==1.2.0
!pip install catboost
#!pip install haversine
#!pip install pytorch-forecasting
!pip install umap-learn
#!pip install reverse_geocoder
#!pip install --upgrade protobuf
!pip install colorama
!pip install imbalanced-learn
!pip install optuna
!pip install optuna-integration
#!pip install pygam
!pip install keras-tuner --upgrade
#!pip install pycaret
#!pip install lightning==2.0.1
!pip install keras-nlp
#!pip install MiniSom
!pip install BorutaShap

In [2]:
#importing modules

import warnings
warnings.filterwarnings('ignore')
import time
t = time.time()

print('Importing started...')

# basic modules
import os
import numpy as np
import pandas as pd
import re
#from scipy import stats
from random import randint
import random
import math
import os
import gc
import pickle
from glob import glob
from IPython import display as ipd
from tqdm import tqdm
from datetime import datetime
from joblib import dump, load
import sklearn as sk
from imblearn.over_sampling import SMOTE, RandomOverSampler
from functools import partial
import itertools
import joblib
from itertools import combinations
import IPython
import statsmodels.api as sm
import IPython.display
from IPython.display import clear_output
from prettytable import PrettyTable

# visualization modules
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from matplotlib_venn import venn2_unweighted
import seaborn as sns
import missingno as msno
import imblearn
import scipy.stats as stats
from scipy.special import boxcox, boxcox1p


# Palette Setup
# Module-level colour constants reused by the plotting cells of this notebook.
colors = ['#FB5B68','#FFEB48','#2676A1','#FFBDB0',]
# Continuous colormap interpolated across the four custom colours above.
colormap_0 = mpl.colors.LinearSegmentedColormap.from_list("",colors)
# palette_1 .. palette_6 are requested as colormaps (as_cmap=True);
# palette_7 .. palette_9 are requested as discrete palettes (as_cmap=False).
palette_1 = sns.color_palette("coolwarm", as_cmap=True)
palette_2 = sns.color_palette("YlOrBr", as_cmap=True)
palette_3 = sns.light_palette("red", as_cmap=True)
palette_4 = sns.color_palette("viridis", as_cmap=True)
palette_5 = sns.color_palette("rocket", as_cmap=True)
palette_6 = sns.color_palette("GnBu", as_cmap=True)
palette_7 = sns.color_palette("tab20c", as_cmap=False)
palette_8 = sns.color_palette("Set2", as_cmap=False)

palette_custom = ['#fbb4ae','#b3cde3','#ccebc5','#decbe4','#fed9a6','#ffffcc','#e5d8bd','#fddaec','#f2f2f2']
palette_9 = sns.color_palette(palette_custom, as_cmap=False)

# Global seaborn style: white background with a thin dashed grid.
sns.set_style("whitegrid",{"grid.linestyle":"--", 'grid.linewidth':0.2, 'grid.alpha':0.5})
#sns.set_theme(style="ticks", context="notebook")
# NOTE(review): no axes exist yet when this runs; the cell output shows an empty
# "<Figure size 640x480 with 0 Axes>", presumably created by this call - confirm
# whether it is still needed given the rcParams below.
sns.despine(left=True, bottom=True, top=False, right=False)

# Default spine visibility for every figure created afterwards: keep left/bottom, hide right/top.
mpl.rcParams['axes.spines.left'] = True
mpl.rcParams['axes.spines.right'] = False
mpl.rcParams['axes.spines.top'] = False
mpl.rcParams['axes.spines.bottom'] = True

# Style Import
from colorama import Style, Fore
red = Style.BRIGHT + Fore.RED
blu = Style.BRIGHT + Fore.BLUE
mgt = Style.BRIGHT + Fore.MAGENTA
gld = Style.BRIGHT + Fore.YELLOW
res = Style.RESET_ALL

# preprocessing modules
from sklearn.model_selection import (train_test_split,
                                     KFold,
                                     StratifiedKFold,
                                     cross_val_score,
                                     GroupKFold,
                                     GridSearchCV,
                                     RepeatedStratifiedKFold)

from sklearn.preprocessing import (LabelEncoder,
                                   StandardScaler,
                                   MinMaxScaler,
                                   OrdinalEncoder,
                                   RobustScaler,
                                   PowerTransformer,
                                   OneHotEncoder,
                                   LabelEncoder,
                                   OrdinalEncoder,
                                   QuantileTransformer,
                                   PolynomialFeatures)

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.compose import ColumnTransformer

from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import FunctionTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


# metrics
from sklearn.metrics import (mean_squared_error,
                             r2_score,
                             mean_absolute_error,
                             mean_absolute_percentage_error,
                             classification_report,
                             confusion_matrix,
                             ConfusionMatrixDisplay,
                             multilabel_confusion_matrix,
                             accuracy_score,
                             roc_auc_score,
                             auc,
                             roc_curve,
                             log_loss)


# modeling algos
from sklearn.linear_model import (LogisticRegression,
                                  Lasso,
                                  ridge_regression,
                                  LinearRegression,
                                  Ridge,
                                  RidgeCV,
                                  ElasticNet,
                                  BayesianRidge,
                                  TweedieRegressor,
                                  ARDRegression,
                                  PoissonRegressor,
                                  GammaRegressor)

from sklearn.neighbors import KNeighborsRegressor

from sklearn.tree import DecisionTreeRegressor
from sklearn.isotonic import IsotonicRegression

from sklearn.ensemble import (AdaBoostRegressor,
                              RandomForestRegressor,
                              RandomForestClassifier,
                              VotingRegressor,
                              GradientBoostingRegressor,
                              GradientBoostingClassifier,
                              StackingRegressor,
                              HistGradientBoostingClassifier,
                              ExtraTreesClassifier)

from sklearn.base import BaseEstimator, TransformerMixin

# Other Models
#from pygam import LogisticGAM, s, te
import xgboost as xgb
from xgboost import XGBRegressor, XGBClassifier
import lightgbm as lgb
from lightgbm import (LGBMRegressor,
                      LGBMClassifier,
                      early_stopping,
                      record_evaluation,
                      log_evaluation)

#import catboost as cat
from catboost import CatBoost, CatBoostRegressor
from catboost import CatBoostClassifier

#from catboost.utils import get_roc_curve

from lightgbm import early_stopping
# check installed version
#import pycaret
warnings.filterwarnings("ignore")
#from minisom import MiniSom

from sklearn.base import clone ## sklearn base models for stacked ensemble model
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay

# Interpretability of the model
#import shap
#import eli5
#from eli5.sklearn import PermutationImportance


## miss
from sklearn.pipeline import (make_pipeline,
                              Pipeline)


import tensorflow as tf
# Only the TensorFlow backend supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
from keras import layers
import tensorflow_addons as tfa
from keras.utils import FeatureSpace
import keras_nlp

# Import libraries for Hypertuning
import kerastuner as kt
from kerastuner.tuners import RandomSearch, GridSearch, BayesianOptimization
# Model Tuning tools:
import optuna
from optuna.integration import TFKerasPruningCallback
from optuna.integration import LightGBMPruningCallback, XGBoostPruningCallback
from optuna.trial import TrialState
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_contour
# Feature selection
from BorutaShap import BorutaShap
%matplotlib inline
SEED = 1984
N_SPLITS = 10

print('Done, All the required modules are imported. Time elapsed: {} sec'.format(time.time()-t))

Importing started...
Using TensorFlow backend
Done, All the required modules are imported. Time elapsed: 9.67065691947937 sec


<Figure size 640x480 with 0 Axes>

In [3]:
# Check Versions:
print("CHECK VERSIONS:")
print(f"sns: {sns.__version__}")
print(f"mpl: {mpl.__version__}")
print(f"tensorflow: {tf.__version__}")
print(f"pandas: {pd.__version__}")
print(f"numpy: {np.__version__}")
print(f"scikit-learn: {sk.__version__}")
print(f"statsmodels: {sm.__version__}")
print(f"missingno: {msno.__version__}")
#print(f"TF-addon: {tfa.__version__}")
print(f"Inbalance_Learning: {imblearn.__version__}")
print(f"XGBoost: {xgb.__version__}")
#print(f"CatBoost: {cat.__version__}")
#print(f"PyCaret: {pycaret.__version__}")

CHECK VERSIONS:
sns: 0.13.1
mpl: 3.7.1
tensorflow: 2.15.0
pandas: 1.5.3
numpy: 1.23.5
scikit-learn: 1.2.2
statsmodels: 0.14.1
missingno: 0.5.2
Inbalance_Learning: 0.10.1
XGBoost: 2.0.3


In [4]:
def seed_everything(seed,
                    tensorflow_init=True,
                    pytorch_init=True):
    """
    Seeds basic parameters for reproducibility of results.

    Parameters
    ----------
    `seed`: integer seed applied to every random number generator below
    `tensorflow_init`: when truthy, also seeds TensorFlow (`tf.random.set_seed`)
    `pytorch_init`: when truthy, also seeds PyTorch (CPU and CUDA) and switches
                    cuDNN to deterministic, non-benchmark mode

    Returns none.

    Raises
    ------
    ImportError: if `pytorch_init` is truthy and PyTorch is not installed.
    """
    random.seed(seed)
    # Exported for any child process started later; the running interpreter
    # reads PYTHONHASHSEED only at start-up.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    if tensorflow_init:
        tf.random.set_seed(seed)
    if pytorch_init:
        # torch is not imported at file level, so it is imported lazily here;
        # the default pytorch_init=True previously failed with a NameError.
        import torch

        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


# pytorch_init=False skips the torch branch (torch is not among this file's imports).
# NOTE(review): the imports cell defines SEED = 1984, but the literal 42 is used here - confirm which seed is intended.
seed_everything(42,tensorflow_init=True,pytorch_init=False)

### **1.1 Utility Functions**

#### Graph Functions:

In [5]:
def summary(df):
    """
    Prints the shape of `df` and builds a one-row-per-column overview table.

    Parameters
    ----------
    `df`: pandas DataFrame to summarise

    Returns a DataFrame indexed by the column names of `df` with the columns
    'data type', '#missing', '%missing', '#unique', 'min', 'max', 'median'
    and 'mean'. Statistics that `describe` does not produce for the given
    frame (e.g. min/max/mean when no column is numeric) are returned as NaN.
    """
    print(f'data shape: {df.shape}')
    missing_counts = df.isnull().sum().values

    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = missing_counts
    summ['%missing'] = missing_counts / len(df) * 100
    summ['#unique'] = df.nunique().values

    # describe(include='all') only emits min/max/50%/mean when at least one
    # column is numeric; reindexing guarantees the four columns always exist
    # (filled with NaN) instead of raising KeyError.
    desc = df.describe(include='all').transpose()
    desc = desc.reindex(columns=['min', 'max', '50%', 'mean'])
    summ['min'] = desc['min'].values
    summ['max'] = desc['max'].values
    summ['median'] = desc['50%'].values
    summ['mean'] = desc['mean'].values
    return summ

def plot_confusion_matrix(y_true, y_pred, labels):
    """
    Draws three heatmaps side by side for one set of predictions:
        1. the raw confusion matrix
        2. the same matrix scaled column-wise (precision view)
        3. the same matrix scaled row-wise (recall view)

    The column sums of the precision matrix and the row sums of the recall
    matrix are printed before plotting.

    Parameters
    ----------
    `y_true`: ground truth (or actual) values
    `y_pred`: predicted values
    `labels`: integer encoded target values

    Returns none.
    """
    counts = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=labels)

    # Each column is divided by its own total.
    precision = counts / counts.sum(axis=0)
    print("Column sum of precision matrix: {}".format(precision.sum(axis=0)))

    # Each row is divided by its own total.
    recall = counts / counts.sum(axis=1, keepdims=True)
    print("Row sum of recall matrix:       {}".format(recall.sum(axis=1)))

    plt.figure(figsize=(15, 3))
    panels = (
        (131, counts, 'Confusion Matrix'),
        (132, precision, 'Precision Matrix'),
        (133, recall, 'Recall Matrix'),
    )
    for position, values, heading in panels:
        plt.subplot(position)
        plot_heatmap(matrix=values, title=heading, labels=labels)
    plt.show()

def plot_heatmap(matrix, title, labels):
    """
    Renders `matrix` as an annotated seaborn heatmap on the current axes.

    Parameters
    ----------
    `matrix`: 2D array
    `title`: title shown above the heatmap
    `labels`: integer encoded target values, used as tick labels on both axes

    Returns none.
    """
    heatmap_options = dict(
        annot=True,       # write the value inside every cell
        fmt='.2f',        # two decimals per annotation
        linewidths=0.1,
        xticklabels=labels,
        yticklabels=labels,
    )
    sns.heatmap(data=matrix, **heatmap_options)
    plt.xlabel('Predicted Class')
    plt.ylabel('Actual Class')
    plt.title(title, fontsize=10)

#### NN Functions:

In [6]:
def encode_inputs(inputs, list_categorical_nn, Cat_Feat_Entries, edense=4, num_dense_exp=True, embedding_dims_det=None, name="enc"):
    """
    Splits the model inputs into encoded categorical and numerical branches.

    Parameters
    ----------
    `inputs`: mapping from feature name to input tensor
    `list_categorical_nn`: names of the features to embed
    `Cat_Feat_Entries`: mapping from categorical feature name to its vocabulary
                        (its length is used as the embedding input dimension)
    `edense`: width of the per-feature Dense layers; also the embedding width
              when `embedding_dims_det` is None
    `num_dense_exp`: when truthy, every feature is passed through its own
                     Dense layer (categorical ones are then reshaped to `edense`)
    `embedding_dims_det`: optional mapping from feature name to embedding width
    `name`: suffix appended to the layer names

    Returns a tuple (categorical tensors, numerical tensors), each a list in
    the iteration order of `inputs`.
    """
    cat_branches, num_branches = [], []

    for feature_name in inputs:
        if feature_name not in list_categorical_nn:
            # Numerical features are used as-is unless a Dense expansion is requested.
            branch = inputs[feature_name]
            if num_dense_exp:
                branch = tf.keras.layers.Dense(edense, name=f"num_dense_{feature_name}_{name}")(branch)
            num_branches.append(branch)
            continue

        vocabulary = Cat_Feat_Entries[feature_name]
        ed = edense if embedding_dims_det is None else embedding_dims_det[feature_name]

        # Convert the index values to embedding representations.
        embedder = layers.Embedding(input_dim=len(vocabulary), output_dim=ed)
        branch = embedder(inputs[feature_name])
        if num_dense_exp:
            branch = tf.keras.layers.Dense(edense, name=f"cat_dense_{feature_name}_{name}")(branch)
            branch = keras.layers.Reshape((edense,), name=f"cat_reshape_{feature_name}_{name}")(branch)
        cat_branches.append(branch)

    return cat_branches, num_branches

### **1.2 Connect Drives**

In [7]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


Connect to Google Drive:

In [8]:
%%capture
# Connect to Colab:
from google.colab import drive
drive.mount('/content/drive')

In [9]:
# Working folders on the mounted Google Drive.
folder_data = "/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/S4E1_BankChurn"
models_folders = "/content/drive/MyDrive/Exercises/Studies_Structured_Data/Models/S4E1_BankChurn"
folders_nn = "/content/drive/MyDrive/Exercises/Studies_Structured_Data/Models/S4E1_BankChurn/neural_networks/"
folders_trees = "/content/drive/MyDrive/Exercises/Studies_Structured_Data/Models/S4E1_BankChurn/trees_models/"

list_directories = [folder_data,models_folders,folders_nn,folders_trees]

for path in list_directories:
  try:
      os.mkdir(path)
  except FileExistsError:
      # Only a pre-existing folder is expected; any other OSError (missing
      # parent, permissions, ...) now propagates instead of being reported
      # as "already exists".
      print(f"{path} already exists")


# Relative file names used by the data-loading cells resolve against this folder.
os.chdir(folder_data)

/content/drive/MyDrive/Exercises/Studies_Structured_Data/Data/S4E1_BankChurn already exists
/content/drive/MyDrive/Exercises/Studies_Structured_Data/Models/S4E1_BankChurn already exists
/content/drive/MyDrive/Exercises/Studies_Structured_Data/Models/S4E1_BankChurn/neural_networks/ already exists
/content/drive/MyDrive/Exercises/Studies_Structured_Data/Models/S4E1_BankChurn/trees_models/ already exists


## 2.0 Create Datasets

In [10]:
# Load the input tables; paths are relative, so they resolve against the
# current working directory. The first CSV column is used as the index.
#train = pd.read_csv('new_train_dnn_small_norm_cat.csv',index_col=0)
train = pd.read_csv('new_train_dnn_boruta_norm_cat.csv',index_col=0)
#old_train = pd.read_csv("Churn_Modelling.csv")
#test = pd.read_csv("new_test_dnn_small_norm_cat.csv",index_col=0)
test = pd.read_csv("new_test_dnn_boruta_norm_cat.csv",index_col=0)

# NOTE(review): `ensemble` and `duplicate_results_test_df` are loaded here but
# not referenced in the cells visible in this section - confirm they are used later.
ensemble = pd.read_csv("ensemble_new_data.csv",index_col=0)

duplicate_results_test_df = pd.read_csv("known_test_targets.csv",index_col=0)
sample_submission = pd.read_csv('sample_submission.csv',index_col=0)

# Drop column id
#train.drop('id',axis=1,inplace=True)
#test.drop('id',axis=1,inplace=True)
#old_train.dropna(inplace=True,axis=0)
#old_train.rename({"RowNumber":"id"},axis=1,inplace=True)
#old_train.set_index("id", inplace=True)

In [11]:
print("TRAIN DATA shape: {}".format(train.shape))
display(train.head(3))
#print("OLD-TRAIN DATA: {}".format(old_train.shape))
#display(old_train.head(3))
print("TEST DATA: {}".format(test.shape))
display(test.head(3))

TRAIN DATA shape: (165034, 44)


Unnamed: 0,Age_Category_enc,bs_nop_enc,act_nop,Surname_tfidf_1,bs_nop_count_label,HasCrCard,IsActiveMember,Age_pca_comb,CreditScore_unimp_cluster_WOE,Age-NumOfProducts_cat_count,...,NumOfProducts_cat_count,Balance_Salary,bs_active_enc,act_nop_enc,CreditScore_pca_comb_final,Balance_Range_count,Surname_tfidf_0,HasCrCard_enc,bs_age_enc,Exited
0,-2.069623,-0.924451,1,-0.49923,0,1,0,-0.555462,0,-0.953493,...,84291,1.833879,0.297783,-0.149317,1.254062,89648,-0.912774,0.261683,-0.966387,0.0
1,-0.906434,-0.306724,3,0.687939,0,1,1,-0.555462,7,-0.953493,...,84291,0.051452,-1.890492,-1.151342,-2.279961,89648,-0.893908,-0.144951,-1.494328,0.0
2,-0.137777,-2.566904,1,-1.449682,0,1,0,0.407442,0,-0.308427,...,84291,2.028678,0.553561,-0.253912,-5.440209,89648,0.830234,-1.274168,-0.131707,0.0


TEST DATA: (110023, 43)


Unnamed: 0,Age_Category_enc,bs_nop_enc,act_nop,Surname_tfidf_1,bs_nop_count_label,HasCrCard,IsActiveMember,Age_pca_comb,CreditScore_unimp_cluster_WOE,Age-NumOfProducts_cat_count,...,Balance_pca_comb,NumOfProducts_cat_count,Balance_Salary,bs_active_enc,act_nop_enc,CreditScore_pca_comb_final,Balance_Range_count,Surname_tfidf_0,HasCrCard_enc,bs_age_enc
0,-1.279208,-1.236768,3,-0.877196,0,0,1,-2.131682,1,-2.345158,...,5.199338,84291,1.187295,-0.945019,-0.74882,0.109902,89648,0.432104,0.399311,-1.200575
1,1.059063,1.959783,0,0.036391,2,1,0,0.99792,0,1.166919,...,5.199338,77374,0.210348,0.450468,1.351334,-4.404257,89648,-0.235509,-0.526706,1.171625
2,-0.496432,-0.963318,1,0.036391,0,1,0,-0.408805,7,-0.840906,...,5.199338,84291,0.851665,0.187291,-0.573484,-2.099692,89648,-0.235509,0.57772,-0.738212


In [12]:
#total = pd.concat([train,test],axis=0,ignore_index=True)
#train = total.iloc[:len(train),:]
#test = total.iloc[len(train):,:]
#cat_col

In [13]:
# Start from a uniform float dtype for every column of both tables.
train = train.astype("float")
test = test.astype("float")
# Columns with fewer than 25 distinct values in train are treated as categorical.
cat_col = [name for name in train.columns if train[name].nunique()<25]

#train_=train.copy()
# Categorical columns are stored as integers; at this point the list is expected
# to contain the target "Exited" as well (the remove() below relies on it).
train[cat_col] = train[cat_col].astype("int")
# The test table has no "Exited" column, so it is dropped from the list before casting test.
cat_col.remove("Exited")
test[cat_col] = test[cat_col].astype("int")
# Styled overview of the resulting train table (notebook display only).
summary(train).style.background_gradient(cmap='Reds')

data shape: (165034, 44)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,median,mean
Age_Category_enc,float64,0,0.0,165033,-5.199338,5.199338,-1.4e-05,-0.000678
bs_nop_enc,float64,0,0.0,165025,-5.199338,5.199338,0.004491,0.002825
act_nop,int64,0,0.0,5,0.0,4.0,2.0,1.574748
Surname_tfidf_1,float64,0,0.0,1007,-5.199338,5.199338,0.036391,0.004019
bs_nop_count_label,int64,0,0.0,5,0.0,4.0,1.0,1.075778
HasCrCard,int64,0,0.0,2,0.0,1.0,1.0,0.753954
IsActiveMember,int64,0,0.0,2,0.0,1.0,0.0,0.49777
Age_pca_comb,float64,0,0.0,74,-5.199338,5.199338,0.005018,-0.004031
CreditScore_unimp_cluster_WOE,int64,0,0.0,10,0.0,9.0,5.0,4.177091
Age-NumOfProducts_cat_count,float64,0,0.0,197,-5.199338,5.199338,0.005018,-6e-06


## **3.0 Dataset Manager:**


In [14]:
# Target column kept under a separate name (presumably for stratified splitting - TODO confirm).
strat_feature = train["Exited"]

# Independent copies so later in-place edits do not touch train/test.
X=train.drop(columns=["Exited"]).copy()
y=train["Exited"].copy()
X_test=test.copy()

# Sanity check: train features and test must have the same number of columns.
X.shape[1]==X_test.shape[1]

True

In [15]:
# Float columns of X with more than 25 distinct values are the ones re-scaled.
num_var = X.select_dtypes("float").columns
list_to_stand = [name for name in num_var if X[name].nunique()>25]

# Quantile mapping to a normal output distribution; quantiles are estimated
# on a 50,000-row subsample with a fixed random_state.
scaler = QuantileTransformer(subsample=50_000, output_distribution="normal",random_state=42)

# Fitted on the training features only, then applied unchanged to the test features.
X[list_to_stand] = scaler.fit_transform(X[list_to_stand])
X_test[list_to_stand] = scaler.transform(X_test[list_to_stand])

In [16]:
summary(X.select_dtypes("int")).style.background_gradient(cmap='Reds')

data shape: (165034, 14)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,median,mean
act_nop,int64,0,0.0,5,0.0,4.0,2.0,1.574748
bs_nop_count_label,int64,0,0.0,5,0.0,4.0,1.0,1.075778
HasCrCard,int64,0,0.0,2,0.0,1.0,1.0,0.753954
IsActiveMember,int64,0,0.0,2,0.0,1.0,0.0,0.49777
CreditScore_unimp_cluster_WOE,int64,0,0.0,10,0.0,9.0,5.0,4.177091
bs_nop,int64,0,0.0,5,0.0,4.0,2.0,1.720379
Geography_count_label-NumOfProducts_cat_count,int64,0,0.0,9,-84291.0,-3367.0,-84289.0,-79395.477859
act_age,int64,0,0.0,12,0.0,11.0,5.0,4.166566
Geography_count/NumOfProducts_cat_count_label,int64,0,0.0,5,-5.0,1.0,0.0,0.144946
bs_active,int64,0,0.0,4,0.0,3.0,2.0,1.645522


In [17]:
# Re-map the raw values of these count/label columns to consecutive integer codes.
encoder = OrdinalEncoder()
to_encode = ["Geography_count_label-NumOfProducts_cat_count","Geography_count/NumOfProducts_cat_count_label","bs_nop_count","NumOfProducts_cat_count","Balance_Range_count"]
X[to_encode] = encoder.fit_transform(X[to_encode])
X[to_encode] = X[to_encode].astype("int")
# Reuse the mapping learned on train: re-fitting on test would assign codes from
# the test set's own categories, so the same raw value could get a different code.
# A value unseen in train now raises instead of being silently mis-encoded.
X_test[to_encode] = encoder.transform(X_test[to_encode])
# Keep the test dtypes aligned with train (the encoder returns floats).
X_test[to_encode] = X_test[to_encode].astype("int")

In [18]:
summary(X.select_dtypes("int")).style.background_gradient(cmap='Reds')

data shape: (165034, 14)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,median,mean
act_nop,int64,0,0.0,5,0.0,4.0,2.0,1.574748
bs_nop_count_label,int64,0,0.0,5,0.0,4.0,1.0,1.075778
HasCrCard,int64,0,0.0,2,0.0,1.0,1.0,0.753954
IsActiveMember,int64,0,0.0,2,0.0,1.0,0.0,0.49777
CreditScore_unimp_cluster_WOE,int64,0,0.0,10,0.0,9.0,5.0,4.177091
bs_nop,int64,0,0.0,5,0.0,4.0,2.0,1.720379
Geography_count_label-NumOfProducts_cat_count,int64,0,0.0,9,0.0,8.0,2.0,2.167802
act_age,int64,0,0.0,12,0.0,11.0,5.0,4.166566
Geography_count/NumOfProducts_cat_count_label,int64,0,0.0,5,0.0,4.0,3.0,3.159464
bs_active,int64,0,0.0,4,0.0,3.0,2.0,1.645522


In [19]:
summary(X.select_dtypes("float")).style.background_gradient(cmap='Blues')

data shape: (165034, 29)


Unnamed: 0,data type,#missing,%missing,#unique,min,max,median,mean
Age_Category_enc,float64,0,0.0,165031,-5.199338,5.199338,0.002424,0.004039
bs_nop_enc,float64,0,0.0,165025,-5.199338,5.199338,0.009085,0.004027
Surname_tfidf_1,float64,0,0.0,1007,-5.199338,5.199338,0.03388,0.005195
Age_pca_comb,float64,0,0.0,74,-5.199338,5.199338,0.001255,-0.006444
Age-NumOfProducts_cat_count,float64,0,0.0,196,-5.199338,5.199338,-0.003764,-0.003649
act_age_enc,float64,0,0.0,165030,-5.199338,5.199338,0.0056,0.000294
Age_cat*NumOfProducts_cat_count_label,float64,0,0.0,111,-5.199338,5.199338,-5.199338,-2.259986
CustomerId,float64,0,0.0,23218,-5.199338,5.199338,0.006273,0.005358
IsActiveMember_enc,float64,0,0.0,165033,-5.199338,5.199338,-0.006644,-0.006912
CreditScore_cat_pca_comb_final,float64,0,0.0,456,-5.199338,5.199338,0.0,-0.041095


In [20]:
# Column groups are derived from the training features: float columns are the
# numerical inputs, integer columns the categorical ones.
num_var = X.select_dtypes("float").columns
cat_var = X.select_dtypes("int").columns

# Store numerical features as float32 in BOTH tables. The test table was
# previously cast to int32, which truncated every numerical feature.
X[num_var] = X[num_var].astype("float32")
X_test[num_var] = X_test[num_var].astype("float32")

In [21]:
# Re-attach the target, then weight every positive row by the ratio of
# negative to positive rows (negative rows keep weight 1).
X["Exited"] = y
weight = int((X["Exited"] == 0).sum()) / int((X["Exited"] == 1).sum())
X["weights"] = np.where(X["Exited"] == 0, 1, weight)
# Placeholder target and unit weights so the test table has the same columns as X.
X_test["Exited"] = 0
X_test["weights"] = 1

### 3.1 Data Loading

In [22]:
def dataframe_to_dataset(dataframe, shuffle=False, batch_size=32):
    """
    Converts a DataFrame into a batched `tf.data.Dataset`.

    Parameters
    ----------
    `dataframe`: table containing the feature columns plus "Exited" and "weights"
    `shuffle`: when truthy, shuffles with a buffer covering the whole table
    `batch_size`: number of rows per batch (also used as the prefetch buffer size)

    Returns a dataset yielding (feature dict, labels, weights) batches.
    The input DataFrame is left untouched.
    """
    frame = dataframe.copy()
    # Separate target and sample weights; what is left are the model features.
    labels = frame.pop("Exited")
    weights = frame.pop("weights")
    features = dict(frame)

    ds = tf.data.Dataset.from_tensor_slices((features, labels, weights))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(frame))
    return ds.batch(batch_size).prefetch(batch_size)

In [23]:
train_dataset = dataframe_to_dataset(X, batch_size=32, shuffle=True)
test_dataset = dataframe_to_dataset(X_test, batch_size=32, shuffle=False)

In [24]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165034 entries, 0 to 165033
Data columns (total 45 columns):
 #   Column                                         Non-Null Count   Dtype  
---  ------                                         --------------   -----  
 0   Age_Category_enc                               165034 non-null  float32
 1   bs_nop_enc                                     165034 non-null  float32
 2   act_nop                                        165034 non-null  int64  
 3   Surname_tfidf_1                                165034 non-null  float32
 4   bs_nop_count_label                             165034 non-null  int64  
 5   HasCrCard                                      165034 non-null  int64  
 6   IsActiveMember                                 165034 non-null  int64  
 7   Age_pca_comb                                   165034 non-null  float32
 8   CreditScore_unimp_cluster_WOE                  165034 non-null  int64  
 9   Age-NumOfProducts_cat_count          

In [25]:
gc.collect()

159

### 3.2 Test FeatureSpace

In [26]:
#X_fs = X.sample(frac=0.5,random_state=17)
#train_dataset_fs = dataframe_to_dataset(X_fs, batch_size=64, shuffle=True)

In [27]:
X.columns

Index(['Age_Category_enc', 'bs_nop_enc', 'act_nop', 'Surname_tfidf_1',
       'bs_nop_count_label', 'HasCrCard', 'IsActiveMember', 'Age_pca_comb',
       'CreditScore_unimp_cluster_WOE', 'Age-NumOfProducts_cat_count',
       'bs_nop', 'Geography_count_label-NumOfProducts_cat_count',
       'act_age_enc', 'act_age', 'Age_cat*NumOfProducts_cat_count_label',
       'CustomerId', 'Geography_count/NumOfProducts_cat_count_label',
       'IsActiveMember_enc', 'CreditScore_cat_pca_comb_final', 'bs_active',
       'Age_pca_comb_pca_comb_final', 'Balance_Range', 'bs_gender_enc',
       'quant_EstimatedSalary', 'Surname_tfidf_2', 'bs_nop_count',
       'Surname_tfidf_3', 'Gender_enc', 'Tenure_enc', 'Balance_Range_enc',
       'Surname_tfidf_4', 'Balance-NumOfProducts_cat_count_label',
       'Geo_Gender_enc', 'Balance_pca_comb', 'NumOfProducts_cat_count',
       'Balance_Salary', 'bs_active_enc', 'act_nop_enc',
       'CreditScore_pca_comb_final', 'Balance_Range_count', 'Surname_tfidf_0',
       

In [28]:
num_var,cat_var

(Index(['Age_Category_enc', 'bs_nop_enc', 'Surname_tfidf_1', 'Age_pca_comb',
        'Age-NumOfProducts_cat_count', 'act_age_enc',
        'Age_cat*NumOfProducts_cat_count_label', 'CustomerId',
        'IsActiveMember_enc', 'CreditScore_cat_pca_comb_final',
        'Age_pca_comb_pca_comb_final', 'bs_gender_enc', 'quant_EstimatedSalary',
        'Surname_tfidf_2', 'Surname_tfidf_3', 'Gender_enc', 'Tenure_enc',
        'Balance_Range_enc', 'Surname_tfidf_4',
        'Balance-NumOfProducts_cat_count_label', 'Geo_Gender_enc',
        'Balance_pca_comb', 'Balance_Salary', 'bs_active_enc', 'act_nop_enc',
        'CreditScore_pca_comb_final', 'Surname_tfidf_0', 'HasCrCard_enc',
        'bs_age_enc'],
       dtype='object'),
 Index(['act_nop', 'bs_nop_count_label', 'HasCrCard', 'IsActiveMember',
        'CreditScore_unimp_cluster_WOE', 'bs_nop',
        'Geography_count_label-NumOfProducts_cat_count', 'act_age',
        'Geography_count/NumOfProducts_cat_count_label', 'bs_active',
        'Bal

In [None]:
%%time
# One preprocessing spec per input column: integer categoricals keep their
# integer codes (no out-of-vocabulary slot), numerical columns pass through as floats.
feature_specs = {
    **{a: FeatureSpace.integer_categorical(num_oov_indices=0, output_mode="int") for a in cat_var},
    **{b: FeatureSpace.float() for b in num_var},
}
feature_space = FeatureSpace(features=feature_specs, output_mode="dict")


# Named functions are used for Dataset.map instead of lambdas: with lambdas this
# cell emitted AutoGraph warnings ("could not parse the source code of <lambda>").
def _features_only(x, *_):
    """Keeps the feature dict and drops the remaining dataset elements (labels, weights)."""
    return x


def _apply_feature_space(x, y, w):
    """Runs the adapted FeatureSpace on the features; labels and weights pass through."""
    return feature_space(x), y, w


train_ds_with_no_labels = train_dataset.map(_features_only)
print("Adapting Features Space....")
feature_space.adapt(train_ds_with_no_labels)

preprocessed_train_ds = train_dataset.map(_apply_feature_space, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

Cause: could not parse the source code of <function <lambda> at 0x7ed3262ae7a0>: no matching AST found among candidates:



Cause: could not parse the source code of <function <lambda> at 0x7ed3262ae7a0>: no matching AST found among candidates:

Adapting Features Space....


In [None]:
gc.collect()

In [None]:
for X_,y_,w_ in preprocessed_train_ds.take(1):
  print(len(X_.keys()))
  print(y_.shape)
  print(w_.shape)

## **4.0 MODELS**

### 4.1 Baseline:

In [None]:
def encode_inputs(inputs, list_categorical_nn, Cat_Feat_Entries, edense=4, num_dense_exp=False, embedding_dims_det=None, name="enc"):
    """
    Splits the model inputs into embedded categorical and numerical branches.

    Parameters
    ----------
    `inputs`: mapping from feature name to input tensor
    `list_categorical_nn`: names of the features to embed
    `Cat_Feat_Entries`: mapping from categorical feature name to the number of
                        distinct codes (used as the embedding input dimension)
    `edense`: width of the optional Dense layers; also the embedding width
              when `embedding_dims_det` is None
    `num_dense_exp`: when truthy, every feature is passed through its own
                     Dense layer (categorical ones are then reshaped to `edense`)
    `embedding_dims_det`: embedding width override; either one integer shared by
                          all categorical features or a dict keyed by feature name
    `name`: kept for signature compatibility; not used by this implementation

    Returns a tuple (categorical tensors, numerical tensors), each a list in
    the iteration order of `inputs`. Numerical tensors get a trailing axis added.
    """
    encoded_categorical_feature_list = []
    numerical_feature_list = []

    for feature_name in inputs:
        if feature_name in list_categorical_nn:

            vocabulary = Cat_Feat_Entries[feature_name]

            # Accept both the scalar form used by the callers in this file and
            # the per-feature dict form, so either style of call works.
            if embedding_dims_det is None:
                ed = edense
            elif isinstance(embedding_dims_det, dict):
                ed = embedding_dims_det[feature_name]
            else:
                ed = embedding_dims_det

            embedding = layers.Embedding(input_dim=vocabulary, output_dim=ed)
            # Convert the index values to embedding representations.
            encoded_categorical_feature = embedding(inputs[feature_name])
            if num_dense_exp:
                encoded_categorical_feature = layers.Dense(edense)(encoded_categorical_feature)
                encoded_categorical_feature = layers.Reshape((edense,))(encoded_categorical_feature)

            encoded_categorical_feature_list.append(encoded_categorical_feature)

        else:
            # Use the numerical features as-is.
            numerical_feature = inputs[feature_name] #tf.expand_dims(inputs[feature_name], -1)

            if num_dense_exp:
                numerical_feature = layers.Dense(edense)(numerical_feature)

            # Add a trailing axis to every numerical tensor.
            numerical_feature = keras.ops.expand_dims(numerical_feature, -1)
            numerical_feature_list.append(numerical_feature)

    return encoded_categorical_feature_list, numerical_feature_list

In [None]:
# Integer-typed feature columns (target excluded) are the categorical inputs of the network.
list_categorical_nn = X.drop(columns="Exited").select_dtypes("int").columns.tolist()
# Number of distinct values observed in X for each categorical feature.
Cat_Feat_Entries = {name: X[name].nunique() for name in list_categorical_nn}

In [None]:
list_categorical_nn

In [None]:
def create_baseline_model(fs,name="baseline_model",learning_rate = 0.001,
                          activation="relu",gn_noise=0.0,dropout=0.5,hidden_layers=3,
                          units_0=256,units_1=256,):
  """
  Builds and compiles a Keras functional model on top of a FeatureSpace.

  Parameters
  ----------
  `fs`: object exposing `get_encoded_features()`; its result is used as the model inputs
  `name`: model name
  `learning_rate`: learning rate of the Adam optimizer
  `activation`, `gn_noise`, `dropout`, `hidden_layers`, `units_0`, `units_1`:
      accepted but not referenced anywhere in the body as written

  Reads the module-level `list_categorical_nn` and `Cat_Feat_Entries`.

  Returns the compiled model, whose outputs are the two concatenated tensors
  (categorical embeddings, numerical features).
  """

  encoded_features = fs.get_encoded_features()
  # Wide Leg
  # Categorical features are embedded with width 8; no per-feature Dense expansion.
  cat,num = encode_inputs(encoded_features, list_categorical_nn=list_categorical_nn, Cat_Feat_Entries=Cat_Feat_Entries, embedding_dims_det=8, name="enc_wide",num_dense_exp=False)

  wide_cat = tf.keras.layers.concatenate(cat, name="wide_cat_concat")
  wide_num = tf.keras.layers.concatenate(num, name="wide_num_concat")

  # NOTE(review): the outputs are the intermediate concatenations, with no
  # prediction head - presumably a work-in-progress/debugging state; confirm.
  model = tf.keras.Model(inputs=encoded_features, outputs=[wide_cat,wide_num], name=name)

  # NOTE(review): CategoricalCrossentropy and PR-AUC are configured here, while the
  # header of this notebook describes a binary target scored with ROC AUC - confirm.
  model.compile(
          optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
          loss=tf.keras.losses.CategoricalCrossentropy(name="cat_ce"),
          metrics=[keras.metrics.AUC(curve="PR")],
          weighted_metrics=[])

  return model

In [None]:
len(cat),len(num)

In [None]:
model = create_baseline_model(feature_space)
model.summary()

In [None]:
  # NOTE(review): indented fragment with no enclosing definition in this cell; it uses
  # names that are not defined at module level here (encoded_features_conc, units_0,
  # units_1, activation, dropout, gn_noise, hidden_layers) - presumably meant to be
  # part of the body of create_baseline_model; TODO confirm.
  # Input block: Dense -> BatchNormalization -> Dropout.
  x = tf.keras.layers.Dense(units_0, activation=activation, name="dense_in")(encoded_features_conc)
  x = tf.keras.layers.BatchNormalization(name="bn_in")(x)
  x = tf.keras.layers.Dropout(dropout,name="do_in")(x)

  x = tf.keras.layers.GaussianNoise(stddev=gn_noise, name="gsn")(x)

  # `hidden_layers` repetitions of the same Dense -> BatchNormalization -> Dropout block.
  for lr in range(hidden_layers):
    x = tf.keras.layers.Dense(units_1, activation=activation, name=f"dense_{lr}")(x)
    x = tf.keras.layers.BatchNormalization(name=f"bn_{lr}")(x)
    x = tf.keras.layers.Dropout(dropout,name=f"do_{lr}")(x)

  # NOTE(review): 3-unit softmax head, while the target described in this notebook
  # (Exited) is binary - confirm the intended output size/activation.
  output = tf.keras.layers.Dense(3, activation="softmax",name="output_final")(x)

In [64]:
gc.collect()

885