In [19]:
# Data manipulation
import pandas as pd
import numpy as np
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# Metric and model form scikit-learn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import learning_curve
# Boosting model
import catboost as cb
import lightgbm as lgb
import xgboost as xgb
# Tuning optimization
import optuna
import optuna.visualization as vis
# System utilities
import warnings
warnings.filterwarnings("ignore")
import sys
print(sys.executable)


c:\Users\emanu\Analysis-on-Churn-Banking-Modeling-Dataset\.venv\Scripts\python.exe


Import and cleaning of our dataset

In [20]:
df = pd.read_csv(r"Clean_dataset.csv") # Data cleaning and feature engineering done in Feature_Engineering.ipynb
df = df.drop('Id_Cliente',axis=1) # No need cause it was confusing the model
df.head()

Unnamed: 0,Flag_Richiesta_Estinzione_cc,Imp_Valore_del_Cliente,Flag_Apertura_Conto_Online,Flag_Possesso_piu_Conti,Eta,Provincia_Domicilio,Provincia_Residenza,Anno_Apertura_primo_Conto,Professione,Imp_Reddito,...,Imp_Liquidit�_Attuale,Imp_Gestito_attuale,Imp_Amministrato_attuale,Imp_Liquidit�_Attuale_6m,Imp_Gestito_attuale_6m,Imp_Amministrato_attuale_6m,Flag_Trasferimento_Titoli_Out,Flag_Rifiuto_Carte,Flag_Rifiuto_Prestiti,Flag_Disattivazione_RID
0,no,65.03,0,0,38.0,BO,BO,2004.0,Impiegato,3.0,...,1634.57,2978.4,2980.92,1550.44,2853.2,2853.22,0,0,0,0
1,no,138.88,0,0,45.0,SA,SA,2000.0,Impiegato,4.0,...,11918.26,,34916.15,26122.17,,22053.82,0,0,0,0
2,no,546.54,0,0,61.0,VT,VT,2005.0,Altro/Nessuno,4.0,...,2671.95,,232776.62,16545.25,,216304.33,1,0,0,0
3,no,68.69,0,0,33.0,MI,MI,2010.0,Impiegato,2.0,...,19211.31,15013.53,15017.53,12500.0,,,0,0,0,0
4,no,2417.05,0,0,36.0,MI,MI,2001.0,Quadro,,...,1526.38,,,1427.7,,21514.6,0,0,0,0


In [21]:
X = df.drop('Flag_Richiesta_Estinzione_cc', axis=1)
y = df['Flag_Richiesta_Estinzione_cc'].map({'no': 0, 'si': 1})  
XX = pd.get_dummies(X) # It will be useful later to ensure that the column size of all the df is the same (Provincia and Residenza were giving us problem due to their unique values)


In [22]:
def clean_feature_names(df):
    df.columns = [col.replace(' ', '_').replace('[', '').replace(']', '').replace('<', '') for col in df.columns]
    return df
# After creating dummies, the model was not able to elaborate certain columns due to the presence of certain special characters

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1) # Split 4:5, random state to ensure reproducibility
# To ensure that the size is the same even after the split
XX = pd.get_dummies(pd.concat([X_train, X_test], sort=False)) 
X_train, X_test = XX.loc[X_train.index], XX.loc[X_test.index]
# apply clean_feature_names
X_train = clean_feature_names(X_train)
X_test = clean_feature_names(X_test)

X_train, X_test = X_train.align(X_test, join='inner', axis=1)
# I didn't know if the code would confuse the df of the last tuning with the final one, i changed names to ensure this don't happen
X_train_final = X_train
X_test_final = X_test
y_train_final = y_train
y_test_final = y_test

Import and cleaning of our synthetic dataset previously generated

In [24]:
synthetic_df = pd.read_csv(r"Synthetic_dataset.csv")
synthetic_df = synthetic_df.drop('Id_Cliente',axis=1)

We apply the same feature engineering process for the original dataset

In [25]:
synthetic_df=synthetic_df
synthetic_df['Professione'] = synthetic_df['Professione'].apply(lambda x: 'Ufficiale/Sottoufficiale' if x == 'Ufficiale/Sottufficiale' else 
                                                                  'Libero professionista/Titolare impresa' if x == 'Libero professionista/Titolare di impresa' else 
                                                                  x)
synthetic_df['Provincia_Residenza'] = synthetic_df['Provincia_Residenza'].replace('FO', 'FC')
synthetic_df['Provincia_Domicilio'] = synthetic_df['Provincia_Domicilio'].replace('FO', 'FC')

synthetic_df['Provincia_Residenza'] = synthetic_df['Provincia_Residenza'].replace('PS', 'PU')
synthetic_df['Provincia_Domicilio'] = synthetic_df['Provincia_Domicilio'].replace('PS', 'PU')

synthetic_df['Provincia_Domicilio'] = synthetic_df['Provincia_Domicilio'].replace('8N', np.nan)
synthetic_df['Provincia_Domicilio'] = synthetic_df['Provincia_Domicilio'].replace('BE', np.nan)
mapping = {
    'CAUTO': 1,
    'PRUDENTE': 2,
    'BILANCIATO': 3,
    'DINAMICO':4,
    'ND':0
}
synthetic_df['Profilo_MIFID'] = synthetic_df['Profilo_MIFID'].replace(mapping)

mapping2 = {
    'Bassissimo (<1200)': 1,
    'Basso (tra 1200 e 1500)': 2,
    'Medio (tra 1500 e 1800)': 3,
    'Alto (tra 1800 e 2500)':4,
    'Altissimo (>2500)':5
}

synthetic_df['Imp_Reddito'] = synthetic_df['Imp_Reddito'].replace(mapping2)

In [26]:
X_synthetic = synthetic_df.drop('Flag_Richiesta_Estinzione_cc', axis=1)
y_synthetic = synthetic_df['Flag_Richiesta_Estinzione_cc'].map({'no': 0, 'si': 1})  

In [27]:
# Same process as before to ensure that column lenght is the same since once we train our model with specific columns, we need to ensure that our model has in input a df with same lenght and names
X_synthetic = pd.get_dummies(X_synthetic)
X_train_synthetic = clean_feature_names(X_synthetic)

missing_columns_train = set(XX.columns) - set(X_train_synthetic.columns)
for col in missing_columns_train:
    X_train_synthetic[col] = 0  

X_train_res_synthetic = X_train_synthetic[XX.columns]

print("Training features:", X_train_res_synthetic.shape)


Training features: (70000, 279)


LightGBM 

In [28]:
# We stopped the tuning earlier since we found a good set of parameters and tuning wasn't giving us better results but was converging to the same parameters
'''
[I 2024-04-23 17:36:46,835] Trial 26 finished with value: 0.8232704402515724 and parameters: {'n_estimators': 297, 'learning_rate': 0.010065762501114987, 'max_depth': 13, 'num_leaves': 25, 'min_child_samples': 28, 'subsample': 0.9186047768818155, 'subsample_freq': 5, 'colsample_bytree': 0.8969786318428297, 'reg_alpha': 0.9097824682690728, 'reg_lambda': 0.6075396766185748, 'max_bin': 289}. Best is trial 26 with value: 0.8232704402515724.
'''
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])

best_params = {
    "n_estimators": 297,
    "learning_rate": 0.010065762501114987,
    "max_depth": 13,
    "num_leaves": 25,
    "min_child_samples": 28,
    "subsample": 0.9186047768818155,
    "subsample_freq": 5,
    "colsample_bytree": 0.8969786318428297,
    "reg_alpha": 0.9097824682690728,
    "reg_lambda": 0.6075396766185748,
    "max_bin": 289,
    'force_col_wise': True,
    'scale_pos_weight': scale_pos_weight,
    'verbosity': -1
}

best_lgb_classifier = lgb.LGBMClassifier(**best_params)
best_lgb_classifier.fit(X_train_final, y_train_final)
lgbm_predictions = best_lgb_classifier.predict_proba(X_test_final)[:, 1]

XGBoost

In [29]:
import xgboost
'''
Best Hyperparameters: {'n_estimators': 661, 'learning_rate': 0.036491004518573594, 'max_depth': 3, 'min_child_weight': 4, 'subsample': 0.6625916610133735, 'colsample_bytree': 0.864803089169032, 'gamma': 3.1877873567760657, 'reg_alpha': 4.436063712881633, 'reg_lambda': 4.7749277591032975}
Best Score for Top 50,000: 0.829559748427673
'''
# There was a problem with the parameters verbosity, I had to set manually a value in range in main xgboost configuration
xgb.set_config(verbosity=0)
config = xgb.get_config()
assert config['verbosity'] == 0
xgboost.config_context(verbosity=0)
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])
xgb_params = {
    'n_estimators': 661,
    'learning_rate': 0.036491004518573594,
    'max_depth': 3,
    'min_child_weight': 4,
    'subsample': 0.6625916610133735,
    'colsample_bytree': 0.864803089169032,
    'gamma': 3.1877873567760657,
    'reg_alpha': 4.436063712881633,
    'reg_lambda': 4.7749277591032975,
    'scale_pos_weight': scale_pos_weight,
}

best_xgb_classifier = xgb.XGBClassifier(verbosity=0, **xgb_params)

best_xgb_classifier.fit(X_train_final, y_train_final)
xgb_predictions = best_xgb_classifier.predict_proba(X_test_final)[:, 1]

Catboost

In [50]:
'''
Best Hyperparameters: {'n_estimators': 486, 'learning_rate': 0.08564115269968339, 'max_depth': 5, 'l2_leaf_reg': 5.495063193351241, 'border_count': 113, 'grow_policy': 'SymmetricTree', 'min_data_in_leaf': 8, 'leaf_estimation_iterations': 9, 'leaf_estimation_method': 'Gradient'}
Best Score for Top 50,000: 0.8327044025157233
'''
scale_pos_weight = len(y[y == 0]) / len(y[y == 1])

catboost_params = {
    'n_estimators': 486,
    'learning_rate': 0.08564115269968339,
    'max_depth': 5,
    'l2_leaf_reg': 5.495063193351241,
    'border_count': 113,
    'grow_policy': 'SymmetricTree',
    'min_data_in_leaf': 8,
    'leaf_estimation_iterations': 9,
    'leaf_estimation_method': 'Gradient',
    'scale_pos_weight': scale_pos_weight,
}

best_catboost_classifier = cb.CatBoostClassifier(**catboost_params)

best_catboost_classifier.fit(X_train_final, y_train_final)
catboost_predictions = best_catboost_classifier.predict_proba(X_test_final)[:, 1]



0:	learn: 0.6633804	total: 714ms	remaining: 5m 46s
1:	learn: 0.6394381	total: 1.31s	remaining: 5m 18s
2:	learn: 0.6182192	total: 1.83s	remaining: 4m 55s
3:	learn: 0.5990031	total: 2.34s	remaining: 4m 42s
4:	learn: 0.5832813	total: 2.79s	remaining: 4m 28s
5:	learn: 0.5696593	total: 3.37s	remaining: 4m 29s
6:	learn: 0.5572461	total: 3.97s	remaining: 4m 31s
7:	learn: 0.5457160	total: 4.42s	remaining: 4m 23s
8:	learn: 0.5371766	total: 4.97s	remaining: 4m 23s
9:	learn: 0.5294709	total: 5.5s	remaining: 4m 21s
10:	learn: 0.5212493	total: 6.25s	remaining: 4m 29s
11:	learn: 0.5127680	total: 6.88s	remaining: 4m 31s
12:	learn: 0.5044516	total: 7.5s	remaining: 4m 32s
13:	learn: 0.4982165	total: 7.94s	remaining: 4m 27s
14:	learn: 0.4925573	total: 8.48s	remaining: 4m 26s
15:	learn: 0.4882312	total: 9.18s	remaining: 4m 29s
16:	learn: 0.4838150	total: 9.68s	remaining: 4m 27s
17:	learn: 0.4782039	total: 10.2s	remaining: 4m 25s
18:	learn: 0.4739136	total: 10.6s	remaining: 4m 20s
19:	learn: 0.4697709	tot

Ensembling

In [75]:
weight_catboost = 0.7
weight_xgb = 0.25
weight_lgbm = 0.05

In [67]:
catboost_predictions = best_catboost_classifier.predict_proba(X_test_final)[:, 1]
xgb_predictions = best_xgb_classifier.predict_proba(X_test_final)[:, 1]
lgbm_predictions = best_lgb_classifier.predict_proba(X_test_final)[:, 1]

ensemble_predictions = (weight_catboost * catboost_predictions) + (weight_xgb * xgb_predictions) + (weight_lgbm * lgbm_predictions)

Se ti serve il classifier allora non puoi usare le tecniche che dicevi, falle invece per ogni algoritmo