In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier

from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import classification_report, confusion_matrix

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/digipodium/Datasets/main/classfication/bank_data.csv',index_col=0)
df.head()

Unnamed: 0_level_0,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
df.drop(columns=['CustomerId','Surname'], inplace=True)
df.head(2)

Unnamed: 0_level_0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
RowNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,619,France,Female,42,2,0.0,1,1,1,101348.88,1
2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0


In [4]:
X = df.drop(columns='Exited')
y = df['Exited']

#### OVER SAMPLING

In [5]:
print("Data Null values check")
df.isnull().sum()

Data Null values check


CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
X.select_dtypes('object').columns.tolist()

['Geography', 'Gender']

In [7]:
X.select_dtypes(np.number).columns.tolist()

['CreditScore',
 'Age',
 'Tenure',
 'Balance',
 'NumOfProducts',
 'HasCrCard',
 'IsActiveMember',
 'EstimatedSalary']

#### pipeline creation

In [8]:
numeric_transformer = Pipeline(steps=[
    ("imputer",SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

In [9]:
preprocessor = ColumnTransformer( transformers=[
    ("numeric", numeric_transformer, X.select_dtypes(np.number).columns.tolist() ),
    ("category", categorical_transformer, X.select_dtypes('object').columns.tolist() ),
])

In [10]:
clf = Pipeline( steps=[
    ('preprocessor',preprocessor),
    ("resampling",SMOTE()),
    ("classifier", RandomForestClassifier(n_estimators=5, min_samples_split=4))
])

In [11]:
clf

In [12]:
param_grid = {
    'classifier__min_samples_split':[2,3,4,5]
}
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
grid_search.fit(X_train, y_train)
print("Best params:")
print(grid_search.best_params_)

Best params:
{'classifier__min_samples_split': 3}


In [14]:
print(f"Internal CV score: {grid_search.best_score_:.3f}")

Internal CV score: 0.821


In [15]:
X.shape

(10000, 10)

In [16]:
clf.fit(X_train, y_train) # final fit
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1412  195]
 [ 170  223]]
              precision    recall  f1-score   support

           0       0.89      0.88      0.89      1607
           1       0.53      0.57      0.55       393

    accuracy                           0.82      2000
   macro avg       0.71      0.72      0.72      2000
weighted avg       0.82      0.82      0.82      2000



In [17]:
from joblib import dump

dump(clf, 'model/churn_model_v1.jb')

['model/churn_model_v1.jb']

In [18]:
X.iloc[0] # sample

CreditScore              619
Geography             France
Gender                Female
Age                       42
Tenure                     2
Balance                  0.0
NumOfProducts              1
HasCrCard                  1
IsActiveMember             1
EstimatedSalary    101348.88
Name: 1, dtype: object

In [26]:
X.EstimatedSalary.unique()

array([101348.88, 112542.58, 113931.57, ...,  42085.58,  92888.52,
        38190.78])

In [36]:
clf.predict(pd.DataFrame([X.loc[1]]))

array([1], dtype=int64)

CreditScore              619
Geography             France
Gender                Female
Age                       42
Tenure                     2
Balance                  0.0
NumOfProducts              1
HasCrCard                  1
IsActiveMember             1
EstimatedSalary    101348.88
Name: 1, dtype: object