# Bank Churners Classifier Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#pre-training
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

#training
from sklearn import ensemble
from sklearn import pipeline


#post training 
from sklearn.metrics import accuracy_score
from joblib import dump

#### Read data

In [2]:
# Load the customer dataset from a path relative to this notebook; pandas
# reads the CSV directly out of the .zip archive.
data_df = pd.read_csv('../../datasets/credit-card-customers/BankChurners.zip')
# Cell output: (number of rows, number of columns).
data_df.shape

(10127, 23)

In [3]:
# Preview the first rows of the frame as loaded (no columns have been dropped yet).
data_df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,...,12691.0,777,11914.0,1.335,1144,42,1.625,0.061,9.3e-05,0.99991
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,...,8256.0,864,7392.0,1.541,1291,33,3.714,0.105,5.7e-05,0.99994
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,...,3418.0,0,3418.0,2.594,1887,20,2.333,0.0,2.1e-05,0.99998
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,...,3313.0,2517,796.0,1.405,1171,20,2.333,0.76,0.000134,0.99987
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,...,4716.0,0,4716.0,2.175,816,28,2.5,0.0,2.2e-05,0.99998


In [4]:
# List every column name as loaded from the CSV.
data_df.columns

Index(['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
       'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'],
      dtype='object')

In [5]:
# Count missing values per column.
data_df.isna().sum()

CLIENTNUM                                                                                                                             0
Attrition_Flag                                                                                                                        0
Customer_Age                                                                                                                          0
Gender                                                                                                                                0
Dependent_count                                                                                                                       0
Education_Level                                                                                                                       0
Marital_Status                                                                                                                        0
Income_Category                                 

#### Remove columns which should not go into the model

In [6]:
# Drop the columns that must not reach the model: the customer identifier and
# the two `Naive_Bayes_Classifier_*` columns (presumably precomputed model
# outputs that would leak the target -- confirm against the dataset notes).
#
# `data_df` is rebound instead of mutated with `inplace=True`, and
# `errors='ignore'` skips columns that are already gone, so re-running this
# cell is a no-op rather than a KeyError.
data_df = data_df.drop(
    columns=[
        'CLIENTNUM',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ],
    errors='ignore',
)

#### Convert categorical columns

In [7]:
# Background reading on one-hot encoding with pandas:
# https://medium.com/@sami.yousuf.azad/one-hot-encoding-with-pandas-dataframe-49a304e8507a

# Columns holding string categories; each one is expanded into indicator columns.
CATEGORICAL_COLS = [
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Income_Category',
    'Card_Category',
]

# One-hot encode the categorical columns and pass every other column through untouched.
col_transformer = make_column_transformer(
    (OneHotEncoder(), CATEGORICAL_COLS),
    remainder='passthrough',
)

# Fit on the whole frame to preview the encoded layout in the next cells.
# NOTE(review): `data_df` still contains `Attrition_Flag` at this point, so the
# target appears among the passthrough columns of this preview -- confirm that
# is intended.
transformed = col_transformer.fit_transform(data_df)

transformed_df = pd.DataFrame(
    transformed,
    columns=col_transformer.get_feature_names_out(),
)

In [8]:
# Preview the encoded frame: indicator columns for each category level, followed by the passthrough columns.
transformed_df.head()

Unnamed: 0,onehotencoder__Gender_F,onehotencoder__Gender_M,onehotencoder__Education_Level_College,onehotencoder__Education_Level_Doctorate,onehotencoder__Education_Level_Graduate,onehotencoder__Education_Level_High School,onehotencoder__Education_Level_Post-Graduate,onehotencoder__Education_Level_Uneducated,onehotencoder__Education_Level_Unknown,onehotencoder__Marital_Status_Divorced,...,remainder__Months_Inactive_12_mon,remainder__Contacts_Count_12_mon,remainder__Credit_Limit,remainder__Total_Revolving_Bal,remainder__Avg_Open_To_Buy,remainder__Total_Amt_Chng_Q4_Q1,remainder__Total_Trans_Amt,remainder__Total_Trans_Ct,remainder__Total_Ct_Chng_Q4_Q1,remainder__Avg_Utilization_Ratio
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


In [9]:
# Generated feature names: one-hot columns are prefixed `onehotencoder__`, passthrough columns `remainder__`.
transformed_df.columns

Index(['onehotencoder__Gender_F', 'onehotencoder__Gender_M',
       'onehotencoder__Education_Level_College',
       'onehotencoder__Education_Level_Doctorate',
       'onehotencoder__Education_Level_Graduate',
       'onehotencoder__Education_Level_High School',
       'onehotencoder__Education_Level_Post-Graduate',
       'onehotencoder__Education_Level_Uneducated',
       'onehotencoder__Education_Level_Unknown',
       'onehotencoder__Marital_Status_Divorced',
       'onehotencoder__Marital_Status_Married',
       'onehotencoder__Marital_Status_Single',
       'onehotencoder__Marital_Status_Unknown',
       'onehotencoder__Income_Category_$120K +',
       'onehotencoder__Income_Category_$40K - $60K',
       'onehotencoder__Income_Category_$60K - $80K',
       'onehotencoder__Income_Category_$80K - $120K',
       'onehotencoder__Income_Category_Less than $40K',
       'onehotencoder__Income_Category_Unknown',
       'onehotencoder__Card_Category_Blue',
       'onehotencoder__Card_Ca

#### Build model

In [10]:
# Split into train and held-out test sets using scikit-learn's default test
# size; the fixed seed makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_df.drop(columns=['Attrition_Flag']),  # features: everything except the label
    data_df['Attrition_Flag'],                 # target: the attrition label
    random_state=1,
)

In [11]:
# Bundle preprocessing and the classifier into a single estimator, so the
# fitted object accepts raw (un-encoded) rows and applies the one-hot encoding
# itself.
pipe = pipeline.make_pipeline(
    col_transformer,
    ensemble.RandomForestClassifier(  # <== Classifier
        n_estimators=100,
        min_samples_split=2,
        # Seed the forest as well as the split: without a seed every run trains
        # a different model, so the reported score is not reproducible.
        random_state=1,
    ),
)

In [12]:
#%%time
# Train the whole pipeline (encoder + forest) on the training split.
pipe.fit(X_train, y_train)

# Predict the held-out rows once and score those predictions directly.
# `pipe.score(X_test, y_test)` reports the same accuracy, but it runs a second
# prediction pass over `X_test` and leaves `y_predict` unused.
y_predict = pipe.predict(X_test)
accuracy_score(y_test, y_predict)

CPU times: total: 1.53 s
Wall time: 2.56 s


0.9569510268562401

In [13]:
# Display the pipeline: the column transformer followed by the random forest classifier.
pipe

In [14]:
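# Disabled: as written this line does not work. `pipe` is a Pipeline, so the
# importances live on its final step, and they line up with the encoded feature
# names rather than with `X_train.columns`. A working form (untested here):
#   pd.DataFrame({'feature': pipe[:-1].get_feature_names_out(),
#                 'importance': pipe[-1].feature_importances_}).sort_values(by='importance')
#pd.DataFrame({'feature':X_train.columns, 'importance':pipe.feature_importances_}).sort_values(by='importance')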

#### Save model

In [15]:
#%%time
# Persist the fitted pipeline (preprocessing + classifier) to a file in the
# current working directory. The cell output is the list of files written.
dump(pipe, 'bank_churners_classifier_model.joblib')

CPU times: total: 15.6 ms
Wall time: 69.2 ms


['bank_churners_classifier_model.joblib']

In [16]:
#%ls

 Volume in drive C is OS
 Volume Serial Number is 8468-2DB6

 Directory of C:\Users\shahb\OneDrive\Documents\GitHub\ProgrammingForAnalytics\lectures\075_web_under_the_hood

02/05/2024  06:38 PM    <DIR>          .
01/19/2024  03:10 PM    <DIR>          ..
02/05/2024  06:36 PM    <DIR>          .ipynb_checkpoints
02/05/2024  06:35 PM    <DIR>          __pycache__
02/05/2024  06:38 PM            10,613 120-back_churners_classifier_model.ipynb
02/05/2024  06:38 PM         7,310,303 bank_churners_classifier_model.joblib
01/18/2024  10:45 PM               568 consume_json.py
01/18/2024  11:25 PM             6,927 consume_services.ipynb
01/30/2024  08:17 PM               408 decorator.pyx
01/18/2024  10:32 PM               464 serve_json.py
01/18/2024  11:21 PM               934 serve_post_json.py
01/18/2024  10:32 PM               418 serve_text.py
01/27/2024  11:21 AM         1,078,075 The web, under the hood.pdf
01/27/2024  12:05 PM         2,265,008 The web, under the hood.pptx
         

#### Read model

In [17]:
from joblib import load

In [18]:
# Reload the saved pipeline from disk.
# joblib files are pickle-based: only load files that come from a trusted source.
# NOTE(review): the loading environment presumably needs a scikit-learn version
# compatible with the one that wrote the file -- confirm.
trained_model = load('bank_churners_classifier_model.joblib')

In [19]:
# Raw input column names the loaded pipeline was fitted on; frames passed to predict need these columns.
trained_model.feature_names_in_

array(['Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
       'Marital_Status', 'Income_Category', 'Card_Category',
       'Months_on_book', 'Total_Relationship_Count',
       'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit',
       'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1',
       'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1',
       'Avg_Utilization_Ratio'], dtype=object)

The following columns are categorical

In [20]:
# The columns that the pipeline one-hot encodes.
CATEGORICAL_COLS

['Gender',
 'Education_Level',
 'Marital_Status',
 'Income_Category',
 'Card_Category']

In [21]:
# Print the distinct values seen in each categorical column.
for column_name in CATEGORICAL_COLS:
    distinct_values = data_df[column_name].unique()
    print(column_name, distinct_values)

Gender ['M' 'F']
Education_Level ['High School' 'Graduate' 'Uneducated' 'Unknown' 'College' 'Post-Graduate'
 'Doctorate']
Marital_Status ['Married' 'Single' 'Unknown' 'Divorced']
Income_Category ['$60K - $80K' 'Less than $40K' '$80K - $120K' '$40K - $60K' '$120K +'
 'Unknown']
Card_Category ['Blue' 'Gold' 'Silver' 'Platinum']


In [22]:
# A single hand-written customer record used to try out the reloaded model.
#
# The frame is built from a list holding one dict so that each column keeps its
# natural dtype (integers, floats, strings). Routing the same dict through
# `pd.Series(...).to_frame().T` would force the mixed values into one Series
# and leave every column as `object`.
test_data_df = pd.DataFrame([{
    'Customer_Age'   : 30,
    'Gender'         : 'M',
    'Dependent_count': 3,
    'Education_Level': 'Graduate',
    'Marital_Status' : 'Single',
    'Income_Category': '$40K - $60K',
    'Card_Category'  : 'Blue',
    'Months_on_book' : 5,
    'Total_Relationship_Count' : 3,
    'Months_Inactive_12_mon'   : 1,
    'Contacts_Count_12_mon'    : 2,
    'Credit_Limit'             : 34000,
    'Total_Revolving_Bal'      : 40000,
    'Avg_Open_To_Buy'          : 200,
    'Total_Amt_Chng_Q4_Q1'     : 34,
    'Total_Trans_Amt'          : 500,
    'Total_Trans_Ct'           : 3,
    'Total_Ct_Chng_Q4_Q1'      : 23,
    'Avg_Utilization_Ratio'    : .1
}])

In [23]:
# Show the one-row sample frame.
test_data_df

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,30,M,3,Graduate,Single,$40K - $60K,Blue,5,3,1,2,34000,40000,200,34,500,3,23,0.1


In [24]:
# Column names of the sample row, for comparison with trained_model.feature_names_in_.
test_data_df.columns

Index(['Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
       'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
      dtype='object')

In [25]:
# Predict the class label for the sample row.
trained_model.predict(test_data_df)

array(['Existing Customer'], dtype=object)

In [26]:
# Class labels known to the model; their order is the column order of predict_proba's output.
trained_model.classes_

array(['Attrited Customer', 'Existing Customer'], dtype=object)

In [27]:
# Per-class probabilities for the sample row: one column per entry of trained_model.classes_, in the same order.
trained_model.predict_proba(test_data_df)

array([[0.42, 0.58]])

### Convert this notebook to .py
Some students have had trouble loading the saved model file. They can instead run the exported .py file in their own environment to regenerate the model file, so that it is built with the same environment as their web services code.

In [2]:
# Export this notebook as a plain Python script (the .py file is written alongside the notebook).
!jupyter nbconvert --to python 120-bank_churners_classifier_model.ipynb 

[NbConvertApp] Converting notebook 120-bank_churners_classifier_model.ipynb to python
[NbConvertApp] Writing 3973 bytes to 120-bank_churners_classifier_model.py
