# The goal of this project is to use several machine learning to predict credit card customer attrition from customer information. We use logistic regression, a 2-layer neural network, and a random forest model. 

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Open [Bank Churners dataset ](https://www.kaggle.com/datasets/thedevastator/predicting-credit-card-customer-attrition-with-m?select=BankChurners.csv) , from Kaggle

In [2]:
df = pd.read_csv('BankChurners.csv')

#Shuffle data
df = df.sample(frac = 1)

#Convert output values to numbers
ys = np.array((df.loc[:, 'Attrition_Flag'] == 'Attrited Customer'))

In [3]:
#Columns affecting attrition status
cols = ['Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
for col in df.columns:
    if col not in cols:
        df.drop(col, axis=1, inplace=True)
print(df.columns)

Index(['Customer_Age', 'Gender', 'Dependent_count', 'Education_Level',
       'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio'],
      dtype='object')


In [4]:
#Use one hot encoding to convert categorical variables into numerical variables

df = pd.get_dummies(df.loc[:, cols])


# The data set consists of rows, credit card holders, along with attritted status and various facts about each customer such as card category, martial status and education level.

In [5]:
df

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,...,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
5076,62,1,51,6,2,3,5711.0,0,5711.0,0.583,...,0,0,0,1,0,0,1,0,0,0
4338,64,1,56,5,2,4,5152.0,948,4204.0,0.953,...,0,1,0,0,0,0,1,0,0,0
9199,49,4,45,1,3,1,5975.0,2045,3930.0,0.847,...,0,0,0,0,1,0,1,0,0,0
7265,46,3,29,5,1,1,2228.0,1449,779.0,0.610,...,0,0,0,0,1,0,1,0,0,0
8370,47,3,37,2,2,2,2154.0,1628,526.0,0.665,...,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8990,38,2,32,2,2,6,7385.0,1817,5568.0,1.029,...,0,0,0,0,1,0,1,0,0,0
8588,49,2,38,3,6,3,16494.0,1158,15336.0,0.692,...,1,0,0,0,0,0,1,0,0,0
827,46,4,36,4,3,2,17023.0,1084,15939.0,0.928,...,0,0,0,0,0,1,1,0,0,0
1944,45,3,37,6,2,2,10651.0,672,9979.0,0.767,...,0,0,0,1,0,0,1,0,0,0


# 16% of the data consists of Attrited Customers. Hence, we would like a model that can beat the majority classifier

In [6]:
sum(ys)/len(ys)

0.1606596227905599

In [7]:
#Split into training and test data

# From the correlation matrix, we see that several feautres are correlated. We use PCA for dimensionality reduction to improve training

In [8]:
df.corr().style.background_gradient(cmap='coolwarm')

Unnamed: 0,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Gender_F,Gender_M,Education_Level_College,Education_Level_Doctorate,Education_Level_Graduate,Education_Level_High School,Education_Level_Post-Graduate,Education_Level_Uneducated,Education_Level_Unknown,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Unknown,Income_Category_$120K +,Income_Category_$40K - $60K,Income_Category_$60K - $80K,Income_Category_$80K - $120K,Income_Category_Less than $40K,Income_Category_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
Customer_Age,1.0,-0.122254,0.788912,-0.010931,0.054361,-0.018452,0.002476,0.01478,0.001151,-0.062042,-0.046446,-0.067097,-0.012143,0.007114,0.017312,-0.017312,-0.014788,0.025199,-0.000203,0.001199,-0.022081,0.005057,0.005377,-0.042614,0.047364,-0.011248,-0.026694,0.044332,-0.013804,-0.017869,0.005381,-0.002573,-0.002264,0.021409,-0.011901,0.006515,-0.019425
Dependent_count,-0.122254,1.0,-0.103062,-0.039076,-0.010768,-0.040505,0.068065,-0.002688,0.068291,-0.035439,0.025046,0.049912,0.011087,-0.037135,-0.004563,0.004563,0.003369,-0.003368,0.000671,-0.013127,0.009459,0.00219,0.004922,0.006697,0.014385,-0.040707,0.041743,0.024831,-0.013493,0.028975,0.047611,-0.046785,-0.019205,-0.027671,0.027035,0.003555,0.017322
Months_on_book,0.788912,-0.103062,1.0,-0.009203,0.074164,-0.010774,0.007507,0.008623,0.006732,-0.048959,-0.038591,-0.049819,-0.014072,-0.007541,0.006728,-0.006728,-0.010281,0.024114,0.003531,0.002637,-0.016703,0.001099,-0.00361,-0.027678,0.033194,-0.005065,-0.026145,0.033532,-0.003204,-0.016635,0.00772,-0.0043,-0.007688,0.014934,-0.005426,0.001791,-0.014357
Total_Relationship_Count,-0.010931,-0.039076,-0.009203,1.0,-0.003675,0.055203,-0.071386,0.013726,-0.072601,0.050119,-0.347229,-0.241891,0.040831,0.067663,-0.003157,0.003157,-0.013582,-0.009077,0.005397,-0.001707,0.01205,0.008202,-0.003969,0.009276,0.017001,-0.017329,-0.009376,-0.00609,-0.008747,0.005475,0.001185,0.005109,0.00049,0.085789,-0.055678,-0.043289,-0.060566
Months_Inactive_12_mon,0.054361,-0.010768,0.074164,-0.003675,1.0,0.029493,-0.020394,-0.04221,-0.016605,-0.032247,-0.036982,-0.042787,-0.038989,-0.007503,0.011163,-0.011163,0.004038,0.002432,0.005885,-0.005575,-0.00624,0.010127,-0.012378,0.001796,-0.007065,0.008973,-0.005053,-0.002661,-0.021437,-0.004616,-0.00591,0.018848,0.011446,0.016565,-0.003283,-0.004013,-0.016034
Contacts_Count_12_mon,-0.018452,-0.040505,-0.010774,0.055203,0.029493,1.0,0.020817,-0.053913,0.025646,-0.024445,-0.112774,-0.152213,-0.094997,-0.055471,-0.039987,0.039987,-0.008996,-0.001016,0.00266,-0.003927,-0.006878,0.012596,0.000843,-0.008389,0.001598,0.007997,-0.009562,0.020744,0.002336,0.000167,0.011473,-0.023255,0.002195,0.000929,-0.000685,0.001797,-0.001059
Credit_Limit,0.002476,0.068065,0.007507,-0.071386,-0.020394,0.020817,1.0,0.042493,0.995981,0.012813,0.17173,0.075927,-0.00202,-0.482965,-0.420806,0.420806,0.001929,-0.005195,-0.004844,-0.001432,0.005879,0.012213,-0.006478,0.022578,-0.056713,0.032309,0.025291,0.339211,-0.161605,0.093808,0.333828,-0.395233,0.034186,-0.515644,0.234331,0.105976,0.44103
Total_Revolving_Bal,0.01478,-0.002688,0.008623,0.013726,-0.04221,-0.053913,0.042493,1.0,-0.047167,0.058174,0.06437,0.05606,0.089861,0.624022,-0.029658,0.029658,-0.011058,-0.018208,-0.000356,0.019276,0.007068,-0.004446,-0.001219,-0.002368,0.039124,-0.037794,-0.001756,0.020264,0.00522,-0.003847,0.022776,-0.015974,-0.020578,-0.022681,0.023974,0.005739,0.012799
Avg_Open_To_Buy,0.001151,0.068291,0.006732,-0.072601,-0.016605,0.025646,0.995981,-0.047167,1.0,0.007595,0.165923,0.070885,-0.010076,-0.538808,-0.418059,0.418059,0.00292,-0.003562,-0.004811,-0.00316,0.005244,0.012609,-0.006368,0.022786,-0.060209,0.035691,0.025443,0.337323,-0.162039,0.094133,0.331716,-0.393718,0.036024,-0.513502,0.232132,0.105439,0.43979
Total_Amt_Chng_Q4_Q1,-0.062042,-0.035439,-0.048959,0.050119,-0.032247,-0.024445,0.012813,0.058174,0.007595,1.0,0.039678,0.005469,0.384189,0.035235,-0.026712,0.026712,0.006848,-0.018612,-0.007694,0.005834,0.008208,0.000956,0.00243,-0.005613,0.052388,-0.043323,-0.013484,-0.009492,-0.000867,0.022399,0.012905,-0.025276,0.007951,-0.00571,0.006219,0.004091,0.002621


In [9]:
#Normalize the dataset so that each column has mean 0 and variance 1

df_normalized = df
for col in df.columns:
    
    df_normalized[col] = (df[col]- df[col].sum()/len(df)).astype('float64')
df_normalized = df_normalized/df_normalized.std()

In [10]:
#Use PCA to deal with correlated features and to reduce dimension
pca = PCA(n_components= 'mle')

data = pca.fit_transform(df_normalized)


In [11]:
data.shape

(10127, 31)

In [12]:
df.shape

(10127, 37)

In [13]:
#Split into training and test data

data_train = data[0: int(len(data)*.8)]
y_train = ys[0: int(len(data)*.8)]

data_test = data[int(len(data)*.8): len(data)]
y_test = ys[int(len(data)*.8): len(ys)]

# Model 1: Logistic regression

In [14]:
model1 = tf.keras.Sequential([keras.layers.Dense(1, activation = 'sigmoid')])

model1.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['accuracy'])

model1.fit(data_train, y_train, epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x26a2c4646d0>

In [15]:
test_predict = model1.predict(data_test).flatten()>.5
test_accuracy = accuracy_score(test_predict,y_test)

print('Model 1 has an accuracy of '+str(test_accuracy)+' on the test data')

Model 1 has an accuracy of 0.9042448173741362 on the test data


# Model 2: 2-layer neural network

In [16]:
model2 = tf.keras.models.Sequential([tf.keras.layers.Dense(6, activation = 'relu'),
                              tf.keras.layers.Dense(1, activation = 'sigmoid')])

model2.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['accuracy'])

history = model2.fit(data_train, y_train, epochs=300)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [17]:
test_predict = model2.predict(data_test).flatten()>.5
test_accuracy = accuracy_score(test_predict,y_test)

print('Model 2 has an accuracy of '+str(test_accuracy)+' on the test data')

Model 2 has an accuracy of 0.9215202369200395 on the test data


# Model 3: Random forest

In [18]:
#Split into training and test data

train_x = np.array(df[0:int(.8*len(df))])

test_x = np.array(df[int(.8*len(df)): len(df)])


In [19]:
model3 = RandomForestClassifier()

In [20]:
model3.fit(train_x,y_train)

In [21]:
test_predict = model3.predict(test_x)
test_accuracy = accuracy_score(test_predict,y_test)

print('Model 3 has an accuracy of '+str(test_accuracy)+' on the test data')


Model 3 has an accuracy of 0.9565646594274433 on the test data


# Thus, we see that the random forest model performs the best