# Importing Data

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Pre-processed bank-churn dataset, read straight from its raw GitHub URL.
DATA_URL = 'https://raw.githubusercontent.com/21accel/bank_churn_prediction/main/BankChruner-Processed-2.csv'
data = pd.read_csv(DATA_URL)
data

Unnamed: 0,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,45,1,3,1,2,3,1,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,0,49,2,5,3,1,1,1,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,0,51,1,3,3,2,4,1,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.000
3,0,40,2,4,1,0,1,1,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.760
4,0,40,1,3,0,2,3,1,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,0,50,1,2,3,1,2,1,40,3,2,3,4003.0,1851,2152.0,0.703,15476,117,0.857,0.462
10123,1,41,1,2,-1,3,2,1,25,4,2,3,4277.0,2186,2091.0,0.804,8764,69,0.683,0.511
10124,1,44,2,1,1,2,1,1,36,5,3,4,5409.0,0,5409.0,0.819,10291,60,0.818,0.000
10125,1,30,1,2,3,0,2,1,36,4,3,3,5281.0,0,5281.0,0.535,8395,62,0.722,0.000


# Plain Regression without Hyperparameter Tuning or SMOTE

In [None]:
# Separate the feature matrix X from the target y (the 'Attrition_Flag' column).
# y stays a single-column DataFrame because later cells index it by column name.
target_col = 'Attrition_Flag'
X = data.drop(columns=target_col)
y = data[[target_col]]
print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')

Shape of X: (10127, 19)
Shape of y: (10127, 1)


In [None]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so no test-set statistics influence the scaling.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the scaled training data. The target is a 0/1
# flag, so the model emits a continuous score that later cells turn into labels.
regressor = LinearRegression().fit(X_train, y_train)
regressor

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
# Fitted intercept of the linear model.
print(regressor.intercept_)

[0.16183187]


In [None]:
# Fitted coefficients, one per feature column of X.
print(regressor.coef_)

[[ 0.00082011  0.04296277  0.01324506  0.00261056 -0.01660493  0.0193898
   0.01159752 -0.00520999 -0.06788618  0.04393159  0.04721462 -0.01190361
  -0.07212029 -0.00536204 -0.01139835  0.11853535 -0.23316906 -0.07151487
  -0.00719212]]


In [None]:
# Pair every fitted coefficient with the name of the feature it belongs to.
coeff_df = pd.DataFrame(
    {'Coefficient': regressor.coef_.ravel()},
    index=X.columns,
)
coeff_df

Unnamed: 0,Coefficient
Customer_Age,0.00082
Gender,0.042963
Dependent_count,0.013245
Education_Level,0.002611
Marital_Status,-0.016605
Income_Category,0.01939
Card_Category,0.011598
Months_on_book,-0.00521
Total_Relationship_Count,-0.067886
Months_Inactive_12_mon,0.043932


In [None]:
# Continuous scores predicted by the linear model for the held-out features.
y_pred = regressor.predict(X_test)

In [None]:
# Turn the continuous regression scores into hard 0/1 class labels.
# Plain rounding is unsafe here: a linear model is unbounded, so a score below
# -0.5 or at/above 1.5 would round to a label such as -1 or 2 and add a bogus
# class to the confusion matrix. Thresholding at 0.5 always yields a valid
# label and matches np.round for every score inside [-0.5, 1.5).
# np.ravel flattens the (n, 1) array that predict() returns for a 2-D target;
# the result is a plain list of Python ints, and re-running the cell is a no-op.
y_pred = (np.ravel(y_pred) > 0.5).astype(int).tolist()

In [None]:
# Sanity check: number of predicted labels.
len(y_pred)

2026

In [None]:
# Display the true labels of the held-out rows.
y_test

Unnamed: 0,Attrition_Flag
3536,0
8876,0
4628,0
2123,0
2774,0
...,...
10043,0
1525,0
1930,0
7304,0


In [None]:
# Convert the single-column label DataFrame into a NumPy array.
test = y_test.to_numpy()

In [None]:
# Inspect the array shape before flattening it.
test.shape

(2026, 1)

In [None]:
# Collapse the array to 1-D (column-major order) so it can be used as a single DataFrame column.
test = test.flatten('F')

In [None]:
# Side-by-side table of true labels and predicted labels for the test rows.
df = pd.DataFrame(dict(Actual=test, Predicted=y_pred))
df

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
2021,0,0
2022,0,0
2023,0,0
2024,0,0


In [None]:
# Keep only the rows whose true label is 1 to see how they were predicted.
df.query('Actual == 1')

Unnamed: 0,Actual,Predicted
6,1,0
19,1,1
25,1,1
28,1,0
29,1,1
...,...,...
1974,1,0
1986,1,0
2001,1,1
2005,1,0


In [None]:
# Summary statistics of the predicted labels; min/max expose any label outside {0, 1}.
df['Predicted'].describe()

count    2026.000000
mean        0.086871
std         0.281715
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Predicted, dtype: float64

In [None]:
from sklearn import metrics

# Error measures between the true labels and the predicted labels.
# The MSE is evaluated once and reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 0.09970384995064166
Mean Squared Error: 0.09970384995064166
Root Mean Squared Error: 0.3157591644760951


In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix and accuracy first, then the weighted-average
# precision, recall and F1 in that order.
print(confusion_matrix(y_test, y_pred), ": is the confusion matrix")
print(accuracy_score(y_test, y_pred), ": is the accuracy score")
for metric_name, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    print(metric_fn(y_test, y_pred, average='weighted'), ": is the {} score".format(metric_name))

[[1679   31]
 [ 171  145]] : is the confusion matrix
0.9002961500493584 : is the accuracy score
0.894512067932601 : is the precision score
0.9002961500493584 : is the recall score
0.8880711119022634 : is the f1 score


# Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for LinearRegression.
# - `normalize` is left out: the option was deprecated in scikit-learn 1.0 and
#   removed in 1.2, so listing it makes the search fail on current versions.
#   The features were already standardised with StandardScaler earlier on.
# - `copy_X` is left out: it only controls whether the input array is copied
#   and has no effect on the fitted model, so searching over it just doubles
#   the number of fits.
params = {
    'fit_intercept': [True, False],
}
# 10-fold cross-validation over the grid, using all available cores.
regModel_grid = GridSearchCV(estimator=LinearRegression(), param_grid=params, verbose=1, cv=10, n_jobs=-1)
regModel_grid.fit(X_train, y_train)
print(regModel_grid.best_estimator_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    1.8s finished


In [None]:
# Parameter combination that achieved the best mean cross-validated score.
regModel_grid.best_params_

{'copy_X': True, 'fit_intercept': True, 'normalize': False}

In [None]:
# Mean cross-validated score of the best combination (no `scoring` was passed, so the estimator's own score method is used).
regModel_grid.best_score_

0.37307541139016936

In [None]:
# Continuous scores for the held-out features from the grid search's best estimator.
y_pred = regModel_grid.predict(X_test)

In [None]:
# Turn the continuous regression scores into hard 0/1 class labels.
# Plain rounding is unsafe here: a linear model is unbounded, so a score below
# -0.5 or at/above 1.5 would round to a label such as -1 or 2 and add a bogus
# class to the confusion matrix. Thresholding at 0.5 always yields a valid
# label and matches np.round for every score inside [-0.5, 1.5).
# np.ravel flattens the (n, 1) array that predict() returns for a 2-D target;
# the result is a plain list of Python ints, and re-running the cell is a no-op.
y_pred = (np.ravel(y_pred) > 0.5).astype(int).tolist()

In [None]:
# Sanity check: number of predicted labels.
len(y_pred)

2026

In [None]:
# Display the true labels of the held-out rows.
y_test

Unnamed: 0,Attrition_Flag
3536,0
8876,0
4628,0
2123,0
2774,0
...,...
10043,0
1525,0
1930,0
7304,0


In [None]:
# Convert the single-column label DataFrame into a NumPy array.
test = y_test.to_numpy()

In [None]:
# Inspect the array shape before flattening it.
test.shape

(2026, 1)

In [None]:
# Collapse the array to 1-D (column-major order) so it can be used as a single DataFrame column.
test = test.flatten('F')

In [None]:
# Side-by-side table of true labels and predicted labels for the test rows.
df = pd.DataFrame(dict(Actual=test, Predicted=y_pred))
df

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
2021,0,0
2022,0,0
2023,0,0
2024,0,0


In [None]:
# Keep only the rows whose true label is 1 to see how they were predicted.
df.query('Actual == 1')

Unnamed: 0,Actual,Predicted
6,1,0
19,1,1
25,1,1
28,1,0
29,1,1
...,...,...
1974,1,0
1986,1,0
2001,1,1
2005,1,0


In [None]:
# Summary statistics of the predicted labels; min/max expose any label outside {0, 1}.
df['Predicted'].describe()

count    2026.000000
mean        0.086871
std         0.281715
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: Predicted, dtype: float64

In [None]:
from sklearn import metrics

# Error measures between the true labels and the predicted labels.
# The MSE is evaluated once and reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 0.09970384995064166
Mean Squared Error: 0.09970384995064166
Root Mean Squared Error: 0.3157591644760951


In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix and accuracy first, then the weighted-average
# precision, recall and F1 in that order.
print(confusion_matrix(y_test, y_pred), ": is the confusion matrix")
print(accuracy_score(y_test, y_pred), ": is the accuracy score")
for metric_name, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    print(metric_fn(y_test, y_pred, average='weighted'), ": is the {} score".format(metric_name))

[[1679   31]
 [ 171  145]] : is the confusion matrix
0.9002961500493584 : is the accuracy score
0.894512067932601 : is the precision score
0.9002961500493584 : is the recall score
0.8880711119022634 : is the f1 score


# Using SMOTE and Hyperparameter Tuning

In [None]:
from imblearn.over_sampling import SMOTE

# Class balance of the training labels before any resampling.
train_labels = y_train['Attrition_Flag'].to_numpy()
print("Before OverSampling, counts of label '1' in data train: {}".format((train_labels == 1).sum()))
print("Before OverSampling, counts of label '0' in data train: {} \n".format((train_labels == 0).sum()))

# Oversample the minority class on the TRAINING split only; the test split is
# left untouched so the evaluation reflects the real class balance.
# - fit_resample is the supported method name; fit_sample was a deprecated
#   alias that is no longer available in current imbalanced-learn releases.
# - The target is passed as a 1-D array, which avoids the column-vector
#   (column_or_1d) warning a single-column DataFrame triggers.
sm = SMOTE(random_state=1)
X_train_res, y_train_res = sm.fit_resample(X_train, train_labels)

print('After OverSampling, the shape of train_X: {}'.format(X_train_res.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

# Vectorised counts instead of the Python built-in sum over a boolean array.
print("After OverSampling, counts of label '1': {}".format((y_train_res == 1).sum()))
print("After OverSampling, counts of label '0': {}".format((y_train_res == 0).sum()))

  y = column_or_1d(y, warn=True)


Before OverSampling, counts of label '1' in data train: 1311
Before OverSampling, counts of label '0' in data train: 6790 

After OverSampling, the shape of train_X: (13580, 19)
After OverSampling, the shape of train_y: (13580,) 

After OverSampling, counts of label '1': 6790
After OverSampling, counts of label '0': 6790


In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for LinearRegression.
# - `normalize` is left out: the option was deprecated in scikit-learn 1.0 and
#   removed in 1.2, so listing it makes the search fail on current versions.
#   The features were already standardised with StandardScaler earlier on.
# - `copy_X` is left out: it only controls whether the input array is copied
#   and has no effect on the fitted model, so searching over it just doubles
#   the number of fits.
params = {
    'fit_intercept': [True, False],
}
# NOTE(review): the folds are cut from data that was oversampled beforehand, so
# synthetic points derived from a training row can land in a validation fold
# and the CV score may not reflect real performance. Consider resampling
# inside each fold (e.g. an imblearn Pipeline) -- TODO confirm and rework.
regModel_grid = GridSearchCV(estimator=LinearRegression(), param_grid=params, verbose=1, cv=10, n_jobs=-1)
regModel_grid.fit(X_train_res, y_train_res)
print(regModel_grid.best_estimator_)

Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)


[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:    0.7s finished


In [None]:
# Parameter combination that achieved the best mean cross-validated score on the resampled data.
regModel_grid.best_params_

{'copy_X': True, 'fit_intercept': True, 'normalize': True}

In [None]:
# Mean cross-validated score of the best combination (no `scoring` was passed, so the estimator's own score method is used).
regModel_grid.best_score_

0.052609250053311664

In [None]:
# Continuous scores for the original (non-resampled) test features from the best estimator.
y_pred = regModel_grid.predict(X_test)

In [None]:
# Turn the continuous regression scores into hard 0/1 class labels.
# Plain rounding is unsafe here: a linear model is unbounded, so a score below
# -0.5 or at/above 1.5 rounds to a label such as -1 or 2, which shows up as an
# extra class in the confusion matrix. Thresholding at 0.5 always yields a
# valid label and matches round() for every score inside [-0.5, 1.5).
# The result is a plain list of Python ints, and re-running the cell is a no-op.
y_pred = (np.ravel(y_pred) > 0.5).astype(int).tolist()

In [None]:
# Sanity check: number of predicted labels.
len(y_pred)

2026

In [None]:
# Display the true labels of the held-out rows.
y_test

Unnamed: 0,Attrition_Flag
3536,0
8876,0
4628,0
2123,0
2774,0
...,...
10043,0
1525,0
1930,0
7304,0


In [None]:
# Convert the single-column label DataFrame into a NumPy array.
test = y_test.to_numpy()

In [None]:
# Inspect the array shape before flattening it.
test.shape

(2026, 1)

In [None]:
# Collapse the array to 1-D (column-major order) so it can be used as a single DataFrame column.
test = test.flatten('F')

In [None]:
# Side-by-side table of true labels and predicted labels for the test rows.
df = pd.DataFrame(dict(Actual=test, Predicted=y_pred))
df

Unnamed: 0,Actual,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
2021,0,0
2022,0,0
2023,0,1
2024,0,0


In [None]:
# Keep only the rows whose true label is 1 to see how they were predicted.
df.query('Actual == 1')

Unnamed: 0,Actual,Predicted
6,1,1
19,1,1
25,1,1
28,1,1
29,1,1
...,...,...
1974,1,1
1986,1,0
2001,1,1
2005,1,1


In [None]:
# Summary statistics of the predicted labels; min/max expose any label outside {0, 1}.
df['Predicted'].describe()

count    2026.000000
mean        0.273445
std         0.446944
min        -1.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.000000
Name: Predicted, dtype: float64

In [None]:
from sklearn import metrics

# Error measures between the true labels and the predicted labels.
# The MSE is evaluated once and reused for the RMSE line.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 0.16584402764067127
Mean Squared Error: 0.16584402764067127
Root Mean Squared Error: 0.4072395212165333


In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix and accuracy first, then the weighted-average
# precision, recall and F1 in that order.
print(confusion_matrix(y_test, y_pred), ": is the confusion matrix")
print(accuracy_score(y_test, y_pred), ": is the accuracy score")
for metric_name, metric_fn in (
    ('precision', precision_score),
    ('recall', recall_score),
    ('f1', f1_score),
):
    print(metric_fn(y_test, y_pred, average='weighted'), ": is the {} score".format(metric_name))

[[   0    0    0]
 [   1 1422  287]
 [   0   48  268]] : is the confusion matrix
0.8341559723593287 : is the accuracy score
0.8917839369679732 : is the precision score
0.8341559723593287 : is the recall score
0.8508303520130895 : is the f1 score


  _warn_prf(average, modifier, msg_start, len(result))
