In [44]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [45]:
churn_modeling_path = ('Churn_Modelling.csv')
churn_data = pd.read_csv(churn_modeling_path)
churn_data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [46]:
churn_data.shape

(10000, 14)

In [47]:
churn_data.describe()

Unnamed: 0,RowNumber,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,15690940.0,650.5288,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037
std,2886.89568,71936.19,96.653299,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769
min,1.0,15565700.0,350.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0
25%,2500.75,15628530.0,584.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0
50%,5000.5,15690740.0,652.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0
75%,7500.25,15753230.0,718.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0
max,10000.0,15815690.0,850.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0


In [48]:
churn_data.isnull().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [49]:
churn_data.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [50]:
data = churn_data.drop(['CustomerId', 'Surname'], axis=1)
data.head()

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [51]:
data['Geography'].unique()

array(['France', 'Spain', 'Germany'], dtype=object)

In [52]:
#Encoding on Geography and Gender
Geography_dummies=pd.get_dummies(data=data,columns=['Geography'])
Gender_dummies=Geography_dummies.replace(to_replace={'Gender': {'Female':1, 'Male':0}})

In [53]:
data_encoded = Gender_dummies
data_encoded.head()

Unnamed: 0,RowNumber,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,1,619,1,42,2,0.0,1,1,1,101348.88,1,1,0,0
1,2,608,1,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,3,502,1,42,8,159660.8,3,1,0,113931.57,1,1,0,0
3,4,699,1,39,1,0.0,2,0,0,93826.63,0,1,0,0
4,5,850,1,43,2,125510.82,1,1,1,79084.1,0,0,0,1


In [54]:
#Before Handling Imbalanced data
data_encoded['Exited'].value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [56]:
#To Split in training and testing
X = data_encoded.drop('Exited', axis=1)
Y = data_encoded['Exited']

In [57]:
#To handle Imbalanced Data
from imblearn.over_sampling import SMOTE

In [58]:
X_train, Y_train = SMOTE().fit_resample(X,Y)

In [59]:
Y_train.value_counts()

1    7963
0    7963
Name: Exited, dtype: int64

In [60]:
#Splitting
X_train1,X_test,Y_train1,Y_test=train_test_split(X_train,Y_train,test_size=0.2,random_state=42)

In [61]:
#Feature Scaling
sc = StandardScaler()

In [62]:
X_train1 = sc.fit_transform(X_train1)
X_test = sc.transform(X_test)

In [63]:
X_train1

array([[ 1.01375165, -0.01974846, -0.81462859, ...,  1.26596041,
        -0.57384565, -0.46150489],
       [-1.18666088,  1.09602249, -0.81462859, ...,  1.26596041,
        -0.57384565, -0.46150489],
       [-0.06818117,  0.52719808,  1.22755328, ..., -0.78991411,
         1.74262887, -0.46150489],
       ...,
       [-1.47795049,  0.19903016, -0.81462859, ...,  1.26596041,
        -0.57384565, -0.46150489],
       [ 1.21783529,  0.2865416 ,  1.22755328, ..., -0.78991411,
        -0.57384565, -0.46150489],
       [ 0.84151084,  0.52719808, -0.81462859, ..., -0.78991411,
         1.74262887, -0.46150489]])

In [64]:
lr = LogisticRegression()

In [65]:
lr.fit(X_train1, Y_train1)

In [66]:
y_pred = lr.predict(X_test)

In [67]:
print(y_pred)

[0 1 0 ... 1 1 1]


In [68]:
accuracy_score(Y_test, y_pred)

0.8141870684243565

In [69]:
from sklearn.metrics import precision_score,recall_score,f1_score
print("Precision Score", precision_score(Y_test, y_pred))
print("Recall Score", recall_score(Y_test, y_pred))
print("f1_Score", f1_score(Y_test, y_pred))

Precision Score 0.8357791754018169
Recall Score 0.7701223438506117
f1_Score 0.8016085790884718


In [70]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      1633
           1       0.84      0.77      0.80      1553

    accuracy                           0.81      3186
   macro avg       0.82      0.81      0.81      3186
weighted avg       0.82      0.81      0.81      3186



In [71]:
results=confusion_matrix(Y_test, y_pred)
print(results)

[[1398  235]
 [ 357 1196]]


In [72]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [73]:
rf.fit(X_train1, Y_train1)

In [74]:
y_pred_rf = rf.predict(X_test)

In [75]:
print(classification_report(Y_test,y_pred_rf))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      1633
           1       0.90      0.88      0.89      1553

    accuracy                           0.89      3186
   macro avg       0.89      0.89      0.89      3186
weighted avg       0.89      0.89      0.89      3186



In [76]:
accuracy_score(Y_test, y_pred_rf)

0.8929692404268675

In [77]:
#GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()

In [78]:
gb.fit(X_train1, Y_train1)

In [79]:
y_pred_gb = gb.predict(X_test)

In [80]:
print(classification_report(Y_test,y_pred_gb))

              precision    recall  f1-score   support

           0       0.86      0.89      0.87      1633
           1       0.88      0.85      0.86      1553

    accuracy                           0.87      3186
   macro avg       0.87      0.87      0.87      3186
weighted avg       0.87      0.87      0.87      3186



In [81]:
final_data=pd.DataFrame({'Models':['LR','RF','GBC'],
                        'Accuracy':[accuracy_score(Y_test,y_pred),
                              accuracy_score(Y_test,y_pred_rf),
                              accuracy_score(Y_test,y_pred_gb)]})

In [82]:
final_data

Unnamed: 0,Models,Accuracy
0,LR,0.814187
1,RF,0.892969
2,GBC,0.86629
