In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


LOAD THE DATA


In [2]:
data = pd.read_csv('Churn_Modelling.csv')
print(data.head())

   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

In [3]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB
None


In [4]:
print(data.describe())

         RowNumber    CustomerId   CreditScore           Age        Tenure  \
count  10000.00000  1.000000e+04  10000.000000  10000.000000  10000.000000   
mean    5000.50000  1.569094e+07    650.528800     38.921800      5.012800   
std     2886.89568  7.193619e+04     96.653299     10.487806      2.892174   
min        1.00000  1.556570e+07    350.000000     18.000000      0.000000   
25%     2500.75000  1.562853e+07    584.000000     32.000000      3.000000   
50%     5000.50000  1.569074e+07    652.000000     37.000000      5.000000   
75%     7500.25000  1.575323e+07    718.000000     44.000000      7.000000   
max    10000.00000  1.581569e+07    850.000000     92.000000     10.000000   

             Balance  NumOfProducts    HasCrCard  IsActiveMember  \
count   10000.000000   10000.000000  10000.00000    10000.000000   
mean    76485.889288       1.530200      0.70550        0.515100   
std     62397.405202       0.581654      0.45584        0.499797   
min         0.000000     

In [5]:
print(data.isnull().sum())

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64


data preprocess


In [9]:
data.dropna(inplace=True)
print(data.head())


   RowNumber  CustomerId   Surname  CreditScore Geography  Gender  Age  \
0          1    15634602  Hargrave          619    France  Female   42   
1          2    15647311      Hill          608     Spain  Female   41   
2          3    15619304      Onio          502    France  Female   42   
3          4    15701354      Boni          699    France  Female   39   
4          5    15737888  Mitchell          850     Spain  Female   43   

   Tenure    Balance  NumOfProducts  HasCrCard  IsActiveMember  \
0       2       0.00              1          1               1   
1       1   83807.86              1          0               1   
2       8  159660.80              3          1               0   
3       1       0.00              2          0               0   
4       2  125510.82              1          1               1   

   EstimatedSalary  Exited  
0        101348.88       1  
1        112542.58       0  
2        113931.57       1  
3         93826.63       0  
4         790

one-hot encoding droping one column to prdict other


In [11]:
df = pd.get_dummies(data, drop_first=True)
print(df.head())



   RowNumber  CustomerId  CreditScore  Age  Tenure    Balance  NumOfProducts  \
0          1    15634602          619   42       2       0.00              1   
1          2    15647311          608   41       1   83807.86              1   
2          3    15619304          502   42       8  159660.80              3   
3          4    15701354          699   39       1       0.00              2   
4          5    15737888          850   43       2  125510.82              1   

   HasCrCard  IsActiveMember  EstimatedSalary  ...  Surname_Zotova  \
0          1               1        101348.88  ...           False   
1          0               1        112542.58  ...           False   
2          1               0        113931.57  ...           False   
3          0               0         93826.63  ...           False   
4          1               1         79084.10  ...           False   

   Surname_Zox  Surname_Zubarev  Surname_Zubareva  Surname_Zuev  \
0        False            False

In [12]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numerical_features =[ 'Tenure', 'Balance', 'NumOfProducts', 'EstimatedSalary']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

print(df.head())


   RowNumber  CustomerId  CreditScore  Age    Tenure   Balance  NumOfProducts  \
0          1    15634602          619   42 -1.041760 -1.225848      -0.911583   
1          2    15647311          608   41 -1.387538  0.117350      -0.911583   
2          3    15619304          502   42  1.032908  1.333053       2.527057   
3          4    15701354          699   39 -1.387538 -1.225848       0.807737   
4          5    15737888          850   43 -1.041760  0.785728      -0.911583   

   HasCrCard  IsActiveMember  EstimatedSalary  ...  Surname_Zotova  \
0          1               1         0.021886  ...           False   
1          0               1         0.216534  ...           False   
2          1               0         0.240687  ...           False   
3          0               0        -0.108918  ...           False   
4          1               1        -0.365276  ...           False   

   Surname_Zox  Surname_Zubarev  Surname_Zubareva  Surname_Zuev  \
0        False           

split the data

In [13]:
from sklearn.model_selection import train_test_split
X = df.drop('Exited', axis=1)
y = df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(df.head())

   RowNumber  CustomerId  CreditScore  Age    Tenure   Balance  NumOfProducts  \
0          1    15634602          619   42 -1.041760 -1.225848      -0.911583   
1          2    15647311          608   41 -1.387538  0.117350      -0.911583   
2          3    15619304          502   42  1.032908  1.333053       2.527057   
3          4    15701354          699   39 -1.387538 -1.225848       0.807737   
4          5    15737888          850   43 -1.041760  0.785728      -0.911583   

   HasCrCard  IsActiveMember  EstimatedSalary  ...  Surname_Zotova  \
0          1               1         0.021886  ...           False   
1          0               1         0.216534  ...           False   
2          1               0         0.240687  ...           False   
3          0               0        -0.108918  ...           False   
4          1               1        -0.365276  ...           False   

   Surname_Zox  Surname_Zubarev  Surname_Zubareva  Surname_Zuev  \
0        False           

train a model


In [14]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)


evaluate the model

In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Precision: {precision_score(y_test, y_pred)}')
print(f'Recall: {recall_score(y_test, y_pred)}')
print(f'F1 Score: {f1_score(y_test, y_pred)}')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7875
Precision: 0.2948717948717949
Recall: 0.058524173027989825
F1 Score: 0.09766454352441614
[[1552   55]
 [ 370   23]]
              precision    recall  f1-score   support

           0       0.81      0.97      0.88      1607
           1       0.29      0.06      0.10       393

    accuracy                           0.79      2000
   macro avg       0.55      0.51      0.49      2000
weighted avg       0.71      0.79      0.73      2000



improved model


In [18]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred_rf)}')
print(f'Precision: {precision_score(y_test, y_pred_rf)}')
print(f'Recall: {recall_score(y_test, y_pred_rf)}')
print(f'F1 Score: {f1_score(y_test, y_pred_rf)}')
print(confusion_matrix(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))

Accuracy: 0.859
Precision: 0.84472049689441
Recall: 0.3460559796437659
F1 Score: 0.49097472924187724
[[1582   25]
 [ 257  136]]
              precision    recall  f1-score   support

           0       0.86      0.98      0.92      1607
           1       0.84      0.35      0.49       393

    accuracy                           0.86      2000
   macro avg       0.85      0.67      0.70      2000
weighted avg       0.86      0.86      0.83      2000

