In [1]:
import pandas as pd
from imblearn.combine import SMOTEENN
from path import Path
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the prepared CSV from the Resources folder and preview the first rows
file_path = Path("./Resources/bin_5_clean.csv")
df = pd.read_csv(filepath_or_buffer=file_path)
df.head(5)

Unnamed: 0.1,Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,bin_age,Geography_France,Geography_Germany,Geography_Spain,gender_0,gender_1
0,0,619,2,0.0,1,1,1,101348.88,1,6,1,0,0,1,0
1,1,608,1,83807.86,1,0,1,112542.58,0,6,0,0,1,1,0
2,2,502,8,159660.8,3,1,0,113931.57,1,6,1,0,0,1,0
3,3,699,1,0.0,2,0,0,93826.63,0,5,1,0,0,1,0
4,4,850,2,125510.82,1,1,1,79084.1,0,6,0,0,1,1,0


In [3]:
# Drop the 'Unnamed: 0' column (it appears to be a row index that was written
# into the CSV -- its previewed values mirror the DataFrame index).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,bin_age,Geography_France,Geography_Germany,Geography_Spain,gender_0,gender_1
0,619,2,0.00,1,1,1,101348.88,1,6,1,0,0,1,0
1,608,1,83807.86,1,0,1,112542.58,0,6,0,0,1,1,0
2,502,8,159660.80,3,1,0,113931.57,1,6,1,0,0,1,0
3,699,1,0.00,2,0,0,93826.63,0,5,1,0,0,1,0
4,850,2,125510.82,1,1,1,79084.10,0,6,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9584,771,5,0.00,2,1,0,96270.64,0,-1,1,0,0,0,1
9585,516,10,57369.61,1,1,1,101699.77,0,-1,1,0,0,0,1
9586,709,7,0.00,1,0,1,42085.58,1,-1,1,0,0,1,0
9587,772,3,75075.31,2,1,0,92888.52,1,-1,0,1,0,0,1


In [4]:
# Show the dtype of every remaining column
df.dtypes

CreditScore            int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
bin_age                int64
Geography_France       int64
Geography_Germany      int64
Geography_Spain        int64
gender_0               int64
gender_1               int64
dtype: object

In [5]:
# Count the distinct values in each column
df.nunique()

CreditScore           460
Tenure                 11
Balance              6122
NumOfProducts           4
HasCrCard               2
IsActiveMember          2
EstimatedSalary      9588
Exited                  2
bin_age                11
Geography_France        2
Geography_Germany       2
Geography_Spain         2
gender_0                2
gender_1                2
dtype: int64

In [6]:
# Feature matrix: every column except the "Exited" target.
# drop() returns a new DataFrame, so df itself is left untouched.
X = df.drop(columns=["Exited"])
X.head()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,bin_age,Geography_France,Geography_Germany,Geography_Spain,gender_0,gender_1
0,619,2,0.0,1,1,1,101348.88,6,1,0,0,1,0
1,608,1,83807.86,1,0,1,112542.58,6,0,0,1,1,0
2,502,8,159660.8,3,1,0,113931.57,6,1,0,0,1,0
3,699,1,0.0,2,0,0,93826.63,5,1,0,0,1,0
4,850,2,125510.82,1,1,1,79084.1,6,0,0,1,1,0


In [7]:
# Target: the "Exited" column as an (n_samples, 1) NumPy column vector
y = df[["Exited"]].to_numpy()
y[:5]

array([[1],
       [0],
       [1],
       [0],
       [0]])

In [8]:
# Hold out a test set; test_size=0.25 is the library default, spelled out here.
# random_state pins the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [9]:
# Sanity-check the split: print the shape of each piece, one per line
for _part in (X_train, X_test, y_train, y_test):
    print(_part.shape)

(7191, 13)
(2398, 13)
(7191, 1)
(2398, 1)


In [10]:
# Standardizer that centers each feature and scales it to unit variance
# (both options are the defaults, written out for clarity)
scaler = StandardScaler(with_mean=True, with_std=True)

In [11]:
# Fit the StandardScaler on the training split only, so the scaling statistics
# come from training rows alone. fit() returns the scaler itself, so X_scaler
# and scaler refer to the same fitted object.
X_scaler = scaler.fit(X=X_train)

In [12]:
# Apply the training-fitted scaler to both splits (train first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [13]:
# Baseline decision tree classifier.
# random_state is fixed because tree construction involves random choices
# between candidate splits; without a seed the fitted tree -- and therefore
# every metric reported below -- can change from one run to the next.
model = tree.DecisionTreeClassifier(random_state=78)

In [14]:
# Train the tree on the scaled training features and their labels
model = model.fit(X=X_train_scaled, y=y_train)

In [15]:
# Predict the class of every row in the scaled test split
predictions = model.predict(X=X_test_scaled)

In [16]:
# Accuracy of the predictions on the test split
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix, wrapped in a labelled DataFrame (rows = actual class,
# columns = predicted class) so it renders as a readable table
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [17]:
# Show the confusion matrix, accuracy and per-class report for the test split
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1576,324
Actual 1,306,192


Accuracy Score : 0.737281067556297
Classification Report
              precision    recall  f1-score   support

           0       0.84      0.83      0.83      1900
           1       0.37      0.39      0.38       498

    accuracy                           0.74      2398
   macro avg       0.60      0.61      0.61      2398
weighted avg       0.74      0.74      0.74      2398



In [18]:
# Resample the training data with SMOTEENN (SMOTE over-sampling followed by
# Edited Nearest Neighbours cleaning). SMOTEENN is imported in the notebook's
# top import cell rather than here, so all dependencies are declared in one
# place and are visible before any computation runs.
# Only the training split is resampled; the test split is left as-is.
sample = SMOTEENN(random_state=42)
X_resample, y_resample = sample.fit_resample(X_train_scaled, y_train)

print(y_resample)

[0 0 0 ... 1 1 1]


In [19]:
# Fresh decision tree classifier for the resampled training data.
# random_state is fixed because tree construction involves random choices
# between candidate splits; seeding it makes the metrics below reproducible
# and keeps the comparison against the unresampled run from being blurred by
# run-to-run noise.
model = tree.DecisionTreeClassifier(random_state=78)

In [20]:
# Train the tree on the SMOTEENN-resampled training data
model = model.fit(X=X_resample, y=y_resample)

In [21]:
# Predict the class of every row in the (unresampled) scaled test split
predictions = model.predict(X=X_test_scaled)

In [22]:
# Accuracy of the resampled-model predictions on the test split
acc_score = accuracy_score(y_test, predictions)

# Confusion matrix, wrapped in a labelled DataFrame (rows = actual class,
# columns = predicted class) so it renders as a readable table
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

In [23]:
# Show the confusion matrix, accuracy and per-class report for the test split
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1407,493
Actual 1,183,315


Accuracy Score : 0.7180984153461217
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.74      0.81      1900
           1       0.39      0.63      0.48       498

    accuracy                           0.72      2398
   macro avg       0.64      0.69      0.64      2398
weighted avg       0.78      0.72      0.74      2398

