In [15]:
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [16]:
file_path = Path("./Resources/bin_10_clean.csv")
df = pd.read_csv(file_path)
df.head()

Unnamed: 0.1,Unnamed: 0,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,bin_age,gender_0,gender_1
0,0,619,France,42,2,0.0,1,1,1,101348.88,1,3,1,0
1,1,608,Spain,41,1,83807.86,1,0,1,112542.58,0,3,1,0
2,2,502,France,42,8,159660.8,3,1,0,113931.57,1,3,1,0
3,3,699,France,39,1,0.0,2,0,0,93826.63,0,2,1,0
4,4,850,Spain,43,2,125510.82,1,1,1,79084.1,0,3,1,0


In [17]:
# Drop unnecessary columns 
df = df.drop(['Unnamed: 0', 'Age'], axis=1)
df

Unnamed: 0,CreditScore,Geography,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,bin_age,gender_0,gender_1
0,619,France,2,0.00,1,1,1,101348.88,1,3,1,0
1,608,Spain,1,83807.86,1,0,1,112542.58,0,3,1,0
2,502,France,8,159660.80,3,1,0,113931.57,1,3,1,0
3,699,France,1,0.00,2,0,0,93826.63,0,2,1,0
4,850,Spain,2,125510.82,1,1,1,79084.10,0,3,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9584,771,France,5,0.00,2,1,0,96270.64,0,2,0,1
9585,516,France,10,57369.61,1,1,1,101699.77,0,2,0,1
9586,709,France,7,0.00,1,0,1,42085.58,1,2,1,0
9587,772,Germany,3,75075.31,2,1,0,92888.52,1,3,0,1


In [18]:
df.dtypes

CreditScore          int64
Geography           object
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
bin_age              int64
gender_0             int64
gender_1             int64
dtype: object

In [19]:
# Binary encoding
encoded = pd.get_dummies(df, columns=["Geography"])
encoded.head()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,bin_age,gender_0,gender_1,Geography_France,Geography_Germany,Geography_Spain
0,619,2,0.0,1,1,1,101348.88,1,3,1,0,1,0,0
1,608,1,83807.86,1,0,1,112542.58,0,3,1,0,0,0,1
2,502,8,159660.8,3,1,0,113931.57,1,3,1,0,1,0,0
3,699,1,0.0,2,0,0,93826.63,0,2,1,0,1,0,0
4,850,2,125510.82,1,1,1,79084.1,0,3,1,0,0,0,1


In [20]:
df.nunique()

CreditScore         460
Geography             3
Tenure               11
Balance            6122
NumOfProducts         4
HasCrCard             2
IsActiveMember        2
EstimatedSalary    9588
Exited                2
bin_age               6
gender_0              2
gender_1              2
dtype: int64

In [21]:
# Define features set
X = encoded.copy()
X = X.drop("Exited", axis=1)
X.head()

Unnamed: 0,CreditScore,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,bin_age,gender_0,gender_1,Geography_France,Geography_Germany,Geography_Spain
0,619,2,0.0,1,1,1,101348.88,3,1,0,1,0,0
1,608,1,83807.86,1,0,1,112542.58,3,1,0,0,0,1
2,502,8,159660.8,3,1,0,113931.57,3,1,0,1,0,0
3,699,1,0.0,2,0,0,93826.63,2,1,0,1,0,0
4,850,2,125510.82,1,1,1,79084.1,3,1,0,0,0,1


In [22]:
# Define target vector
y = encoded["Exited"].values.reshape(-1, 1)
y[:5]

array([[1],
       [0],
       [1],
       [0],
       [0]])

In [23]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [24]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7191, 13)
(2398, 13)
(7191, 1)
(2398, 1)


In [25]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [26]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [27]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [28]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [29]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [30]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [31]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [32]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1631,269
Actual 1,274,224


Accuracy Score : 0.7735613010842368
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.86      0.86      1900
           1       0.45      0.45      0.45       498

    accuracy                           0.77      2398
   macro avg       0.66      0.65      0.65      2398
weighted avg       0.77      0.77      0.77      2398



In [33]:
# Resample the training data with SMOTEENN
from imblearn.combine import SMOTEENN

sample = SMOTEENN(random_state=42)
X_resample, y_resample = sample.fit_resample(X_train_scaled, y_train)

print(y_resample)

[0 0 0 ... 1 1 1]


In [34]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [35]:
# Fitting the model
model = model.fit(X_resample, y_resample)

In [36]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)

In [37]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [38]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1511,389
Actual 1,169,329


Accuracy Score : 0.7673060884070059
Classification Report
              precision    recall  f1-score   support

           0       0.90      0.80      0.84      1900
           1       0.46      0.66      0.54       498

    accuracy                           0.77      2398
   macro avg       0.68      0.73      0.69      2398
weighted avg       0.81      0.77      0.78      2398

