In [1]:
# Importing dependencies
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.svm import SVC
from collections import Counter

In [2]:
# Read in the data
df = pd.read_csv("../Resources/Churn_Modelling_2.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [3]:
# Dropping unessential columns
df = df.drop(["RowNumber", "CustomerId", "Surname"], axis=1)
df.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
# Split preprocessed data into features and target
X = df.copy()
X = X.drop("Exited", axis=1)
X = pd.get_dummies(X)

# Create our target
y = df["Exited"]

In [5]:
# Check the balance of our target values
y.value_counts()

0    7963
1    2037
Name: Exited, dtype: int64

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=1)
Counter(y_train)

Counter({0: 5983, 1: 1517})

In [7]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### Naive Random Oversampling

In [8]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 5983, 1: 5983})

In [9]:
model = SVC(kernel="linear")
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [12]:
y_pred = model.predict(X_test_scaled)

In [15]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.7131216006216006

In [16]:
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.74      0.69      0.81      0.71      0.51      1980
          1       0.41      0.69      0.74      0.51      0.71      0.51       520

avg / total       0.80      0.73      0.70      0.75      0.71      0.51      2500



In [18]:
# Displaying results
print("Confusion Matrix")
display(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


array([[1457,  523],
       [ 161,  359]], dtype=int64)

Accuracy Score : 0.7131216006216006
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.74      0.69      0.81      0.71      0.51      1980
          1       0.41      0.69      0.74      0.51      0.71      0.51       520

avg / total       0.80      0.73      0.70      0.75      0.71      0.51      2500



### SMOTE Oversampling

In [20]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1, sampling_strategy="auto").fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 5983, 1: 5983})

In [21]:
model = SVC(kernel="linear")
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [22]:
y_pred = model.predict(X_test_scaled)

In [23]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.7148407148407148

In [24]:
# Displaying results
print("Confusion Matrix")
display(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


array([[1460,  520],
       [ 160,  360]], dtype=int64)

Accuracy Score : 0.7148407148407148
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.74      0.69      0.81      0.71      0.51      1980
          1       0.41      0.69      0.74      0.51      0.71      0.51       520

avg / total       0.80      0.73      0.70      0.75      0.71      0.51      2500



### Undersampling

In [25]:
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 1517, 1: 1517})

In [26]:
model = SVC(kernel="linear")
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [27]:
y_pred = model.predict(X_test_scaled)

In [28]:
# Calculated the balanced accuracy score
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.6950660450660451

In [29]:
# Displaying results
print("Confusion Matrix")
display(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


array([[1435,  545],
       [ 174,  346]], dtype=int64)

Accuracy Score : 0.6950660450660451
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.89      0.72      0.67      0.80      0.69      0.49      1980
          1       0.39      0.67      0.72      0.49      0.69      0.48       520

avg / total       0.79      0.71      0.68      0.74      0.69      0.48      2500



### SMOTEEN

In [30]:
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X_train_scaled, y_train)
Counter(y_resampled)

Counter({0: 3985, 1: 5070})

In [31]:
model = SVC(kernel="linear")
model.fit(X_resampled, y_resampled)

SVC(kernel='linear')

In [32]:
y_pred = model.predict(X_test_scaled)

In [33]:
# Calculated the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.7104700854700854

In [34]:
# Displaying results
print("Confusion Matrix")
display(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report_imbalanced(y_test, y_pred))

Confusion Matrix


array([[1298,  682],
       [ 122,  398]], dtype=int64)

Accuracy Score : 0.6950660450660451
Classification Report
                   pre       rec       spe        f1       geo       iba       sup

          0       0.91      0.66      0.77      0.76      0.71      0.50      1980
          1       0.37      0.77      0.66      0.50      0.71      0.51       520

avg / total       0.80      0.68      0.74      0.71      0.71      0.50      2500

