In [1]:
# Importing Dependencies
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the data
file_path = Path("..\Final_Project\Churn_Modelling (1).csv")
churn_df = pd.read_csv(file_path)
churn_df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [4]:
churn_df.dtypes

RowNumber            int64
CustomerId           int64
Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object

In [5]:
# Drop the non-beneficial columns
churn_df = churn_df.drop(["CustomerId", "Surname", "RowNumber"], axis= 1)
churn_df.head()

Unnamed: 0,RowNumber,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [6]:
# Determine the number of unique values in each column.
churn_df.nunique()

RowNumber          10000
CreditScore          460
Geography              3
Gender                 2
Age                   70
Tenure                11
Balance             6382
NumOfProducts          4
HasCrCard              2
IsActiveMember         2
EstimatedSalary     9999
Exited                 2
dtype: int64

In [7]:
# Determining if duplicated rows exist
churn_df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Length: 10000, dtype: bool

In [8]:
# Determining if null values rows exist
churn_df.isnull().sum()

RowNumber          0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [9]:
# Encoding categorical values
df = churn_df.copy()
df = pd.get_dummies(df, columns=["Geography", "Gender"])
df.head()

Unnamed: 0,RowNumber,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,1,619,42,2,0.0,1,1,1,101348.88,1,1,0,0,1,0
1,2,608,41,1,83807.86,1,0,1,112542.58,0,0,0,1,1,0
2,3,502,42,8,159660.8,3,1,0,113931.57,1,1,0,0,1,0
3,4,699,39,1,0.0,2,0,0,93826.63,0,1,0,0,1,0
4,5,850,43,2,125510.82,1,1,1,79084.1,0,0,0,1,1,0


In [10]:
# Split preprocessed data into features and target
y = df["Exited"]
X = df.drop(["Exited"], axis=1)

In [11]:
# Split the preprocessed data into a training and testing dataset
# Stratifying since there is a class imbalance.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
X_train.shape

(7500, 14)

In [12]:
y_train.value_counts()

0    5972
1    1528
Name: Exited, dtype: int64

In [14]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
from sklearn.svm import SVC

In [16]:
model = SVC(kernel='linear')

In [17]:
model.fit(X_train, y_train)

SVC(kernel='linear')

In [18]:
y_pred = model.predict(X_test)

In [21]:
results = pd.DataFrame({
    "Prediction": y_pred,
    "Actual": y_test
}).reset_index(drop=True)
results.head()

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


In [19]:
accuracy_score(y_test, y_pred)

0.7808

In [20]:
confusion_matrix(y_test, y_pred)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.97      0.88      1991
           1       0.28      0.05      0.08       509

    accuracy                           0.78      2500
   macro avg       0.54      0.51      0.48      2500
weighted avg       0.69      0.78      0.71      2500

