# `2.3 b) Scale sensitive models`

1. Numerical Scaling
1. (SMOTE) 'Synthetic Minority Over-sampling Technique' class balancing 
1. Modeling
    - a) Logistic Regression
    - b) (SVM) Support Vector Machine
    - c) Neural Networks

## 2.3.0 Libraries and constants

In [1]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Label column and the seed reused by the resampler and the estimators below
TARGET_COLUMN = 'T1 - Is good credit'
RANDOM_STATE = 42

# CSV locations of the train / test splits
TRAIN_SET_PATH = 'data/processed-data/2-fayaad-2-train_processed.csv'
TEST_SET_PATH = 'data/processed-data/2-fayaad-2-test_processed.csv'

# Load both splits into data frames
TRAIN_SET_DF = pd.read_csv(TRAIN_SET_PATH)
TEST_SET_DF = pd.read_csv(TEST_SET_PATH)

# For each split: Y_* is the label column, X_* is every remaining column
Y_TRAIN = TRAIN_SET_DF[TARGET_COLUMN]
X_TRAIN = TRAIN_SET_DF.drop(columns=[TARGET_COLUMN])

Y_TEST = TEST_SET_DF[TARGET_COLUMN]
X_TEST = TEST_SET_DF.drop(columns=[TARGET_COLUMN])

## 2.3.1 Numerical Scaling

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
# The scaler is fitted on the training features only; `fit` returns the
# fitted scaler, so construction and fitting are chained.
scaler = StandardScaler().fit(X_TRAIN)

# Both splits go through the same fitted scaler, so the test split is
# transformed with the parameters fitted on the training split.
# Results are wrapped back into data frames to keep the column names.
X_TRAIN_SCALED = pd.DataFrame(
    data=scaler.transform(X_TRAIN),
    columns=X_TRAIN.columns,
)
X_TEST_SCALED = pd.DataFrame(
    data=scaler.transform(X_TEST),
    columns=X_TEST.columns,
)

## 2.3.2 Class balancing

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
# SMOTE resampler, seeded with the notebook-wide RANDOM_STATE
smote = SMOTE(random_state=RANDOM_STATE)

# Only the scaled training split is passed to SMOTE; the test split is
# never resampled.
X_TRAIN_RESAMPLED, Y_TRAIN_RESAMPLED = smote.fit_resample(
    X_TRAIN_SCALED,
    Y_TRAIN,
)

## 2.3.3 Modeling

In [7]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [None]:
def fit_predict_metrics(model, smote: bool, X_train, y_train, X_test, y_test):
    """Fit `model`, predict on the test split and print a classification report.

    Parameters
    ----------
    model
        Object exposing `fit(X, y)` and `predict(X)`. It is fitted in place.
    smote : bool
        Only echoed in the printed banner so the two runs can be told apart;
        it does not change how the model is trained.
    X_train, y_train
        Features and labels passed to `model.fit`.
    X_test, y_test
        Features passed to `model.predict` and the labels the predictions
        are compared against.

    Returns
    -------
    None
        Everything is reported through `print`.
    """
    # Banner: which estimator class this is and which training set was used
    estimator_name = model.__class__.__name__
    print(f"\n--- {estimator_name} Results ---", f"SMOTE applied: {smote}", sep="\n")

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    report = classification_report(y_test, predictions)
    print(report)

In [9]:
# One estimator per model family, each seeded with RANDOM_STATE
log_reg = LogisticRegression(
    random_state=RANDOM_STATE,
)
svm_model = SVC(
    random_state=RANDOM_STATE,
)
# max_iter is set to 1000 for the MLP
nn_model = MLPClassifier(
    max_iter=1000,
    random_state=RANDOM_STATE,
)

### a) Logistic regression

In [24]:
# Logistic regression fitted on the SMOTE-resampled training split
fit_predict_metrics(
    log_reg,
    smote=True,
    X_train=X_TRAIN_RESAMPLED,
    y_train=Y_TRAIN_RESAMPLED,
    X_test=X_TEST_SCALED,
    y_test=Y_TEST,
)

# Same estimator object, refitted on the scaled but non-resampled training split
fit_predict_metrics(
    log_reg,
    smote=False,
    X_train=X_TRAIN_SCALED,
    y_train=Y_TRAIN,
    X_test=X_TEST_SCALED,
    y_test=Y_TEST,
)


--- LogisticRegression Results ---
SMOTE applied.
              precision    recall  f1-score   support

           0       0.41      0.59      0.48        90
           1       0.78      0.63      0.70       210

    accuracy                           0.62       300
   macro avg       0.60      0.61      0.59       300
weighted avg       0.67      0.62      0.63       300


--- LogisticRegression Results ---
No SMOTE.
              precision    recall  f1-score   support

           0       0.55      0.41      0.47        90
           1       0.77      0.86      0.81       210

    accuracy                           0.72       300
   macro avg       0.66      0.63      0.64       300
weighted avg       0.71      0.72      0.71       300



### b) (SVM) Support Vector Machine

In [26]:
# SVM fitted on the SMOTE-resampled training split
fit_predict_metrics(
    svm_model,
    smote=True,
    X_train=X_TRAIN_RESAMPLED,
    y_train=Y_TRAIN_RESAMPLED,
    X_test=X_TEST_SCALED,
    y_test=Y_TEST,
)

# Same estimator object, refitted on the scaled but non-resampled training split
fit_predict_metrics(
    svm_model,
    smote=False,
    X_train=X_TRAIN_SCALED,
    y_train=Y_TRAIN,
    X_test=X_TEST_SCALED,
    y_test=Y_TEST,
)


--- SVC Results ---
SMOTE applied.
              precision    recall  f1-score   support

           0       0.42      0.47      0.44        90
           1       0.76      0.72      0.74       210

    accuracy                           0.65       300
   macro avg       0.59      0.60      0.59       300
weighted avg       0.66      0.65      0.65       300


--- SVC Results ---
No SMOTE.
              precision    recall  f1-score   support

           0       0.42      0.18      0.25        90
           1       0.72      0.90      0.80       210

    accuracy                           0.68       300
   macro avg       0.57      0.54      0.52       300
weighted avg       0.63      0.68      0.63       300



### c) Neural Networks

In [None]:
# Neural network fitted on the SMOTE-resampled training split
fit_predict_metrics(
    nn_model,
    smote=True,
    X_train=X_TRAIN_RESAMPLED,
    y_train=Y_TRAIN_RESAMPLED,
    X_test=X_TEST_SCALED,
    y_test=Y_TEST,
)

# Same estimator object, refitted on the scaled but non-resampled training split
fit_predict_metrics(
    nn_model,
    smote=False,
    X_train=X_TRAIN_SCALED,
    y_train=Y_TRAIN,
    X_test=X_TEST_SCALED,
    y_test=Y_TEST,
)


--- MLPClassifier Results ---
SMOTE applied.
              precision    recall  f1-score   support

           0       0.45      0.48      0.46        90
           1       0.77      0.75      0.76       210

    accuracy                           0.67       300
   macro avg       0.61      0.61      0.61       300
weighted avg       0.67      0.67      0.67       300


--- MLPClassifier Results ---
No SMOTE.
              precision    recall  f1-score   support

           0       0.44      0.44      0.44        90
           1       0.76      0.76      0.76       210

    accuracy                           0.66       300
   macro avg       0.60      0.60      0.60       300
weighted avg       0.66      0.66      0.66       300



# END