## Support Vector Hyperparameter Tuning (Recall)

In [10]:
# Load Libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score

from sklearn.svm import SVC

import optuna

In [12]:
# Load Data

df = pd.read_csv('bank_4.csv', index_col=0)

In [14]:
# Train / Test Split

X = df.drop(columns=['churn', 'complain', 'umap_1', 'umap_2'])
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [16]:
# Hyperparameter tuning

def objective(trial):
    
    params = {
        'C': trial.suggest_float('C', 0.001, 1.0),
        'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
    }

    model = SVC(
        **params,
        random_state=42,
        probability=True)
    
    threshold = 0.27
    
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    
    for tr, te in skf.split(X_train, y_train):
        
        X_tr, X_te = X_train.iloc[tr], X_train.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]
        
        model.fit(X_tr, y_tr)
        prob = model.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)
        
        scores.append(recall_score(y_te, y_pred))
        
    return np.mean(scores)

In [18]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, show_progress_bar=True)

[I 2024-07-15 18:37:49,969] A new study created in memory with name: no-name-d10cb353-8546-4073-946c-61c85af42e9e


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2024-07-15 18:38:27,210] Trial 0 finished with value: 0.558282208588957 and parameters: {'C': 0.051199972487481114, 'kernel': 'rbf'}. Best is trial 0 with value: 0.558282208588957.
[I 2024-07-15 18:38:56,122] Trial 1 finished with value: 0.5 and parameters: {'C': 0.07972708110156429, 'kernel': 'poly'}. Best is trial 0 with value: 0.558282208588957.
[I 2024-07-15 18:39:14,184] Trial 2 finished with value: 0.43128834355828227 and parameters: {'C': 0.0051896276710999166, 'kernel': 'linear'}. Best is trial 0 with value: 0.558282208588957.
[I 2024-07-15 18:39:54,278] Trial 3 finished with value: 0.5552147239263803 and parameters: {'C': 0.1814990994030717, 'kernel': 'rbf'}. Best is trial 0 with value: 0.558282208588957.
[I 2024-07-15 18:40:24,254] Trial 4 finished with value: 0.5 and parameters: {'C': 0.9829238075355662, 'kernel': 'poly'}. Best is trial 0 with value: 0.558282208588957.
[I 2024-07-15 18:40:51,056] Trial 5 finished with value: 0.49815950920245394 and parameters: {'C': 0.720

In [27]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=89, state=1, values=[0.592638036809816], datetime_start=datetime.datetime(2024, 7, 15, 19, 9, 24, 835207), datetime_complete=datetime.datetime(2024, 7, 15, 19, 9, 43, 382584), params={'C': 0.2600853655654022, 'kernel': 'linear'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'C': FloatDistribution(high=1.0, log=False, low=0.001, step=None), 'kernel': CategoricalDistribution(choices=('linear', 'rbf', 'poly'))}, trial_id=89, value=None)
Best hyperparameters: {'C': 0.2600853655654022, 'kernel': 'linear'}


In [43]:
# See if scaling data improves score

sv = SVC(C=0.2600853655654022, kernel='linear', random_state=42, probability=True)

ss = StandardScaler()
mm = MinMaxScaler()

X_train_scaled_ss = pd.DataFrame(ss.fit_transform(X_train))
X_train_scaled_mm = pd.DataFrame(mm.fit_transform(X_train))

scales = [X_train_scaled_ss, X_train_scaled_mm]
names = ['StandardScaler', 'MinMaxScaler']

for i, n in zip(scales, names):
    
    threshold = 0.27
    
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    
    for tr, te in skf.split(i, y_train):
        
        X_tr, X_te = i.iloc[tr],i.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]
        
        sv.fit(X_tr, y_tr)
        prob = sv.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)
        
        scores.append(recall_score(y_te, y_pred))
        
    print(f'{n} recall: {np.mean(scores)}')

StandardScaler recall: 0.5441717791411043
MinMaxScaler recall: 0.4012269938650307


Scaling the data does not improve model performance.