## Logistic Regression Hyperparameter Tuning (F1-Score)

In [78]:
# Load Libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression

import optuna

In [80]:
# Load Data

df = pd.read_csv('bank_4.csv', index_col=0)

In [82]:
# Train / Test Split

X = df.drop(columns=['churn', 'complain', 'umap_1', 'umap_2'])
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [86]:
# Hyperparameter tuning

def objective(trial):
    
    params = {
        'tol': trial.suggest_float('tol', 1e-6 , 1e-3),
        'C': trial.suggest_float('C', 0.001, 1.0),
        'max_iter': trial.suggest_int('max_iter', 100, 1000)                             
    }

    model = LogisticRegression(
        **params,
        random_state=42,
        n_jobs=-1)
    
    threshold = 0.25
    
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    
    for tr, te in skf.split(X_train, y_train):
        
        X_tr, X_te = X_train.iloc[tr], X_train.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]
        
        model.fit(X_tr, y_tr)
        prob = model.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)
        
        scores.append(f1_score(y_te, y_pred))
        
    return np.mean(scores)

In [88]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500, show_progress_bar=True)    # More trials because LR is faster

[I 2024-07-15 17:54:09,157] A new study created in memory with name: no-name-c4ef80e9-f2b5-45e8-b3eb-bbae92e29dc1


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-07-15 17:54:11,858] Trial 0 finished with value: 0.4891801282739551 and parameters: {'tol': 0.0002473242300450039, 'C': 0.7622194488786767, 'max_iter': 485}. Best is trial 0 with value: 0.4891801282739551.
[I 2024-07-15 17:54:13,515] Trial 1 finished with value: 0.48880150560084107 and parameters: {'tol': 0.0007988815883309575, 'C': 0.8657190080139927, 'max_iter': 786}. Best is trial 0 with value: 0.4891801282739551.
[I 2024-07-15 17:54:14,190] Trial 2 finished with value: 0.49021794040366384 and parameters: {'tol': 0.00032816011189402413, 'C': 0.09625496319044169, 'max_iter': 777}. Best is trial 2 with value: 0.49021794040366384.
[I 2024-07-15 17:54:14,429] Trial 3 finished with value: 0.48944427141203695 and parameters: {'tol': 0.00020772234159488245, 'C': 0.5901221512443166, 'max_iter': 606}. Best is trial 2 with value: 0.49021794040366384.
[I 2024-07-15 17:54:14,651] Trial 4 finished with value: 0.49034556962324743 and parameters: {'tol': 0.0008288324527993242, 'C': 0.20019

In [91]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=13, state=1, values=[0.4954671530759874], datetime_start=datetime.datetime(2024, 7, 15, 17, 54, 16, 462101), datetime_complete=datetime.datetime(2024, 7, 15, 17, 54, 16, 646676), params={'tol': 0.0009440145968962416, 'C': 0.005741852418682025, 'max_iter': 326}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'tol': FloatDistribution(high=0.001, log=False, low=1e-06, step=None), 'C': FloatDistribution(high=1.0, log=False, low=0.001, step=None), 'max_iter': IntDistribution(high=1000, log=False, low=100, step=1)}, trial_id=13, value=None)
Best hyperparameters: {'tol': 0.0009440145968962416, 'C': 0.005741852418682025, 'max_iter': 326}


In [119]:
# See if scaling data improves score

lr = LogisticRegression(tol=0.0009440145968962416, C=0.005741852418682025, max_iter=326, random_state=42, n_jobs=-1)

ss = StandardScaler()
mm = MinMaxScaler()

X_train_scaled_ss = pd.DataFrame(ss.fit_transform(X_train))
X_train_scaled_mm = pd.DataFrame(mm.fit_transform(X_train))

scales = [X_train_scaled_ss, X_train_scaled_mm]
names = ['StandardScaler', 'MinMaxScaler']

for i, n in zip(scales, names):
    
    threshold = 0.25
    
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    
    for tr, te in skf.split(i, y_train):
        
        X_tr, X_te = i.iloc[tr],i.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]
        
        lr.fit(X_tr, y_tr)
        prob = lr.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)
        
        scores.append(f1_score(y_te, y_pred))
        
    print(f'{n} f1-score: {np.mean(scores)}')

StandardScaler f1-score: 0.48805628981105703
MinMaxScaler f1-score: 0.4417577306768427


Scaling the data does not improve model performance.