## Logistic Regression Hyperparameter Tuning (Recall)

In [4]:
# Load Libraries

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression

import optuna

In [5]:
# Load Data

df = pd.read_csv('bank_4.csv', index_col=0)

In [8]:
# Train / Test Split

X = df.drop(columns=['churn', 'complain', 'umap_1', 'umap_2'])
y = df['churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [10]:
# Hyperparameter tuning

def objective(trial):
    
    params = {
        'tol': trial.suggest_float('tol', 1e-6 , 1e-3),
        'C': trial.suggest_float('C', 0.001, 1.0),
        'max_iter': trial.suggest_int('max_iter', 100, 1000)                             
    }

    model = LogisticRegression(
        **params,
        random_state=42,
        n_jobs=-1)
    
    threshold = 0.25
    
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    
    for tr, te in skf.split(X_train, y_train):
        
        X_tr, X_te = X_train.iloc[tr], X_train.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]
        
        model.fit(X_tr, y_tr)
        prob = model.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)
        
        scores.append(recall_score(y_te, y_pred))
        
    return np.mean(scores)

In [12]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500, show_progress_bar=True)

[I 2024-07-15 18:17:21,272] A new study created in memory with name: no-name-0f61f60c-3e29-4d7c-9ad7-a1a4a40531d8


  0%|          | 0/500 [00:00<?, ?it/s]

[I 2024-07-15 18:17:24,821] Trial 0 finished with value: 0.5920245398773005 and parameters: {'tol': 0.0006853793636106287, 'C': 0.8457273313650081, 'max_iter': 319}. Best is trial 0 with value: 0.5920245398773005.
[I 2024-07-15 18:17:26,504] Trial 1 finished with value: 0.5920245398773005 and parameters: {'tol': 0.0006325843899496682, 'C': 0.7911178115853714, 'max_iter': 425}. Best is trial 0 with value: 0.5920245398773005.
[I 2024-07-15 18:17:26,758] Trial 2 finished with value: 0.5914110429447852 and parameters: {'tol': 0.00035914539707157734, 'C': 0.9767840397598615, 'max_iter': 772}. Best is trial 0 with value: 0.5920245398773005.
[I 2024-07-15 18:17:26,986] Trial 3 finished with value: 0.5944785276073621 and parameters: {'tol': 0.000589445126073971, 'C': 0.2450691606995958, 'max_iter': 322}. Best is trial 3 with value: 0.5944785276073621.
[I 2024-07-15 18:17:27,191] Trial 4 finished with value: 0.5920245398773006 and parameters: {'tol': 0.0003304238047041756, 'C': 0.04885650803229

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[I 2024-07-15 18:19:30,266] Trial 431 finished with value: 0.5938650306748466 and parameters: {'tol': 8.1719602308176e-05, 'C': 0.18031351255312236, 'max_iter': 100}. Best is trial 3 with value: 0.5944785276073621.
[I 2024-07-15 18:19:30,565] Trial 432 finished with value: 0.5938650306748466 and parameters: {'tol': 0.0006066939890833078, 'C': 0.24103305401164868, 'max_iter': 172}. Best is trial 3 with value: 0.5944785276073621.
[I 2024-07-15 18:19:30,858] Trial 433 finished with value: 0.5944785276073621 and parameters: {'tol': 0.0005181780601673564, 'C': 0.19875577213679857, 'max_iter': 299}. Best is trial 3 with value: 0.5944785276073621.
[I 2024-07-15 18:19:31,177] Trial 434 finished with value: 0.5944785276073621 and parameters: {'tol': 0.0005652957424718188, 'C': 0.21894866366192967, 'max_iter': 142}. Best is trial 3 with value: 0.5944785276073621.
[I 2024-07-15 18:19:31,461] Trial 435 finished with value: 0.5938650306748465 and parameters: {'tol': 0.0005833309149189461, 'C': 0.12

In [17]:
print("Best trial:", study.best_trial)
print("Best hyperparameters:", study.best_params)

Best trial: FrozenTrial(number=3, state=1, values=[0.5944785276073621], datetime_start=datetime.datetime(2024, 7, 15, 18, 17, 26, 759476), datetime_complete=datetime.datetime(2024, 7, 15, 18, 17, 26, 986583), params={'tol': 0.000589445126073971, 'C': 0.2450691606995958, 'max_iter': 322}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'tol': FloatDistribution(high=0.001, log=False, low=1e-06, step=None), 'C': FloatDistribution(high=1.0, log=False, low=0.001, step=None), 'max_iter': IntDistribution(high=1000, log=False, low=100, step=1)}, trial_id=3, value=None)
Best hyperparameters: {'tol': 0.000589445126073971, 'C': 0.2450691606995958, 'max_iter': 322}


In [30]:
# See if scaling data improves score

lr = LogisticRegression(tol=0.000589445126073971, C=0.2450691606995958, max_iter=322, random_state=42, n_jobs=-1)

ss = StandardScaler()
mm = MinMaxScaler()

X_train_scaled_ss = pd.DataFrame(ss.fit_transform(X_train))
X_train_scaled_mm = pd.DataFrame(mm.fit_transform(X_train))

scales = [X_train_scaled_ss, X_train_scaled_mm]
names = ['StandardScaler', 'MinMaxScaler']

for i, n in zip(scales, names):
    
    threshold = 0.25
    
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    scores = []
    
    for tr, te in skf.split(i, y_train):
        
        X_tr, X_te = i.iloc[tr],i.iloc[te]
        y_tr, y_te = y_train.iloc[tr], y_train.iloc[te]
        
        lr.fit(X_tr, y_tr)
        prob = lr.predict_proba(X_te)[:, 1]
        y_pred = np.where(prob < threshold, 0, 1)
        
        scores.append(recall_score(y_te, y_pred))
        
    print(f'{n} recall: {np.mean(scores)}')

StandardScaler recall: 0.59079754601227
MinMaxScaler recall: 0.5858895705521473


Scaling the data does not improve model performance.