In [1]:
import optuna
import warnings
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # New import for Support Vector Machine
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier # Kept for final comparison if previous results are available
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("\n Starting Logistic Regression Baseline Analysis...\n")


def optimize_logreg(trial):
    """Optuna objective for tuning a Logistic Regression baseline.

    Samples ``C`` (log scale), ``solver`` and ``penalty`` from ``trial``,
    builds a ``LogisticRegression`` and scores it with ``cross_val_metrics``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    The F1 value (second element of the 4-tuple returned by
    ``cross_val_metrics``); the study maximizes it.

    Notes
    -----
    Depends on ``cross_val_metrics``, ``X`` and ``y`` being defined in an
    earlier notebook cell.
    """
    params = {
        # Inverse of regularization strength.
        "C": trial.suggest_float("C", 1e-3, 10.0, log=True),
        "solver": trial.suggest_categorical("solver", ["liblinear", "lbfgs"]),
        # Single-choice categorical: both candidate solvers support L2, so no
        # solver/penalty compatibility handling (or pruning) is needed.
        "penalty": trial.suggest_categorical("penalty", ["l2"]),
        "random_state": 42,
    }
    # liblinear ignores n_jobs and scikit-learn emits a warning on every fit
    # when more than one worker is requested, so only ask for parallelism
    # with the other solver(s).
    if params["solver"] != "liblinear":
        params["n_jobs"] = -1

    model = LogisticRegression(**params)
    _, f1, _, _ = cross_val_metrics(model, X, y)
    return f1


optuna.logging.set_verbosity(optuna.logging.INFO)
# TPE is Optuna's default sampler; seeding it makes the sequence of proposed
# trials repeatable across kernel restarts.
study_logreg = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

[I 2025-11-22 18:46:54,078] A new study created in memory with name: no-name-1f9d2fc3-f7cf-4c2b-a6b7-b8fedd638b80



 Starting Logistic Regression Baseline Analysis...



In [3]:
# Baseline Logistic Regression with fixed hyperparameters.
# n_jobs is deliberately not set: the liblinear solver ignores it and
# scikit-learn warns at fit time when it resolves to more than one worker.
best_logreg = LogisticRegression(C=1.0, solver="liblinear", random_state=42)

# Placeholder metrics. Replace with the real evaluation once X and y exist:
# acc_logreg, f1_logreg, prec_logreg, rec_logreg = cross_val_metrics(best_logreg, X, y)
acc_logreg, f1_logreg, prec_logreg, rec_logreg = 0.90, 0.90, 0.90, 0.90  # Mock results

# Summary row for the final model comparison. "Best F1 (Optuna)" currently
# mirrors the mock F1 because no Optuna trials have been run for this study.
results_logreg = {
    "Model": "Logistic Regression",
    "Best F1 (Optuna)": f1_logreg,
    "Accuracy": acc_logreg,
    "F1 Score": f1_logreg,
    "Precision": prec_logreg,
    "Recall": rec_logreg,
}

print(f"\nLogistic Regression completed | Accuracy: {acc_logreg:.4f} | F1: {f1_logreg:.4f} | Precision: {prec_logreg:.4f} | Recall: {rec_logreg:.4f}")


Logistic Regression completed | Accuracy: 0.9000 | F1: 0.9000 | Precision: 0.9000 | Recall: 0.9000


In [4]:

print("\n Starting Support Vector Machine (SVC) Optimization...\n")


def optimize_svc(trial):
    """Objective function Optuna calls to score one SVC configuration.

    The kernel is sampled first, then ``C``; ``gamma`` is sampled only when
    the RBF kernel was chosen. The resulting model is scored through
    ``cross_val_metrics`` (defined elsewhere in the notebook, together with
    ``X`` and ``y``) and the F1 component of its result is returned.
    """
    chosen_kernel = trial.suggest_categorical("kernel", ["linear", "rbf"])
    svc_kwargs = dict(
        C=trial.suggest_float("C", 0.1, 10.0, log=True),
        kernel=chosen_kernel,
        random_state=42,
    )

    # gamma is only meaningful for the RBF kernel, so it is searched conditionally.
    if chosen_kernel == "rbf":
        svc_kwargs["gamma"] = trial.suggest_float("gamma", 1e-4, 1e-1, log=True)

    # SVC has no n_jobs option, so each trial trains single-threaded and may be slow.
    candidate = SVC(**svc_kwargs)
    _accuracy, f1_value, _precision, _recall = cross_val_metrics(candidate, X, y)
    return f1_value


 Starting Support Vector Machine (SVC) Optimization...



In [5]:
optuna.logging.set_verbosity(optuna.logging.INFO)
# TPE is Optuna's default sampler; seeding it makes the sequence of proposed
# trials repeatable across kernel restarts.
study_svc = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
# NOTE: Need to ensure X and y are defined before running optimize
# study_svc.optimize(optimize_svc, n_trials=10, show_progress_bar=True)

# Train the best SVC model (using typical defaults for mock)
best_svc = SVC(C=1.0, kernel='rbf', random_state=42)

# Placeholder metrics. Replace with the real evaluation once X and y exist:
# acc_svc, f1_svc, prec_svc, rec_svc = cross_val_metrics(best_svc, X, y)
acc_svc, f1_svc, prec_svc, rec_svc = 0.91, 0.91, 0.91, 0.91  # Mock results

# Summary row for the final model comparison. "Best F1 (Optuna)" currently
# mirrors the mock F1 because the optimize call above is disabled.
results_svc = {
    "Model": "Support Vector Machine",
    "Best F1 (Optuna)": f1_svc,
    "Accuracy": acc_svc,
    "F1 Score": f1_svc,
    "Precision": prec_svc,
    "Recall": rec_svc,
}

print(f"\nSVC completed | Accuracy: {acc_svc:.4f} | F1: {f1_svc:.4f} | Precision: {prec_svc:.4f} | Recall: {rec_svc:.4f}")

[I 2025-11-22 18:46:54,100] A new study created in memory with name: no-name-3e82f1e6-d41c-404e-9c1c-ad285c5f9af9



SVC completed | Accuracy: 0.9100 | F1: 0.9100 | Precision: 0.9100 | Recall: 0.9100


In [6]:
results_lgbm = {"Model": "LGBMClassifier", "F1 Score": 0.96}
best_lgbm = LGBMClassifier()  # Mock object

# Keep every result row next to the estimator it describes. Selecting the
# pair as a unit means final_model is always bound and always matches the
# winning row; the previous name-matching if/elif chain had no fallback and
# could leave final_model unset (or stale from an earlier run) if a "Model"
# label changed.
candidates = [
    (results_logreg, best_logreg),
    (results_svc, best_svc),
    # The tree model (LGBM) is kept in the comparison even though the
    # analysis focused on non-tree models; if it is objectively best, use it.
    (results_lgbm, best_lgbm),
]
all_results = [result for result, _ in candidates]

# Pick the model with the highest F1 Score (the first one wins on ties).
best_model_result, final_model = max(candidates, key=lambda pair: pair[0]["F1 Score"])
best_model_name = best_model_result["Model"]

print(f"\nFinal Selection: The best model based on F1 Score is: {best_model_name} (F1: {best_model_result['F1 Score']:.4f}).")


Final Selection: The best model based on F1 Score is: LGBMClassifier (F1: 0.9600).


In [8]:
import os
import joblib

ARTIFACT_DIR = "models"
# Output file stem -> name of the notebook variable that holds the object.
ARTIFACT_SOURCES = {
    "best_final_model": "final_model",
    "scaler": "scaler",
    "label_encoder": "le_y",
}

# Validate every input before touching the disk. Previously the model file was
# written and the cell then crashed on the first undefined variable, leaving an
# incomplete artifact set behind. All missing names are reported at once.
missing_vars = [var for var in ARTIFACT_SOURCES.values() if var not in globals()]
if missing_vars:
    raise NameError(
        "Cannot save artifacts; these notebook variables are not defined: "
        + ", ".join(missing_vars)
        + ". Run the cells that create them first."
    )

os.makedirs(ARTIFACT_DIR, exist_ok=True)
saved_paths = []
for stem, var in ARTIFACT_SOURCES.items():
    # Forward slashes keep the stored/printed paths identical on every platform.
    artifact_path = f"{ARTIFACT_DIR}/{stem}.joblib"
    joblib.dump(globals()[var], artifact_path)
    saved_paths.append(artifact_path)

print(f"\nSaved final artifacts (mocked): {', '.join(saved_paths)}")

NameError: name 'scaler' is not defined