# **MLCB - Assignment 2**
### Glykeria Spyrou

### *3.3 Final model training*

In [1]:
import os
import optuna
import pandas as pd
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [2]:
# Import the whole dataset
os.chdir('/Users/glykeriasp/Documents/DSIT/Machine Learning in Computational Biology/Assignments/Assignment 2/Assignment_2/')
data = pd.read_csv('data/Diabetes.csv')
X = data.iloc[:, 1:-1]
y = data.iloc[:, -1]

Run the Optuna study

Train and save the best model

In [3]:
def objective(trial):
    """
    Objective function for Optuna study to optimize Precision-Recall AUC.

    Parameters:
        trial (optuna.trial.Trial): Optuna trial object.

    Returns:
        float: Score to optimize (Precision-Recall AUC).
    """
    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import average_precision_score

    # Define the parameter space for hyperparameter optimization
    C = trial.suggest_float('C', 1e-4, 1e4, log=True)
    penalty = trial.suggest_categorical('penalty', ['l1', 'l2'])
    solver = trial.suggest_categorical('solver', ['liblinear', 'saga'])

    # Create a Logistic Regression model with the sampled hyperparameters
    model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=1000, random_state=42)

    # Define the cross-validation strategy (StratifiedKFold)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Define variables to store scores
    pr_auc_scores = []

    # Perform cross-validation and calculate PR-AUC scores
    for train_index, test_index in cv.split(X, y):
        # Split the data into training and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the model on the training set
        model.fit(X_train, y_train)

        # Predict probabilities for the positive class on the test set
        y_scores = model.predict_proba(X_test)[:, 1]

        # Calculate Precision-Recall AUC (PR-AUC)
        pr_auc = average_precision_score(y_test, y_scores)

        # Store the PR-AUC score
        pr_auc_scores.append(pr_auc)

    # Calculate the mean PR-AUC score across all folds
    mean_pr_auc_score = np.mean(pr_auc_scores)

    # Return the mean PR-AUC score as the objective value
    return mean_pr_auc_score


In [4]:
# Define the Optuna study
study = optuna.create_study(direction='maximize')

# Optimize the objective function (objective) with a specific number of trials
study.optimize(objective, n_trials=100)

# Get the best parameters and the best value
best_params = study.best_params
best_value = study.best_value

print(f"Best parameters: {best_params}")
print(f"Best value (Precision-Recall AUC): {best_value}")

# Create the final model using the best parameters
best_model = LogisticRegression(**best_params, max_iter=1000, random_state=42)

# Train the model on the entire dataset
best_model.fit(X, y)

# Save the best model
model_file_path = './models/final_model.pkl'
with open(model_file_path, 'wb') as file:
    pickle.dump(best_model, file)

print(f"Trained Logistic Regression model saved to: {model_file_path}")


[I 2024-04-29 14:36:43,711] A new study created in memory with name: no-name-af7a19b5-2414-4767-bbbf-a92003d97ba3
[I 2024-04-29 14:36:43,754] Trial 0 finished with value: 0.7323578606727177 and parameters: {'C': 0.7250022290110179, 'penalty': 'l1', 'solver': 'liblinear'}. Best is trial 0 with value: 0.7323578606727177.
[I 2024-04-29 14:36:43,999] Trial 1 finished with value: 0.5385105058307186 and parameters: {'C': 0.035815351248335466, 'penalty': 'l1', 'solver': 'saga'}. Best is trial 0 with value: 0.7323578606727177.
[I 2024-04-29 14:36:44,277] Trial 2 finished with value: 0.5410035612773669 and parameters: {'C': 11.32275486983325, 'penalty': 'l1', 'solver': 'saga'}. Best is trial 0 with value: 0.7323578606727177.
[I 2024-04-29 14:36:44,541] Trial 3 finished with value: 0.5411230549417523 and parameters: {'C': 72.24083618448716, 'penalty': 'l1', 'solver': 'saga'}. Best is trial 0 with value: 0.7323578606727177.
[I 2024-04-29 14:36:44,790] Trial 4 finished with value: 0.53687869831882

Best parameters: {'C': 0.5913560321617163, 'penalty': 'l1', 'solver': 'liblinear'}
Best value (Precision-Recall AUC): 0.7345078228688514
Trained Logistic Regression model saved to: ./models/final_model.pkl


In [5]:
optuna.visualization.plot_parallel_coordinate(study)

In [6]:
optuna.visualization.plot_rank(study)


plot_rank is experimental (supported from v3.2.0). The interface can change in the future.



In [7]:
optuna.visualization.plot_slice(study)

In [8]:
import torch
optuna.visualization.plot_terminator_improvement(study)


plot_terminator_improvement is experimental (supported from v3.2.0). The interface can change in the future.


RegretBoundEvaluator is experimental (supported from v3.2.0). The interface can change in the future.


CrossValidationErrorEvaluator is experimental (supported from v3.2.0). The interface can change in the future.

100%|██████████| 100/100 [00:04<00:00, 20.60it/s]
