In [None]:
import pandas as pd
import optuna
from optuna import pruners
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.samplers import TPESampler

import imblearn
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTEENN

import sklearn

import xgboost as xgb
import lightgbm as lgb

import numpy as np
import matplotlib.pyplot as plt
import importlib
from joblib import dump, load
import os
import math
from functools import reduce

import torch
import torch.nn as nn
from torch.nn import ReLU
import random

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression


import sys
sys.path.append('../')

import model_util
importlib.reload(model_util)
from model_util import get_scoring_metrics

import optuna_util
importlib.reload(optuna_util)
from optuna_util import run_optuna_studies

import feature_sets
importlib.reload(feature_sets)



#from sklearnex import patch_sklearn
#patch_sklearn()

In [None]:
# Features dropped from every feature set in this notebook
# (original note: this removes all 'ophthalmic' features).
removed_features = [
    'IOPg pre-treatment',
    'IOPg pre-treatment inter-eye difference',
    'Corneal hysteresis inter-eye difference',
    'Spherical equivalent',
    'Corneal resistance factor',
    'Speech reception threshold',
    'Corneal hysteresis',
    'Tinnitus frequency (self-reported)',
    'Arterial stiffness index',
    'Private healthcare utilisation',
    'Exercise (summed MET minutes per week)',
    'Total household income',
]


def _without_removed(feature_table):
    """Return the 'feature' values of `feature_table` that are not listed in `removed_features`."""
    names = feature_table['feature'].values
    return names[~np.isin(names, removed_features)]


# Feature-set key -> array of feature names kept for that set.
# The 'ophthalmic', 'OD' and 'ODSL' sets are intentionally left out (disabled in the original cell).
model_feature_dict = {
    'demographic': _without_removed(feature_sets.demographic_features),
    'systemic': _without_removed(feature_sets.systemic_features),
    'lifestyle': _without_removed(feature_sets.lifestyle_features),
    'SL': _without_removed(feature_sets.SL_features),
    'DSL': _without_removed(feature_sets.DSL_features),
    'DS': _without_removed(feature_sets.DS_features),
    'DL': _without_removed(feature_sets.DL_features),
}


# Training splits, one (X, y) pair per prediction horizon.

# 3-year horizon
X_train_imputed_scaled_IOPremoved3year_tte, y_train_IOPremoved3year_tte = (
    load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_IOPremoved3year_tte.pkl'),
    load('../data/imputed/IOPsubcohort_y_train_IOPremoved3year_tte.pkl'),
)

# 5-year horizon
X_train_imputed_scaled_IOPremoved5year_tte, y_train_IOPremoved5year_tte = (
    load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_IOPremoved5year_tte.pkl'),
    load('../data/imputed/IOPsubcohort_y_train_IOPremoved5year_tte.pkl'),
)

# 10-year horizon
X_train_imputed_scaled_IOPremoved10year_tte, y_train_IOPremoved10year_tte = (
    load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_IOPremoved10year_tte.pkl'),
    load('../data/imputed/IOPsubcohort_y_train_IOPremoved10year_tte.pkl'),
)

# Default tuning settings referenced by the run_optuna_studies cells.
n_trials, n_cv_folds = 100, 5
scoring_metric = 'roc_auc'

<br>

## 3 year

### running models

In [None]:
# LightGBM, 3-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved3year_tte, y=y_train_IOPremoved3year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_IOPremoved3year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# XGBoost, 3-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved3year_tte, y=y_train_IOPremoved3year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_IOPremoved3year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Random forest, 3-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved3year_tte, y=y_train_IOPremoved3year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_IOPremoved3year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# K-nearest neighbours, 3-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved3year_tte, y=y_train_IOPremoved3year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_IOPremoved3year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Support vector machine, 3-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved3year_tte, y=y_train_IOPremoved3year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_IOPremoved3year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Logistic regression (SGD objective), 3-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved3year_tte, y=y_train_IOPremoved3year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_IOPremoved3year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Directories used by the refit and reporting cells that follow.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Optuna results sub-directory name -> estimator class to refit (3-year horizon).
algorithms = dict([
    ('logistic_regression_sgd_IOPremoved3year_tte', sklearn.linear_model.SGDClassifier),
    ('svm_IOPremoved3year_tte', sklearn.svm.SVC),
    ('knn_IOPremoved3year_tte', sklearn.neighbors.KNeighborsClassifier),
    ('randomforest_IOPremoved3year_tte', sklearn.ensemble.RandomForestClassifier),
    ('xgboost_IOPremoved3year_tte', xgb.XGBClassifier),
    ('lightgbm_IOPremoved3year_tte', lgb.LGBMClassifier),
])

In [None]:
# Refit every algorithm with its best Optuna hyperparameters on the entire 3-year
# training set, then save the fitted estimator plus a text copy of the parameters.
# `model_name` is the feature-set key from model_feature_dict; `algorithm` is the key
# from `algorithms`, which is also the Optuna results sub-directory name.

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = f'{fitted_models_dir}/{model_name}'
    # exist_ok replaces the separate exists() check (and the race it leaves open).
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_IOPremoved3year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # 'all_params' is read from the best trial's user attributes and passed
        # verbatim to the estimator constructor below.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_IOPremoved3year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# now go to model evaluation

In [None]:
# Fallback for when the refit cell above cannot load a study.
def load_study_compat(study_path):
    """Load a saved Optuna study, rebuilding it trial by trial if direct loading fails.

    Parameters
    ----------
    study_path : str
        Path to the study file saved with joblib.

    Returns
    -------
    The object stored in the file when it loads directly; otherwise a new
    in-memory Optuna study holding the trials recovered from the file.
    """
    try:
        return load(study_path)
    except (TypeError, AttributeError):
        # Presumably raised on a library version mismatch (per the original note);
        # retry from an open file handle and copy the trials into a fresh study.
        with open(study_path, 'rb') as f:
            data = load(f)

        # Recover the essential components from either an object or a dict payload.
        if hasattr(data, 'trials'):
            trials = data.trials
            direction = getattr(data, 'direction', None)
        elif isinstance(data, dict) and 'trials' in data:
            trials = data['trials']
            direction = data.get('direction')
        else:
            trials, direction = [], None

        # Keep the stored optimisation direction when there is one; with None,
        # create_study falls back to its default (minimize), so best_trial would
        # otherwise be picked the wrong way round for a maximised metric.
        study = optuna.create_study(direction=direction)
        for trial in trials:
            study.add_trial(trial)
        return study

# Same refit as the cell above, but best-effort: a failure for one
# feature-set/algorithm pair is reported and the loop moves on to the next pair.
for model_name, feature_set in model_feature_dict.items():
    model_save_dir = f'{fitted_models_dir}/{model_name}'
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_IOPremoved3year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'\nProcessing {model_name} with {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'

        try:
            # Load with compatibility wrapper
            study = load_study_compat(study_dir)
            print(f"Successfully loaded study for {model_name}/{algorithm}")

            # Prefer the full parameter dict stored on the trial; fall back to the
            # sampled parameters when it is absent.
            if hasattr(study.best_trial, 'user_attrs'):
                best_params = study.best_trial.user_attrs.get('all_params', study.best_params)
            else:
                best_params = study.best_params

            # Save params
            best_params_str = '\n'.join(f'{k}: {v}' for k, v in best_params.items())
            with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as f:
                f.write(best_params_str)

            # Fit and save model. `dump` is the name imported from joblib at the top
            # of the notebook; the bare `joblib` module name is never imported.
            estimator = estimator_class(**best_params)
            estimator.fit(X, y_train_IOPremoved3year_tte)
            dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

        except Exception as e:
            print(f"Failed to process {model_name}/{algorithm}: {str(e)}")

<br>

### getting minimal feature set

In [None]:
# Recursive feature elimination with cross-validation on the DSL feature set
# (LR with DSL was the best predictor as of model_evaluation.ipynb).

rfe_obj_IOPremoved3year_tte = RFECV(
    estimator=LogisticRegression(
        C=1.0, penalty='l2', solver='lbfgs',
        max_iter=1000, n_jobs=-1, random_state=2024,
    ),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    min_features_to_select=5,
    step=1,
    n_jobs=1,
    verbose=100,
)

rfe_obj_IOPremoved3year_tte.fit(
    X=X_train_imputed_scaled_IOPremoved3year_tte[model_feature_dict['DSL']],
    y=y_train_IOPremoved3year_tte,
)

#### change feature set to applicable one
print("Optimal number of features: {}".format(rfe_obj_IOPremoved3year_tte.n_features_))
print("Selected features: {}".format(
    np.array(model_feature_dict['DSL'])[rfe_obj_IOPremoved3year_tte.support_]
))

In [None]:
# Persist the fitted 3-year RFECV selector.
dump(value=rfe_obj_IOPremoved3year_tte, filename='./rfecv_fitted_IOPremoved3year_tte.pkl')

In [None]:
# DSL feature names kept by the fitted RFECV selector (boolean support mask).
minimal_features = np.array(model_feature_dict['DSL'])[rfe_obj_IOPremoved3year_tte.get_support()]
minimal_features

In [None]:
# Manual step: copy the features printed above into feature_sets, then restart the kernel.
feature_dict = dict([
    ('minimal_features_rfecv_IOPremoved3year_tte',
     feature_sets.minimal_features_rfecv_IOPremoved3year_tte['feature'].values),
])
feature_dict

In [None]:
## CHANGE TO APPLICABLE MODEL
# Logistic regression (SGD objective) on the RFECV-selected 3-year feature set, 1000 trials.
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved3year_tte, y=y_train_IOPremoved3year_tte,
    feature_dict={'minimal_features_rfecv_IOPremoved3year_tte':
                  feature_sets.minimal_features_rfecv_IOPremoved3year_tte['feature'].values},
    n_trials=1000,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_IOPremoved3year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Optuna results sub-directory name -> estimator class (3-year horizon).
algorithms = dict([
    ('logistic_regression_sgd_IOPremoved3year_tte', sklearn.linear_model.SGDClassifier),
    ('svm_IOPremoved3year_tte', sklearn.svm.SVC),
    ('knn_IOPremoved3year_tte', sklearn.neighbors.KNeighborsClassifier),
    ('randomforest_IOPremoved3year_tte', sklearn.ensemble.RandomForestClassifier),
    ('xgboost_IOPremoved3year_tte', xgb.XGBClassifier),
    ('lightgbm_IOPremoved3year_tte', lgb.LGBMClassifier),
])

In [None]:
feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Build one (Algorithm, Hyperparameter, <feature set>) table per feature set from
# the best trial of each saved study.
for model_name in model_feature_dict:
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params
        #best_params = study.best_trial.user_attrs['all_params']

        for k, v in best_params.items():
            if isinstance(v, float):
                # Floats are shown rounded to 4 significant figures.
                shown = '%s' % float('%.4g' % v)
            else:
                # Strings, ints, bools and None are written verbatim: '%.4g' raises
                # TypeError on None and would round ints to 4 significant figures.
                shown = str(v)
            rows.append([algorithm, k, shown])

    # Build the frame once from the collected rows instead of growing it row by row.
    feature_set_dfs.append(
        pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    )

# Combine and save results: outer-join the per-feature-set tables, then order the
# rows by the order of `algorithms`.
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
combo_df = combo_df.set_index('Algorithm')
combo_df = combo_df.loc[list(algorithms)]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_IOPremoved3year_tte.tsv', sep='\t', index=True)

<br>

<br>

## 5 year

### running models

In [None]:
# LightGBM, 5-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved5year_tte, y=y_train_IOPremoved5year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_IOPremoved5year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# XGBoost, 5-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved5year_tte, y=y_train_IOPremoved5year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_IOPremoved5year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Random forest, 5-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved5year_tte, y=y_train_IOPremoved5year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_IOPremoved5year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# K-nearest neighbours, 5-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved5year_tte, y=y_train_IOPremoved5year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_IOPremoved5year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Support vector machine, 5-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved5year_tte, y=y_train_IOPremoved5year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_IOPremoved5year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Logistic regression (SGD objective), 5-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved5year_tte, y=y_train_IOPremoved5year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_IOPremoved5year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Directories used by the refit and reporting cells that follow.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Optuna results sub-directory name -> estimator class to refit (5-year horizon).
algorithms = dict([
    ('logistic_regression_sgd_IOPremoved5year_tte', sklearn.linear_model.SGDClassifier),
    ('svm_IOPremoved5year_tte', sklearn.svm.SVC),
    ('knn_IOPremoved5year_tte', sklearn.neighbors.KNeighborsClassifier),
    ('randomforest_IOPremoved5year_tte', sklearn.ensemble.RandomForestClassifier),
    ('xgboost_IOPremoved5year_tte', xgb.XGBClassifier),
    ('lightgbm_IOPremoved5year_tte', lgb.LGBMClassifier),
])

In [None]:
# Refit every algorithm with its best Optuna hyperparameters on the entire 5-year
# training set, then save the fitted estimator plus a text copy of the parameters.
# `model_name` is the feature-set key from model_feature_dict; `algorithm` is the key
# from `algorithms`, which is also the Optuna results sub-directory name.

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = f'{fitted_models_dir}/{model_name}'
    # exist_ok replaces the separate exists() check (and the race it leaves open).
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_IOPremoved5year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # 'all_params' is read from the best trial's user attributes and passed
        # verbatim to the estimator constructor below.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_IOPremoved5year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# now go to model evaluation

<br>

### getting minimal feature set

In [None]:
# change to applicable model
# Recursive feature elimination with cross-validation on the DSL feature set
# (LR with DSL was the best predictor as of model_evaluation.ipynb).

rfe_obj_IOPremoved5year_tte = RFECV(
    estimator=LogisticRegression(
        C=1.0, penalty='l2', solver='lbfgs',
        max_iter=1000, n_jobs=-1, random_state=2024,
    ),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    min_features_to_select=5,
    step=1,
    n_jobs=1,
    verbose=100,
)

rfe_obj_IOPremoved5year_tte.fit(
    X=X_train_imputed_scaled_IOPremoved5year_tte[model_feature_dict['DSL']],
    y=y_train_IOPremoved5year_tte,
)

#### change feature set to applicable one
print("Optimal number of features: {}".format(rfe_obj_IOPremoved5year_tte.n_features_))
print("Selected features: {}".format(
    np.array(model_feature_dict['DSL'])[rfe_obj_IOPremoved5year_tte.support_]
))

In [None]:
# Persist the fitted 5-year RFECV selector.
dump(value=rfe_obj_IOPremoved5year_tte, filename='./rfecv_fitted_IOPremoved5year_tte.pkl')

In [None]:
# DSL feature names kept by the fitted RFECV selector (boolean support mask).
minimal_features = np.array(model_feature_dict['DSL'])[rfe_obj_IOPremoved5year_tte.get_support()]
minimal_features

In [None]:
# go to feature_sets and fill in the above features then restart kernel
# The value must be the 5-year list to match the key; the original read the 3-year
# list here (copy-paste from the 3-year section).
# NOTE(review): assumes feature_sets defines minimal_features_rfecv_IOPremoved5year_tte,
# following the 3-year and 10-year naming; confirm after filling it in.
feature_dict = {'minimal_features_rfecv_IOPremoved5year_tte': feature_sets.minimal_features_rfecv_IOPremoved5year_tte['feature'].values}
feature_dict

In [None]:
## CHANGE TO APPLICABLE MODEL

# Logistic regression (SGD objective) on the RFECV-selected 5-year feature set.
# The feature list must be the 5-year one to match the key and the 5-year data; the
# original passed the 3-year list here (copy-paste from the 3-year section).
# NOTE(review): assumes feature_sets defines minimal_features_rfecv_IOPremoved5year_tte,
# following the 3-year and 10-year naming; confirm after filling it in.
run_optuna_studies(
    X = X_train_imputed_scaled_IOPremoved5year_tte,
    y = y_train_IOPremoved5year_tte,
    feature_dict = {'minimal_features_rfecv_IOPremoved5year_tte': feature_sets.minimal_features_rfecv_IOPremoved5year_tte['feature'].values},
    n_trials = 1000,
    
    objective_class = optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir = './optuna_results/logistic_regression_sgd_IOPremoved5year_tte',

    # kwargs passed to study class
    n_cv_folds = n_cv_folds,
    scoring_metric = scoring_metric,
)

In [None]:
# Optuna results sub-directory name -> estimator class (5-year horizon).
algorithms = dict([
    ('logistic_regression_sgd_IOPremoved5year_tte', sklearn.linear_model.SGDClassifier),
    ('svm_IOPremoved5year_tte', sklearn.svm.SVC),
    ('knn_IOPremoved5year_tte', sklearn.neighbors.KNeighborsClassifier),
    ('randomforest_IOPremoved5year_tte', sklearn.ensemble.RandomForestClassifier),
    ('xgboost_IOPremoved5year_tte', xgb.XGBClassifier),
    ('lightgbm_IOPremoved5year_tte', lgb.LGBMClassifier),
])

In [None]:
feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Build one (Algorithm, Hyperparameter, <feature set>) table per feature set from
# the best trial of each saved study.
for model_name in model_feature_dict:
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params
        #best_params = study.best_trial.user_attrs['all_params']

        for k, v in best_params.items():
            if isinstance(v, float):
                # Floats are shown rounded to 4 significant figures.
                shown = '%s' % float('%.4g' % v)
            else:
                # Strings, ints, bools and None are written verbatim: '%.4g' raises
                # TypeError on None and would round ints to 4 significant figures.
                shown = str(v)
            rows.append([algorithm, k, shown])

    # Build the frame once from the collected rows instead of growing it row by row.
    feature_set_dfs.append(
        pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    )

# Combine and save results: outer-join the per-feature-set tables, then order the
# rows by the order of `algorithms`.
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
combo_df = combo_df.set_index('Algorithm')
combo_df = combo_df.loc[list(algorithms)]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_IOPremoved5year_tte.tsv', sep='\t', index=True)

<br>

<br>

## 10 year


### running models

In [None]:
# LightGBM, 10-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved10year_tte, y=y_train_IOPremoved10year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_IOPremoved10year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# XGBoost, 10-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved10year_tte, y=y_train_IOPremoved10year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_IOPremoved10year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Random forest, 10-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved10year_tte, y=y_train_IOPremoved10year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_IOPremoved10year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# K-nearest neighbours, 10-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved10year_tte, y=y_train_IOPremoved10year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_IOPremoved10year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Support vector machine, 10-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved10year_tte, y=y_train_IOPremoved10year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_IOPremoved10year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Logistic regression (SGD objective), 10-year horizon
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved10year_tte, y=y_train_IOPremoved10year_tte,
    feature_dict=model_feature_dict, n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_IOPremoved10year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Directories used by the refit and reporting cells that follow.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Optuna results sub-directory name -> estimator class to refit (10-year horizon).
algorithms = dict([
    ('logistic_regression_sgd_IOPremoved10year_tte', sklearn.linear_model.SGDClassifier),
    ('svm_IOPremoved10year_tte', sklearn.svm.SVC),
    ('knn_IOPremoved10year_tte', sklearn.neighbors.KNeighborsClassifier),
    ('randomforest_IOPremoved10year_tte', sklearn.ensemble.RandomForestClassifier),
    ('xgboost_IOPremoved10year_tte', xgb.XGBClassifier),
    ('lightgbm_IOPremoved10year_tte', lgb.LGBMClassifier),
])

In [None]:
# Refit every algorithm with its best Optuna hyperparameters on the entire 10-year
# training set, then save the fitted estimator plus a text copy of the parameters.
# `model_name` is the feature-set key from model_feature_dict; `algorithm` is the key
# from `algorithms`, which is also the Optuna results sub-directory name.

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = f'{fitted_models_dir}/{model_name}'
    # exist_ok replaces the separate exists() check (and the race it leaves open).
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_IOPremoved10year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # 'all_params' is read from the best trial's user attributes and passed
        # verbatim to the estimator constructor below.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_IOPremoved10year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# now go to model evaluation

<br>

### getting minimal feature set

In [None]:
# Recursive feature elimination with cross-validation for the 10-year horizon.
# The original indexed model_feature_dict['ODSL'], but that key is commented out
# where model_feature_dict is built, so the lookup raised KeyError. 'DSL' is used
# instead, matching the 3-year and 5-year cells.
# NOTE(review): presumably ODSL without the removed ophthalmic features is the same
# set as DSL; confirm against model_evaluation.ipynb before relying on the result.

rfe_obj_IOPremoved10year_tte = RFECV(
    estimator=LogisticRegression(
        penalty='l2',
        C=1.0,
        solver='lbfgs',
        max_iter=1000,
        random_state=2024,
        n_jobs=-1
    ),
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    n_jobs=1,
    step=1,
    verbose=100,
    min_features_to_select=5
)

rfe_obj_IOPremoved10year_tte.fit(
    X_train_imputed_scaled_IOPremoved10year_tte[model_feature_dict['DSL']], 
    y_train_IOPremoved10year_tte
)

# Report how many features were kept and which ones (mask applied to the same
# feature list the selector was fitted on).
print(f"Optimal number of features: {rfe_obj_IOPremoved10year_tte.n_features_}")
print(f"Selected features: {np.array(model_feature_dict['DSL'])[rfe_obj_IOPremoved10year_tte.support_]}")

In [None]:
# Persist the fitted 10-year RFECV selector.
dump(value=rfe_obj_IOPremoved10year_tte, filename='./rfecv_fitted_IOPremoved10year_tte.pkl')

In [None]:
# Get minimal features selected by RFECV.
# The original indexed model_feature_dict['ODSL'], a key that is commented out where
# the dict is built (KeyError). Reading the names from the fitted selector avoids
# depending on any dict key and always matches the columns it was fitted on.
# NOTE(review): assumes the selector was fitted on a DataFrame with string column
# names, so that feature names are available — confirm.
minimal_features = rfe_obj_IOPremoved10year_tte.get_feature_names_out()
minimal_features

In [None]:
# Manual step: copy the features printed above into feature_sets, then restart the kernel.
feature_dict = dict([
    ('minimal_features_rfecv_IOPremoved10year_tte',
     feature_sets.minimal_features_rfecv_IOPremoved10year_tte['feature'].values),
])
feature_dict

In [None]:
# Logistic regression (SGD objective) on the RFECV-selected 10-year feature set, 1000 trials.
run_optuna_studies(
    X=X_train_imputed_scaled_IOPremoved10year_tte, y=y_train_IOPremoved10year_tte,
    feature_dict={'minimal_features_rfecv_IOPremoved10year_tte':
                  feature_sets.minimal_features_rfecv_IOPremoved10year_tte['feature'].values},
    n_trials=1000,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_IOPremoved10year_tte',
    n_cv_folds=n_cv_folds, scoring_metric=scoring_metric,  # kwargs passed to study class
)

In [None]:
# Optuna results sub-directory name -> estimator class (10-year horizon).
# The KNN entry stays disabled, as in the original cell.
algorithms = dict([
    ('logistic_regression_sgd_IOPremoved10year_tte', sklearn.linear_model.SGDClassifier),
    ('svm_IOPremoved10year_tte', sklearn.svm.SVC),
    # ('knn_IOPremoved10year_tte', sklearn.neighbors.KNeighborsClassifier),
    ('randomforest_IOPremoved10year_tte', sklearn.ensemble.RandomForestClassifier),
    ('xgboost_IOPremoved10year_tte', xgb.XGBClassifier),
    ('lightgbm_IOPremoved10year_tte', lgb.LGBMClassifier),
])

In [None]:
feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Build one (Algorithm, Hyperparameter, <feature set>) table per feature set from
# the best trial of each saved study.
for model_name in model_feature_dict:
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params
        #best_params = study.best_trial.user_attrs['all_params']

        for k, v in best_params.items():
            if isinstance(v, float):
                # Floats are shown rounded to 4 significant figures.
                shown = '%s' % float('%.4g' % v)
            else:
                # Strings, ints, bools and None are written verbatim: '%.4g' raises
                # TypeError on None and would round ints to 4 significant figures.
                shown = str(v)
            rows.append([algorithm, k, shown])

    # Build the frame once from the collected rows instead of growing it row by row.
    feature_set_dfs.append(
        pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    )

# Combine and save results: outer-join the per-feature-set tables, then order the
# rows by the order of `algorithms`.
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
combo_df = combo_df.set_index('Algorithm')
combo_df = combo_df.loc[list(algorithms)]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_IOPremoved10year_tte.tsv', sep='\t', index=True)