In [None]:
import pandas as pd
import optuna
from optuna import pruners
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.samplers import TPESampler

import imblearn
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTEENN

import sklearn
import sklearn.ensemble
import sklearn.linear_model
import sklearn.neighbors
import sklearn.svm

import xgboost as xgb
import lightgbm as lgb

import numpy as np
import matplotlib.pyplot as plt
import importlib

from joblib import dump, load
import os
import math
from functools import reduce

import torch
import torch.nn as nn
from torch.nn import ReLU
import random

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression


import sys
sys.path.append('../')

import model_util
importlib.reload(model_util)
from model_util import get_scoring_metrics

import optuna_util
importlib.reload(optuna_util)
from optuna_util import run_optuna_studies

import feature_sets
importlib.reload(feature_sets)



#from sklearnex import patch_sklearn
#patch_sklearn()

In [None]:
# Model name -> values of the `feature` column of the matching
# `feature_sets.<name>_features` table. The order of the names below fixes the
# iteration order of every loop over this dict.
model_feature_dict = {
    set_name: getattr(feature_sets, f'{set_name}_features')['feature'].values
    for set_name in (
        'ophthalmic', 'demographic', 'systemic', 'lifestyle',
        'OD', 'SL', 'ODSL', 'DSL',
    )
}
    

# Training data, one (X, y) pair per horizon, read from ../data/imputed.
# NOTE(review): joblib.load unpickles these files; only load files from a trusted source.

# 3-year horizon
X_train_imputed_scaled_3year_tte = load(
    '../data/imputed/IOPsubcohort_X_train_imputed_scaled_3year_tte.pkl'
)
y_train_3year_tte = load(
    '../data/imputed/IOPsubcohort_y_train_3year_tte.pkl'
)

# 5-year horizon
X_train_imputed_scaled_5year_tte = load(
    '../data/imputed/IOPsubcohort_X_train_imputed_scaled_5year_tte.pkl'
)
y_train_5year_tte = load(
    '../data/imputed/IOPsubcohort_y_train_5year_tte.pkl'
)

# 10-year horizon
X_train_imputed_scaled_10year_tte = load(
    '../data/imputed/IOPsubcohort_X_train_imputed_scaled_10year_tte.pkl'
)
y_train_10year_tte = load(
    '../data/imputed/IOPsubcohort_y_train_10year_tte.pkl'
)

# Settings reused by the run_optuna_studies calls below.
n_trials = 100              # passed as `n_trials`
n_cv_folds = 5              # passed as `n_cv_folds`
scoring_metric = 'roc_auc'  # passed as `scoring_metric`

## 3 year

### models

In [None]:
# LightGBM: run the Optuna studies on the 3-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_3year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: run the Optuna studies on the 3-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_3year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: run the Optuna studies on the 3-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_3year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: run the Optuna studies on the 3-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_3year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector machine: run the Optuna studies on the 3-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_3year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD objective): run the Optuna studies on the 3-year
# training data, using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_3year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Where the saved Optuna studies are read from and where refitted models are written.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# 3-year study folder name -> estimator class instantiated with the tuned parameters.
algorithms = {
    'logistic_regression_sgd_3year_tte': sklearn.linear_model.SGDClassifier,
    'svm_3year_tte': sklearn.svm.SVC,
    'knn_3year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_3year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_3year_tte': xgb.XGBClassifier,
    'lightgbm_3year_tte': lgb.LGBMClassifier,
}

In [None]:
# Refit every algorithm with its best hyperparameters on the entire 3-year train set,
# then save the fitted model together with a text dump of the parameters used.

for model_name, feature_set in model_feature_dict.items():
    print(feature_set)
    # os.path.join avoids the doubled '/' that string concatenation produced
    # (fitted_models_dir already ends with a slash).
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok=True replaces the separate existence check (no check-then-create race).
    os.makedirs(model_save_dir, exist_ok=True)

    # Restrict the training matrix to the columns of this feature set.
    X = X_train_imputed_scaled_3year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(algorithm)
        print(f'Fitting {model_name} {algorithm}')

        # Saved study for this algorithm / feature-set pair.
        study_path = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_path)
        print(study)
        # 'all_params' is passed unchanged to the estimator constructor below.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt (one "name: value" line per parameter)
        with open(os.path.join(model_save_dir, f'{algorithm}_best_params.txt'), 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_3year_tte)
        dump(estimator, os.path.join(model_save_dir, f'{algorithm}.pkl'))

In [None]:
# Next: go to model evaluation (model_evaluation.ipynb) before continuing with the minimal feature set below.

### minimal feature set

In [None]:
# Recursive feature elimination with cross-validation on the OD feature set.
# Logistic regression is the base estimator because LR with OD is the best
# predictor as of model_evaluation.ipynb.
rfe_obj_3year_tte = RFECV(
    estimator=LogisticRegression(
        solver='lbfgs',
        penalty='l2',
        C=1.0,
        class_weight='balanced',
        max_iter=1000,
        n_jobs=-1,
        random_state=2024,
    ),
    # drop one feature per step and keep at least five
    step=1,
    min_features_to_select=5,
    # ROC AUC over five shuffled, stratified folds with a fixed seed
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)

rfe_obj_3year_tte.fit(
    X_train_imputed_scaled_3year_tte[model_feature_dict['OD']],
    y_train_3year_tte,
)

#### change feature set to applicable one
print(f"Optimal number of features: {rfe_obj_3year_tte.n_features_}")
print(f"Selected features: {np.array(model_feature_dict['OD'])[rfe_obj_3year_tte.support_]}")

# Keep the fitted selector on disk.
dump(rfe_obj_3year_tte, './rfecv_fitted_3year_tte.pkl')

In [None]:
# Names of the OD features kept by the fitted RFECV (support_ is used as a mask).
od_feature_names_3year = np.array(model_feature_dict['OD'])
minimal_features = od_feature_names_3year[rfe_obj_3year_tte.support_]
minimal_features

In [None]:
# Manual step: add the features printed above to feature_sets
# (minimal_features_rfecv_3year_tte), then restart the kernel before running this cell.
feature_dict = {
    'minimal_features_rfecv_3year_tte':
        feature_sets.minimal_features_rfecv_3year_tte['feature'].values,
}
feature_dict

In [None]:
# Logistic regression (SGD objective) on the RFECV-selected minimal feature set,
# 3-year training data, with 1000 trials instead of the shared n_trials.
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict={
        'minimal_features_rfecv_3year_tte':
            feature_sets.minimal_features_rfecv_3year_tte['feature'].values,
    },
    n_trials=1000,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_3year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Build one table of the best sampled hyperparameters for the 3-year studies:
# one row per (algorithm, hyperparameter), one column per feature set, saved as TSV.
algorithms = {
    'logistic_regression_sgd_3year_tte': sklearn.linear_model.SGDClassifier,
    'svm_3year_tte': sklearn.svm.SVC,
    'knn_3year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_3year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_3year_tte': xgb.XGBClassifier,
    'lightgbm_3year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

for model_name in model_feature_dict:
    # Collect plain rows first and build the frame once, instead of growing it row by row.
    rows = []

    for algorithm in algorithms:
        study_path = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_path)
        # Uses the trial's `params`; the refit cell reads user_attrs['all_params'] instead.
        best_params = study.best_trial.params

        for k, v in best_params.items():
            if isinstance(v, float):
                # Floats are shortened to 4 significant figures for the report.
                shown = '%s' % float('%.4g' % v)
            else:
                # Strings, ints, bools and None are reported verbatim. Pushing them through
                # the float formatter crashed on None, turned True into '1.0' and rounded ints.
                shown = str(v)
            rows.append([algorithm, k, shown])

    feature_set_dfs.append(
        pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    )

# Combine and save results
combo_df = reduce(
    lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'),
    feature_set_dfs,
)
combo_df = combo_df.set_index('Algorithm')
# Present the algorithms in the order they are declared above.
combo_df = combo_df.loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_3year_tte.tsv', sep='\t', index=True)

<br>

## 5 year

### models

In [None]:
# LightGBM: run the Optuna studies on the 5-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_5year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: run the Optuna studies on the 5-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_5year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: run the Optuna studies on the 5-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_5year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: run the Optuna studies on the 5-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_5year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector machine: run the Optuna studies on the 5-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_5year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD objective): run the Optuna studies on the 5-year
# training data, using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_5year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Where the saved Optuna studies are read from and where refitted models are written.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# 5-year study folder name -> estimator class instantiated with the tuned parameters.
algorithms = {
    'logistic_regression_sgd_5year_tte': sklearn.linear_model.SGDClassifier,
    'svm_5year_tte': sklearn.svm.SVC,
    'knn_5year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_5year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_5year_tte': xgb.XGBClassifier,
    'lightgbm_5year_tte': lgb.LGBMClassifier,
}

In [None]:
# Refit every algorithm with its best hyperparameters on the entire 5-year train set,
# then save the fitted model together with a text dump of the parameters used.

for model_name, feature_set in model_feature_dict.items():
    print(feature_set)
    # os.path.join avoids the doubled '/' that string concatenation produced
    # (fitted_models_dir already ends with a slash).
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok=True replaces the separate existence check (no check-then-create race).
    os.makedirs(model_save_dir, exist_ok=True)

    # Restrict the training matrix to the columns of this feature set.
    X = X_train_imputed_scaled_5year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(algorithm)
        print(f'Fitting {model_name} {algorithm}')

        # Saved study for this algorithm / feature-set pair.
        study_path = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_path)
        print(study)
        # 'all_params' is passed unchanged to the estimator constructor below.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt (one "name: value" line per parameter)
        with open(os.path.join(model_save_dir, f'{algorithm}_best_params.txt'), 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_5year_tte)
        dump(estimator, os.path.join(model_save_dir, f'{algorithm}.pkl'))

In [None]:
# Next: go to model evaluation (model_evaluation.ipynb) before continuing with the minimal feature set below.

### minimal feature set

In [None]:
# Recursive feature elimination with cross-validation on the ODSL feature set.
# XGBoost is the base estimator because xgboost with ODSL is the best
# predictor as of model_evaluation.ipynb.
rfe_obj_5year_tte = RFECV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic',
        tree_method='hist',
        verbosity=0,
        n_jobs=-1,
        random_state=2024,
    ),
    # drop one feature per step and keep at least five
    step=1,
    min_features_to_select=5,
    # ROC AUC over five shuffled, stratified folds with a fixed seed
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)

rfe_obj_5year_tte.fit(
    X_train_imputed_scaled_5year_tte[model_feature_dict['ODSL']],
    y_train_5year_tte,
)

#### change feature set to applicable one
print(f"Optimal number of features: {rfe_obj_5year_tte.n_features_}")
print(f"Selected features: {np.array(model_feature_dict['ODSL'])[rfe_obj_5year_tte.support_]}")

# Keep the fitted selector on disk.
dump(rfe_obj_5year_tte, './rfecv_fitted_5year_tte.pkl')

In [None]:
# Names of the ODSL features kept by the fitted RFECV (support_ is used as a mask).
odsl_feature_names_5year = np.array(model_feature_dict['ODSL'])
minimal_features = odsl_feature_names_5year[rfe_obj_5year_tte.support_]
minimal_features

In [None]:
# Manual step: add the features printed above to feature_sets
# (minimal_features_rfecv_5year_tte), then restart the kernel before running this cell.
feature_dict = {
    'minimal_features_rfecv_5year_tte':
        feature_sets.minimal_features_rfecv_5year_tte['feature'].values,
}
feature_dict

In [None]:
# Class balance of the 5-year training labels: count each class once, then report.
n_train_total = len(y_train_5year_tte)
n_train_controls = (y_train_5year_tte == 0).sum()
n_train_cases = (y_train_5year_tte == 1).sum()

print("=== TRAINING DATA ===")
print(f"Total training samples: {n_train_total}")
print(f"Training controls (y=0): {n_train_controls}")
print(f"Training cases (y=1): {n_train_cases}")
print(f"Training case prevalence: {n_train_cases/n_train_total:.3%}")


In [None]:
## CHANGE TO APPLICABLE MODEL
# XGBoost on the RFECV-selected minimal feature set, 5-year training data,
# with 1000 trials instead of the shared n_trials.
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict={
        'minimal_features_rfecv_5year_tte':
            feature_sets.minimal_features_rfecv_5year_tte['feature'].values,
    },
    n_trials=1000,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_5year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Build one table of the best sampled hyperparameters for the 5-year studies:
# one row per (algorithm, hyperparameter), one column per feature set, saved as TSV.
algorithms = {
    'logistic_regression_sgd_5year_tte': sklearn.linear_model.SGDClassifier,
    'svm_5year_tte': sklearn.svm.SVC,
    'knn_5year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_5year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_5year_tte': xgb.XGBClassifier,
    'lightgbm_5year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

for model_name in model_feature_dict:
    # Collect plain rows first and build the frame once, instead of growing it row by row.
    rows = []

    for algorithm in algorithms:
        study_path = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_path)
        # Uses the trial's `params`; the refit cell reads user_attrs['all_params'] instead.
        best_params = study.best_trial.params

        for k, v in best_params.items():
            if isinstance(v, float):
                # Floats are shortened to 4 significant figures for the report.
                shown = '%s' % float('%.4g' % v)
            else:
                # Strings, ints, bools and None are reported verbatim. Pushing them through
                # the float formatter crashed on None, turned True into '1.0' and rounded ints.
                shown = str(v)
            rows.append([algorithm, k, shown])

    feature_set_dfs.append(
        pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    )

# Combine and save results
combo_df = reduce(
    lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'),
    feature_set_dfs,
)
combo_df = combo_df.set_index('Algorithm')
# Present the algorithms in the order they are declared above.
combo_df = combo_df.loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_5year_tte.tsv', sep='\t', index=True)

<br>

## 10 year

### models

In [None]:
# LightGBM: run the Optuna studies on the 10-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_10year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: run the Optuna studies on the 10-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_10year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: run the Optuna studies on the 10-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_10year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: run the Optuna studies on the 10-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_10year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector machine: run the Optuna studies on the 10-year training data,
# using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_10year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD objective): run the Optuna studies on the 10-year
# training data, using the feature sets in model_feature_dict.
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_10year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Where the saved Optuna studies are read from and where refitted models are written.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# 10-year study folder name -> estimator class instantiated with the tuned parameters.
algorithms = {
    'logistic_regression_sgd_10year_tte': sklearn.linear_model.SGDClassifier,
    'svm_10year_tte': sklearn.svm.SVC,
    'knn_10year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_10year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_10year_tte': xgb.XGBClassifier,
    'lightgbm_10year_tte': lgb.LGBMClassifier,
}

In [None]:
# Refit every algorithm with its best hyperparameters on the entire 10-year train set,
# then save the fitted model together with a text dump of the parameters used.

for model_name, feature_set in model_feature_dict.items():
    print(feature_set)
    # os.path.join avoids the doubled '/' that string concatenation produced
    # (fitted_models_dir already ends with a slash).
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok=True replaces the separate existence check (no check-then-create race).
    os.makedirs(model_save_dir, exist_ok=True)

    # Restrict the training matrix to the columns of this feature set.
    X = X_train_imputed_scaled_10year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(algorithm)
        print(f'Fitting {model_name} {algorithm}')

        # Saved study for this algorithm / feature-set pair.
        study_path = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_path)
        print(study)
        # 'all_params' is passed unchanged to the estimator constructor below.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt (one "name: value" line per parameter)
        with open(os.path.join(model_save_dir, f'{algorithm}_best_params.txt'), 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_10year_tte)
        dump(estimator, os.path.join(model_save_dir, f'{algorithm}.pkl'))

In [None]:
# Next: go to model evaluation (model_evaluation.ipynb) before continuing with the minimal feature set below.

### minimal feature set

In [None]:
# Recursive feature elimination with cross-validation on the OD feature set.
# LightGBM is the base estimator because lgbm with OD is the best
# predictor as of model_evaluation.ipynb.
rfe_obj_10year_tte = RFECV(
    estimator=lgb.LGBMClassifier(
        objective='binary',
        boosting_type='gbdt',
        verbose=-1,  # silence LightGBM's own logging
        n_jobs=-1,
        random_state=2024,
    ),
    # drop one feature per step and keep at least five
    step=1,
    min_features_to_select=5,
    # ROC AUC over five shuffled, stratified folds with a fixed seed
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)

rfe_obj_10year_tte.fit(
    X_train_imputed_scaled_10year_tte[model_feature_dict['OD']],
    y_train_10year_tte,
)

#### change feature set to applicable one
print(f"Optimal number of features: {rfe_obj_10year_tte.n_features_}")
print(f"Selected features: {np.array(model_feature_dict['OD'])[rfe_obj_10year_tte.support_]}")

# Keep the fitted selector on disk.
dump(rfe_obj_10year_tte, './rfecv_fitted_10year_tte.pkl')

In [None]:
# Names of the OD features kept by the fitted RFECV (support_ is used as a mask).
od_feature_names_10year = np.array(model_feature_dict['OD'])
minimal_features = od_feature_names_10year[rfe_obj_10year_tte.support_]
minimal_features

In [None]:
# Manual step: add the features printed above to feature_sets
# (minimal_features_rfecv_10year_tte), then restart the kernel before running this cell.
feature_dict = {
    'minimal_features_rfecv_10year_tte':
        feature_sets.minimal_features_rfecv_10year_tte['feature'].values,
}
feature_dict

In [None]:
## CHANGE TO APPLICABLE MODEL
# LightGBM on the RFECV-selected minimal feature set, 10-year training data,
# with 1000 trials instead of the shared n_trials.
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict={
        'minimal_features_rfecv_10year_tte':
            feature_sets.minimal_features_rfecv_10year_tte['feature'].values,
    },
    n_trials=1000,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_10year_tte',
    # remaining keyword arguments are passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Build one table of the best sampled hyperparameters for the 10-year studies:
# one row per (algorithm, hyperparameter), one column per feature set, saved as TSV.
algorithms = {
    'logistic_regression_sgd_10year_tte': sklearn.linear_model.SGDClassifier,
    'svm_10year_tte': sklearn.svm.SVC,
    'knn_10year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_10year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_10year_tte': xgb.XGBClassifier,
    'lightgbm_10year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

for model_name in model_feature_dict:
    # Collect plain rows first and build the frame once, instead of growing it row by row.
    rows = []

    for algorithm in algorithms:
        study_path = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_path)
        # Uses the trial's `params`; the refit cell reads user_attrs['all_params'] instead.
        best_params = study.best_trial.params

        for k, v in best_params.items():
            if isinstance(v, float):
                # Floats are shortened to 4 significant figures for the report.
                shown = '%s' % float('%.4g' % v)
            else:
                # Strings, ints, bools and None are reported verbatim. Pushing them through
                # the float formatter crashed on None, turned True into '1.0' and rounded ints.
                shown = str(v)
            rows.append([algorithm, k, shown])

    feature_set_dfs.append(
        pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    )

# Combine and save results
combo_df = reduce(
    lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'),
    feature_set_dfs,
)
combo_df = combo_df.set_index('Algorithm')
# Present the algorithms in the order they are declared above.
combo_df = combo_df.loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_10year_tte.tsv', sep='\t', index=True)