In [None]:
import numpy as np
import csv
import pandas as pd
import miceforest as mf

import joblib
from joblib import load

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import scipy.stats as stats

import importlib
import sys
sys.path.append('../')
import feature_sets_rnfl
importlib.reload(feature_sets_rnfl)

import pandas as pd
import optuna
from optuna import pruners
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.samplers import TPESampler

import imblearn
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from imblearn.combine import SMOTEENN

import sklearn

import xgboost as xgb
import lightgbm as lgb

import numpy as np
import matplotlib.pyplot as plt
import importlib

from joblib import dump, load
import os
import math
from functools import reduce

import torch
import torch.nn as nn
from torch.nn import ReLU
import random

from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE, RFECV, SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression



import optuna_util
importlib.reload(optuna_util)
from optuna_util import run_optuna_studies


import model_util
importlib.reload(model_util)

#/usr/bin/python3

### Adding GCL and RNFL

In [None]:
# Load the merged incidence table from the CSV one directory up.
# (An earlier revision read a pickled frame from an absolute shared-folder path instead.)
merged_df = pd.read_csv('../incidence_merged_df.csv')

# Names of the ODSL features, taken from the 'feature' column of the feature-set table.
odsl_feature_list = feature_sets_rnfl.ODSL_features['feature'].values

merged_df

In [None]:
# Cast every ODSL feature whose coding type is 'binary' or 'nominal' to pandas 'category'.
odsl_feature_table = feature_sets_rnfl.ODSL_features
categorical_mask = odsl_feature_table['coding_type'].isin(['binary', 'nominal'])
odsl_categorical_features = odsl_feature_table.loc[categorical_mask, 'feature'].values

merged_df[odsl_categorical_features] = merged_df[odsl_categorical_features].astype('category')

In [None]:
# Source table for the RNFL and GCL columns.
# NOTE(review): read_pickle executes pickled code - only load files from a trusted source.
raw_data = pd.read_pickle(
    '../data/derived/mixed_derived_and_extracted_merged.pkl'
)

In [None]:
# Pull the two raw fields and give them readable names.
oct_field_names = {'f.28500.0.0': 'RNFL', 'f.28504.0.0': 'GCL'}
rnfl_gcl = raw_data[list(oct_field_names)].rename(columns=oct_field_names)

# Merge into merged_df.
# Any RNFL/GCL columns already present are dropped first so that re-running this cell
# replaces them instead of appending duplicate columns (duplicates would make
# merged_df['RNFL'] a DataFrame and break the count cells below).
# NOTE(review): concat(axis=1) aligns rows on the index - this assumes raw_data and
# merged_df share the same row index; TODO confirm.
merged_df = pd.concat(
    [merged_df.drop(columns=list(oct_field_names.values()), errors='ignore'), rnfl_gcl],
    axis=1,
)

In [None]:
# Keep only rows flagged as belonging to the IOP subcohort.
in_iop_subcohort = merged_df['IOP subcohort'] == 1
IOP_subcohort_df = merged_df.loc[in_iop_subcohort]  # author's recorded count: 112,156 individuals
IOP_subcohort_df.shape[0]

In [None]:
# Number of subcohort rows with a non-missing RNFL value (author's recorded total: 63,312).
IOP_subcohort_df['RNFL'].notna().sum()

In [None]:
# Number of subcohort rows with a non-missing GCL value (author's recorded total: 63,312).
IOP_subcohort_df['GCL'].notna().sum()

In [None]:
# Demographics of the 3-year glaucoma cases.
# Both summaries are returned in one Series: a notebook cell only displays its last
# expression, so the Sex mean used to be computed and silently discarded.
glaucoma_3year_df = IOP_subcohort_df[IOP_subcohort_df['tte_3year'] == 'Glaucoma']
pd.Series({
    # Mean of the integer-cast Sex column (which code maps to which sex is not shown here).
    'Sex (mean of integer coding)': glaucoma_3year_df['Sex'].astype(int).mean(),
    'Age at initial assesement (mean)': glaucoma_3year_df['Age at initial assesement'].mean(),
})



<br>

---

# imputation:



## 3 year

In [None]:
# 80/20 train/test split for the 3-year outcome, done separately for cases and controls.
outcome_col = 'tte_3year'
split_col = 'tte_3year_80_20_split'
split_kwargs = dict(test_size=0.2, random_state=42)

# Cases and controls are selected by label; rows with any other value are left out.
cases = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Glaucoma']
controls = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Control']

# Each group is split on its own; the stratify column holds a single class in each call.
case_train, case_test = train_test_split(cases, stratify=cases[outcome_col], **split_kwargs)
control_train, control_test = train_test_split(controls, stratify=controls[outcome_col], **split_kwargs)

# Stack cases on top of controls and tag every row with the split it belongs to.
train_df = pd.concat([case_train, control_train]).assign(**{split_col: 'train'})
test_df = pd.concat([case_test, control_test]).assign(**{split_col: 'test'})

# NOTE(review): this rebinds IOP_subcohort_df to the labelled rows only (train rows first,
# then test rows), so later cells see a reordered and possibly smaller frame.
IOP_subcohort_df = pd.concat([train_df, test_df])

# QC: overall split proportions and the case/control mix inside each split.
split_labels = IOP_subcohort_df[split_col]
print("Final Split Proportions:")
print(split_labels.value_counts(normalize=True))

print("\nCase/Control Distribution in Each Split:")
print(pd.crosstab(split_labels, IOP_subcohort_df[outcome_col], normalize='index'))

In [None]:
# Feature/label frames for the 3-year outcome, built by the project helper from the
# split column, the outcome column and the ODSL feature names.
datasets_3year_tte = model_util.get_train_test_datasets(
    IOP_subcohort_df, 'tte_3year_80_20_split', 'tte_3year', odsl_feature_list
)
X_train_3year_tte, y_train_3year_tte, X_test_3year_tte, y_test_3year_tte = datasets_3year_tte

In [None]:
# Class counts for the 3-year training labels.
n_train_controls = (y_train_3year_tte == 0).sum()
n_train_cases = (y_train_3year_tte == 1).sum()

print("=== TRAINING DATA ===")
print(f"Total training samples: {len(y_train_3year_tte)}")
print(f"Training controls (y=0): {n_train_controls}")
print(f"Training cases (y=1): {n_train_cases}")

# Same summary for the held-out test labels.
n_test_controls = (y_test_3year_tte == 0).sum()
n_test_cases = (y_test_3year_tte == 1).sum()

print("\n=== TEST DATA ===")
print(f"Total test samples: {len(y_test_3year_tte)}")
print(f"Test controls (y=0): {n_test_controls}")
print(f"Test cases (y=1): {n_test_cases}")

In [None]:
# Imputation kernel built on the 3-year training features only.
kernel_settings = dict(
    num_datasets=1,
    random_state=2024,
    mean_match_strategy='normal',
    mean_match_candidates=10,
    save_all_iterations_data=True,
    imputation_order='descending',
)
imputation_kernel_3year_tte = mf.ImputationKernel(X_train_3year_tte, **kernel_settings)

In [None]:
# Run 13 MICE iterations on the 3-year kernel.
imputation_kernel_3year_tte.mice(
    verbose=True,
    iterations=13,

    # LGBM parameters
    n_estimators=200,
    max_bin=512,
    # max_depth=10,
    # num_leaves=1023,
    # learning_rate=0.1,
)

# Persist the fitted kernel next to the other imputation artefacts.
# The path previously started with './data/', unlike every other dump/load in this
# notebook (all '../data/imputed/'), so the kernel was written to a different directory
# or failed when that directory did not exist.
joblib.dump(imputation_kernel_3year_tte, '../data/imputed/imputation_kernel_13iter_rnfl3year_tte.pkl')

In [None]:
# Complete the training data with the fitted kernel, then apply the same kernel to the
# held-out test features.
X_train_imputed_3year_tte = imputation_kernel_3year_tte.complete_data().reset_index(drop=True)
X_test_imputed_3year_tte = imputation_kernel_3year_tte.impute_new_data(X_test_3year_tte).complete_data().reset_index(drop=True)

joblib.dump(X_train_imputed_3year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_rnfl3year_tte.pkl')
joblib.dump(X_test_imputed_3year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_rnfl3year_tte.pkl')

# Labels are saved with a fresh 0..n-1 index, matching the re-indexed feature frames.
joblib.dump(y_train_3year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_train_rnfl3year_tte.pkl')
joblib.dump(y_test_3year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_test_rnfl3year_tte.pkl')

# Merged view: training rows first, then test rows.
X_merged_imputed_3year_tte = pd.concat((X_train_imputed_3year_tte, X_test_imputed_3year_tte), ignore_index=True)
y_merged_3year_tte = pd.concat((y_train_3year_tte, y_test_3year_tte), ignore_index=True)

joblib.dump(X_merged_imputed_3year_tte, '../data/imputed/IOPsubcohort_X_merged_imputed_rnfl3year_tte.pkl')
joblib.dump(y_merged_3year_tte, '../data/imputed/IOPsubcohort_y_merged_rnfl3year_tte.pkl')


# Apply scaling: the scaler is fitted on the training split only and reused for the test split.

scaler = MinMaxScaler()
scaler.fit(X_train_imputed_3year_tte)

# Column labels are taken from the frames actually passed to the scaler.
X_train_scaled_3year_tte = pd.DataFrame(scaler.transform(X_train_imputed_3year_tte), columns=X_train_imputed_3year_tte.columns)
X_test_scaled_3year_tte = pd.DataFrame(scaler.transform(X_test_imputed_3year_tte), columns=X_test_imputed_3year_tte.columns)

# Every section of this notebook writes min_max_scaler.pkl, so that file only ever holds
# the scaler of whichever section ran last. A dataset-specific copy is saved as well; the
# shared file is still written for anything that already loads it.
joblib.dump(scaler, '../data/imputed/min_max_scaler_rnfl3year_tte.pkl')
joblib.dump(scaler, '../data/imputed/min_max_scaler.pkl')
joblib.dump(X_train_scaled_3year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_scaled_rnfl3year_tte.pkl')
joblib.dump(X_test_scaled_3year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_scaled_rnfl3year_tte.pkl')

<br>

## 5 year - restart the kernel and skip the 3 year section before running this (the 3 year split cell overwrites `IOP_subcohort_df`)

In [None]:
# 80/20 train/test split for the 5-year outcome, done separately for cases and controls.
outcome_col = 'tte_5year'
split_col = 'tte_5year_80_20_split'
split_kwargs = dict(test_size=0.2, random_state=42)

# Cases and controls are selected by label; rows with any other value are left out.
cases = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Glaucoma']
controls = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Control']

# Each group is split on its own; the stratify column holds a single class in each call.
case_train, case_test = train_test_split(cases, stratify=cases[outcome_col], **split_kwargs)
control_train, control_test = train_test_split(controls, stratify=controls[outcome_col], **split_kwargs)

# Stack cases on top of controls and tag every row with the split it belongs to.
train_df = pd.concat([case_train, control_train]).assign(**{split_col: 'train'})
test_df = pd.concat([case_test, control_test]).assign(**{split_col: 'test'})

# NOTE(review): this rebinds IOP_subcohort_df to the labelled rows only (train rows first,
# then test rows), so later cells see a reordered and possibly smaller frame.
IOP_subcohort_df = pd.concat([train_df, test_df])

# QC: overall split proportions and the case/control mix inside each split.
split_labels = IOP_subcohort_df[split_col]
print("Final Split Proportions:")
print(split_labels.value_counts(normalize=True))

print("\nCase/Control Distribution in Each Split:")
print(pd.crosstab(split_labels, IOP_subcohort_df[outcome_col], normalize='index'))

In [None]:
# How many rows carry each 5-year outcome label.
IOP_subcohort_df.loc[:, 'tte_5year'].value_counts()

In [None]:
# Feature/label frames for the 5-year outcome, built by the project helper from the
# split column, the outcome column and the ODSL feature names.
datasets_5year_tte = model_util.get_train_test_datasets(
    IOP_subcohort_df, 'tte_5year_80_20_split', 'tte_5year', odsl_feature_list
)
X_train_5year_tte, y_train_5year_tte, X_test_5year_tte, y_test_5year_tte = datasets_5year_tte

In [None]:
# Class counts and case prevalence for the 5-year training labels.
n_train_total = len(y_train_5year_tte)
n_train_controls = (y_train_5year_tte == 0).sum()
n_train_cases = (y_train_5year_tte == 1).sum()

print("=== TRAINING DATA ===")
print(f"Total training samples: {n_train_total}")
print(f"Training controls (y=0): {n_train_controls}")
print(f"Training cases (y=1): {n_train_cases}")
print(f"Training case prevalence: {n_train_cases/n_train_total:.3%}")

# Same summary for the held-out test labels.
n_test_total = len(y_test_5year_tte)
n_test_controls = (y_test_5year_tte == 0).sum()
n_test_cases = (y_test_5year_tte == 1).sum()

print("\n=== TEST DATA ===")
print(f"Total test samples: {n_test_total}")
print(f"Test controls (y=0): {n_test_controls}")
print(f"Test cases (y=1): {n_test_cases}")
print(f"Test case prevalence: {n_test_cases/n_test_total:.3%}")

In [None]:
# Imputation kernel built on the 5-year training features only.
kernel_settings = dict(
    num_datasets=1,
    random_state=2024,
    mean_match_strategy='normal',
    mean_match_candidates=10,
    save_all_iterations_data=True,
    imputation_order='descending',
)
imputation_kernel_5year_tte = mf.ImputationKernel(X_train_5year_tte, **kernel_settings)

In [None]:
# LightGBM settings forwarded to mice() as keyword arguments
# (max_depth, num_leaves and learning_rate were tried by the author and left disabled).
lgbm_params = {'n_estimators': 200, 'max_bin': 512}

# Run 13 MICE iterations on the 5-year kernel.
imputation_kernel_5year_tte.mice(iterations=13, verbose=True, **lgbm_params)

# Persist the fitted kernel.
joblib.dump(imputation_kernel_5year_tte, '../data/imputed/imputation_kernel_13iter_rnfl5year_tte.pkl')

In [None]:
# Complete the training data with the fitted kernel, then apply the same kernel to the
# held-out test features.
X_train_imputed_5year_tte = imputation_kernel_5year_tte.complete_data().reset_index(drop=True)
X_test_imputed_5year_tte = imputation_kernel_5year_tte.impute_new_data(X_test_5year_tte).complete_data().reset_index(drop=True)

joblib.dump(X_train_imputed_5year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_rnfl5year_tte.pkl')
joblib.dump(X_test_imputed_5year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_rnfl5year_tte.pkl')

# Labels are saved with a fresh 0..n-1 index, matching the re-indexed feature frames.
joblib.dump(y_train_5year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_train_rnfl5year_tte.pkl')
joblib.dump(y_test_5year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_test_rnfl5year_tte.pkl')

# Merged view: training rows first, then test rows.
X_merged_imputed_5year_tte = pd.concat((X_train_imputed_5year_tte, X_test_imputed_5year_tte), ignore_index=True)
y_merged_5year_tte = pd.concat((y_train_5year_tte, y_test_5year_tte), ignore_index=True)

joblib.dump(X_merged_imputed_5year_tte, '../data/imputed/IOPsubcohort_X_merged_imputed_rnfl5year_tte.pkl')
joblib.dump(y_merged_5year_tte, '../data/imputed/IOPsubcohort_y_merged_rnfl5year_tte.pkl')


# Apply scaling: the scaler is fitted on the training split only and reused for the test split.

scaler = MinMaxScaler()
scaler.fit(X_train_imputed_5year_tte)

# Column labels are taken from the frames actually passed to the scaler.
X_train_scaled_5year_tte = pd.DataFrame(scaler.transform(X_train_imputed_5year_tte), columns=X_train_imputed_5year_tte.columns)
X_test_scaled_5year_tte = pd.DataFrame(scaler.transform(X_test_imputed_5year_tte), columns=X_test_imputed_5year_tte.columns)

# Every section of this notebook writes min_max_scaler.pkl, so that file only ever holds
# the scaler of whichever section ran last. A dataset-specific copy is saved as well; the
# shared file is still written for anything that already loads it.
joblib.dump(scaler, '../data/imputed/min_max_scaler_rnfl5year_tte.pkl')
joblib.dump(scaler, '../data/imputed/min_max_scaler.pkl')
joblib.dump(X_train_scaled_5year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_scaled_rnfl5year_tte.pkl')
joblib.dump(X_test_scaled_5year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_scaled_rnfl5year_tte.pkl')

<br>

## 10 year - same caveat as 5 year: restart the kernel and skip the earlier horizon sections before running this

In [None]:
# 80/20 train/test split for the 10-year outcome, done separately for cases and controls.
outcome_col = 'tte_10year'
split_col = 'tte_10year_80_20_split'
split_kwargs = dict(test_size=0.2, random_state=42)

# Cases and controls are selected by label; rows with any other value are left out.
cases = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Glaucoma']
controls = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Control']

# Each group is split on its own; the stratify column holds a single class in each call.
case_train, case_test = train_test_split(cases, stratify=cases[outcome_col], **split_kwargs)
control_train, control_test = train_test_split(controls, stratify=controls[outcome_col], **split_kwargs)

# Stack cases on top of controls and tag every row with the split it belongs to.
train_df = pd.concat([case_train, control_train]).assign(**{split_col: 'train'})
test_df = pd.concat([case_test, control_test]).assign(**{split_col: 'test'})

# NOTE(review): this rebinds IOP_subcohort_df to the labelled rows only (train rows first,
# then test rows), so later cells see a reordered and possibly smaller frame.
IOP_subcohort_df = pd.concat([train_df, test_df])

# QC: overall split proportions and the case/control mix inside each split.
split_labels = IOP_subcohort_df[split_col]
print("Final Split Proportions:")
print(split_labels.value_counts(normalize=True))

print("\nCase/Control Distribution in Each Split:")
print(pd.crosstab(split_labels, IOP_subcohort_df[outcome_col], normalize='index'))

In [None]:
# Feature/label frames for the 10-year outcome, built by the project helper from the
# split column, the outcome column and the ODSL feature names.
datasets_10year_tte = model_util.get_train_test_datasets(
    IOP_subcohort_df, 'tte_10year_80_20_split', 'tte_10year', odsl_feature_list
)
X_train_10year_tte, y_train_10year_tte, X_test_10year_tte, y_test_10year_tte = datasets_10year_tte

In [None]:
# Class counts for the 10-year training labels.
n_train_controls = (y_train_10year_tte == 0).sum()
n_train_cases = (y_train_10year_tte == 1).sum()

print("=== TRAINING DATA ===")
print(f"Total training samples: {len(y_train_10year_tte)}")
print(f"Training controls (y=0): {n_train_controls}")
print(f"Training cases (y=1): {n_train_cases}")

In [None]:
# Imputation kernel built on the 10-year training features only.
kernel_settings = dict(
    num_datasets=1,
    random_state=2024,
    mean_match_strategy='normal',
    mean_match_candidates=10,
    save_all_iterations_data=True,
    imputation_order='descending',
)
imputation_kernel_10year_tte = mf.ImputationKernel(X_train_10year_tte, **kernel_settings)

In [None]:
# LightGBM settings forwarded to mice() as keyword arguments
# (max_depth, num_leaves and learning_rate were tried by the author and left disabled).
lgbm_params = {'n_estimators': 200, 'max_bin': 512}

# Run 13 MICE iterations on the 10-year kernel.
imputation_kernel_10year_tte.mice(iterations=13, verbose=True, **lgbm_params)

# Persist the fitted kernel.
joblib.dump(imputation_kernel_10year_tte, '../data/imputed/imputation_kernel_13iter_rnfl10year_tte.pkl')

In [None]:
# Complete the training data with the fitted kernel, then apply the same kernel to the
# held-out test features.
X_train_imputed_10year_tte = imputation_kernel_10year_tte.complete_data().reset_index(drop=True)
X_test_imputed_10year_tte = imputation_kernel_10year_tte.impute_new_data(X_test_10year_tte).complete_data().reset_index(drop=True)

joblib.dump(X_train_imputed_10year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_rnfl10year_tte.pkl')
joblib.dump(X_test_imputed_10year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_rnfl10year_tte.pkl')

# Labels are saved with a fresh 0..n-1 index, matching the re-indexed feature frames.
joblib.dump(y_train_10year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_train_rnfl10year_tte.pkl')
joblib.dump(y_test_10year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_test_rnfl10year_tte.pkl')

# Merged view: training rows first, then test rows.
X_merged_imputed_10year_tte = pd.concat((X_train_imputed_10year_tte, X_test_imputed_10year_tte), ignore_index=True)
y_merged_10year_tte = pd.concat((y_train_10year_tte, y_test_10year_tte), ignore_index=True)

joblib.dump(X_merged_imputed_10year_tte, '../data/imputed/IOPsubcohort_X_merged_imputed_rnfl10year_tte.pkl')
joblib.dump(y_merged_10year_tte, '../data/imputed/IOPsubcohort_y_merged_rnfl10year_tte.pkl')


# Apply scaling: the scaler is fitted on the training split only and reused for the test split.

scaler = MinMaxScaler()
scaler.fit(X_train_imputed_10year_tte)

# Column labels are taken from the frames actually passed to the scaler.
X_train_scaled_10year_tte = pd.DataFrame(scaler.transform(X_train_imputed_10year_tte), columns=X_train_imputed_10year_tte.columns)
X_test_scaled_10year_tte = pd.DataFrame(scaler.transform(X_test_imputed_10year_tte), columns=X_test_imputed_10year_tte.columns)

# Every section of this notebook writes min_max_scaler.pkl, so that file only ever holds
# the scaler of whichever section ran last. A dataset-specific copy is saved as well; the
# shared file is still written for anything that already loads it.
joblib.dump(scaler, '../data/imputed/min_max_scaler_rnfl10year_tte.pkl')
joblib.dump(scaler, '../data/imputed/min_max_scaler.pkl')
joblib.dump(X_train_scaled_10year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_scaled_rnfl10year_tte.pkl')
# A stray '<' in front of this statement made the whole cell a SyntaxError; removed.
joblib.dump(X_test_scaled_10year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_scaled_rnfl10year_tte.pkl')



<br>

<br>

---

# IOP removed

In [None]:
# Remove the two IOP features from both the feature list and the cohort frame.
iop_feature_names = ['IOPg pre-treatment', 'IOPg pre-treatment inter-eye difference']

# Filter by name rather than by position: deleting positions [0, 1] silently removed
# whatever happened to be first, and removed two more features every time the cell was
# re-run.
# NOTE(review): this assumes the first two entries of the list were these IOP features -
# confirm against feature_sets_rnfl.
odsl_feature_list = odsl_feature_list[~np.isin(odsl_feature_list, iop_feature_names)]

# errors='ignore' keeps the cell re-runnable once the columns are already gone.
IOP_subcohort_df = IOP_subcohort_df.drop(columns=iop_feature_names, errors='ignore')



## 3 year IOP removed

In [None]:
# 80/20 train/test split for the 3-year outcome (IOP-removed run), done separately for
# cases and controls.
outcome_col = 'tte_3year'
split_col = 'tte_3year_80_20_split'
split_kwargs = dict(test_size=0.2, random_state=42)

# Cases and controls are selected by label; rows with any other value are left out.
cases = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Glaucoma']
controls = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Control']

# Each group is split on its own; the stratify column holds a single class in each call.
case_train, case_test = train_test_split(cases, stratify=cases[outcome_col], **split_kwargs)
control_train, control_test = train_test_split(controls, stratify=controls[outcome_col], **split_kwargs)

# Stack cases on top of controls and tag every row with the split it belongs to.
train_df = pd.concat([case_train, control_train]).assign(**{split_col: 'train'})
test_df = pd.concat([case_test, control_test]).assign(**{split_col: 'test'})

# NOTE(review): this rebinds IOP_subcohort_df to the labelled rows only (train rows first,
# then test rows), so later cells see a reordered and possibly smaller frame.
IOP_subcohort_df = pd.concat([train_df, test_df])

# QC: overall split proportions and the case/control mix inside each split.
split_labels = IOP_subcohort_df[split_col]
print("Final Split Proportions:")
print(split_labels.value_counts(normalize=True))

print("\nCase/Control Distribution in Each Split:")
print(pd.crosstab(split_labels, IOP_subcohort_df[outcome_col], normalize='index'))

In [None]:
# Feature/label frames for the 3-year outcome, built by the project helper from the
# split column, the outcome column and the current ODSL feature names.
datasets_3year_tte = model_util.get_train_test_datasets(
    IOP_subcohort_df, 'tte_3year_80_20_split', 'tte_3year', odsl_feature_list
)
X_train_3year_tte, y_train_3year_tte, X_test_3year_tte, y_test_3year_tte = datasets_3year_tte

In [None]:
# Class counts for the 3-year training labels.
n_train_controls = (y_train_3year_tte == 0).sum()
n_train_cases = (y_train_3year_tte == 1).sum()

print("=== TRAINING DATA ===")
print(f"Total training samples: {len(y_train_3year_tte)}")
print(f"Training controls (y=0): {n_train_controls}")
print(f"Training cases (y=1): {n_train_cases}")

# Same summary for the held-out test labels.
n_test_controls = (y_test_3year_tte == 0).sum()
n_test_cases = (y_test_3year_tte == 1).sum()

print("\n=== TEST DATA ===")
print(f"Total test samples: {len(y_test_3year_tte)}")
print(f"Test controls (y=0): {n_test_controls}")
print(f"Test cases (y=1): {n_test_cases}")

In [None]:
# Imputation kernel built on the 3-year training features only (IOP-removed run).
kernel_settings = dict(
    num_datasets=1,
    random_state=2024,
    mean_match_strategy='normal',
    mean_match_candidates=10,
    save_all_iterations_data=True,
    imputation_order='descending',
)
imputation_kernel_3year_tte = mf.ImputationKernel(X_train_3year_tte, **kernel_settings)

In [None]:
# LightGBM settings forwarded to mice() as keyword arguments
# (max_depth, num_leaves and learning_rate were tried by the author and left disabled).
lgbm_params = {'n_estimators': 200, 'max_bin': 512}

# Run 13 MICE iterations on the 3-year kernel.
imputation_kernel_3year_tte.mice(iterations=13, verbose=True, **lgbm_params)

# Persist the fitted kernel.
joblib.dump(imputation_kernel_3year_tte, '../data/imputed/imputation_kernel_13iter_noIOPrnfl3year_tte.pkl')

In [None]:
# Complete the training data with the fitted kernel, then apply the same kernel to the
# held-out test features.
X_train_imputed_3year_tte = imputation_kernel_3year_tte.complete_data().reset_index(drop=True)
X_test_imputed_3year_tte = imputation_kernel_3year_tte.impute_new_data(X_test_3year_tte).complete_data().reset_index(drop=True)

joblib.dump(X_train_imputed_3year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_noIOPrnfl3year_tte.pkl')
joblib.dump(X_test_imputed_3year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_noIOPrnfl3year_tte.pkl')

# Labels are saved with a fresh 0..n-1 index, matching the re-indexed feature frames.
joblib.dump(y_train_3year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_train_noIOPrnfl3year_tte.pkl')
joblib.dump(y_test_3year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_test_noIOPrnfl3year_tte.pkl')

# Merged view: training rows first, then test rows.
X_merged_imputed_3year_tte = pd.concat((X_train_imputed_3year_tte, X_test_imputed_3year_tte), ignore_index=True)
y_merged_3year_tte = pd.concat((y_train_3year_tte, y_test_3year_tte), ignore_index=True)

joblib.dump(X_merged_imputed_3year_tte, '../data/imputed/IOPsubcohort_X_merged_imputed_noIOPrnfl3year_tte.pkl')
joblib.dump(y_merged_3year_tte, '../data/imputed/IOPsubcohort_y_merged_noIOPrnfl3year_tte.pkl')


# Apply scaling: the scaler is fitted on the training split only and reused for the test split.

scaler = MinMaxScaler()
scaler.fit(X_train_imputed_3year_tte)

# Column labels are taken from the frames actually passed to the scaler.
X_train_scaled_3year_tte = pd.DataFrame(scaler.transform(X_train_imputed_3year_tte), columns=X_train_imputed_3year_tte.columns)
X_test_scaled_3year_tte = pd.DataFrame(scaler.transform(X_test_imputed_3year_tte), columns=X_test_imputed_3year_tte.columns)

# Every section of this notebook writes min_max_scaler.pkl, so that file only ever holds
# the scaler of whichever section ran last. A dataset-specific copy is saved as well; the
# shared file is still written for anything that already loads it.
joblib.dump(scaler, '../data/imputed/min_max_scaler_noIOPrnfl3year_tte.pkl')
joblib.dump(scaler, '../data/imputed/min_max_scaler.pkl')
joblib.dump(X_train_scaled_3year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_scaled_noIOPrnfl3year_tte.pkl')
joblib.dump(X_test_scaled_3year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_scaled_noIOPrnfl3year_tte.pkl')

<br>

## 5 year IOP removed

In [None]:
# 80/20 train/test split for the 5-year outcome (IOP-removed run), done separately for
# cases and controls. The IOP columns are expected to have been dropped already by the
# "IOP removed" cell; a disabled in-place drop used to sit at the top of this cell.
outcome_col = 'tte_5year'
split_col = 'tte_5year_80_20_split'
split_kwargs = dict(test_size=0.2, random_state=42)

# Cases and controls are selected by label; rows with any other value are left out.
cases = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Glaucoma']
controls = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Control']

# Each group is split on its own; the stratify column holds a single class in each call.
case_train, case_test = train_test_split(cases, stratify=cases[outcome_col], **split_kwargs)
control_train, control_test = train_test_split(controls, stratify=controls[outcome_col], **split_kwargs)

# Stack cases on top of controls and tag every row with the split it belongs to.
train_df = pd.concat([case_train, control_train]).assign(**{split_col: 'train'})
test_df = pd.concat([case_test, control_test]).assign(**{split_col: 'test'})

# NOTE(review): this rebinds IOP_subcohort_df to the labelled rows only (train rows first,
# then test rows), so later cells see a reordered and possibly smaller frame.
IOP_subcohort_df = pd.concat([train_df, test_df])

# QC: overall split proportions and the case/control mix inside each split.
split_labels = IOP_subcohort_df[split_col]
print("Final Split Proportions:")
print(split_labels.value_counts(normalize=True))

print("\nCase/Control Distribution in Each Split:")
print(pd.crosstab(split_labels, IOP_subcohort_df[outcome_col], normalize='index'))

In [None]:
# Feature/label frames for the 5-year outcome, built by the project helper from the
# split column, the outcome column and the current ODSL feature names.
datasets_5year_tte = model_util.get_train_test_datasets(
    IOP_subcohort_df, 'tte_5year_80_20_split', 'tte_5year', odsl_feature_list
)
X_train_5year_tte, y_train_5year_tte, X_test_5year_tte, y_test_5year_tte = datasets_5year_tte

In [None]:
# Class counts for the 5-year training labels.
n_train_controls = (y_train_5year_tte == 0).sum()
n_train_cases = (y_train_5year_tte == 1).sum()

print("=== TRAINING DATA ===")
print(f"Total training samples: {len(y_train_5year_tte)}")
print(f"Training controls (y=0): {n_train_controls}")
print(f"Training cases (y=1): {n_train_cases}")

# Same summary for the held-out test labels.
n_test_controls = (y_test_5year_tte == 0).sum()
n_test_cases = (y_test_5year_tte == 1).sum()

print("\n=== TEST DATA ===")
print(f"Total test samples: {len(y_test_5year_tte)}")
print(f"Test controls (y=0): {n_test_controls}")
print(f"Test cases (y=1): {n_test_cases}")

In [None]:
# Imputation kernel built on the 5-year training features only (IOP-removed run).
kernel_settings = dict(
    num_datasets=1,
    random_state=2024,
    mean_match_strategy='normal',
    mean_match_candidates=10,
    save_all_iterations_data=True,
    imputation_order='descending',
)
imputation_kernel_5year_tte = mf.ImputationKernel(X_train_5year_tte, **kernel_settings)

In [None]:
# LightGBM settings forwarded to mice() as keyword arguments
# (max_depth, num_leaves and learning_rate were tried by the author and left disabled).
lgbm_params = {'n_estimators': 200, 'max_bin': 512}

# Run 13 MICE iterations on the 5-year kernel.
imputation_kernel_5year_tte.mice(iterations=13, verbose=True, **lgbm_params)

# Persist the fitted kernel.
joblib.dump(imputation_kernel_5year_tte, '../data/imputed/imputation_kernel_13iter_noIOPrnfl5year_tte.pkl')

In [None]:
# Complete the training data with the fitted kernel, then apply the same kernel to the
# held-out test features.
X_train_imputed_5year_tte = imputation_kernel_5year_tte.complete_data().reset_index(drop=True)
X_test_imputed_5year_tte = imputation_kernel_5year_tte.impute_new_data(X_test_5year_tte).complete_data().reset_index(drop=True)

joblib.dump(X_train_imputed_5year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_noIOPrnfl5year_tte.pkl')
joblib.dump(X_test_imputed_5year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_noIOPrnfl5year_tte.pkl')

# Labels are saved with a fresh 0..n-1 index, matching the re-indexed feature frames.
joblib.dump(y_train_5year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_train_noIOPrnfl5year_tte.pkl')
joblib.dump(y_test_5year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_test_noIOPrnfl5year_tte.pkl')

# Merged view: training rows first, then test rows.
X_merged_imputed_5year_tte = pd.concat((X_train_imputed_5year_tte, X_test_imputed_5year_tte), ignore_index=True)
y_merged_5year_tte = pd.concat((y_train_5year_tte, y_test_5year_tte), ignore_index=True)

joblib.dump(X_merged_imputed_5year_tte, '../data/imputed/IOPsubcohort_X_merged_imputed_noIOPrnfl5year_tte.pkl')
joblib.dump(y_merged_5year_tte, '../data/imputed/IOPsubcohort_y_merged_noIOPrnfl5year_tte.pkl')


# Apply scaling: the scaler is fitted on the training split only and reused for the test split.

scaler = MinMaxScaler()
scaler.fit(X_train_imputed_5year_tte)

# Column labels are taken from the frames actually passed to the scaler.
X_train_scaled_5year_tte = pd.DataFrame(scaler.transform(X_train_imputed_5year_tte), columns=X_train_imputed_5year_tte.columns)
X_test_scaled_5year_tte = pd.DataFrame(scaler.transform(X_test_imputed_5year_tte), columns=X_test_imputed_5year_tte.columns)

# Every section of this notebook writes min_max_scaler.pkl, so that file only ever holds
# the scaler of whichever section ran last. A dataset-specific copy is saved as well; the
# shared file is still written for anything that already loads it.
joblib.dump(scaler, '../data/imputed/min_max_scaler_noIOPrnfl5year_tte.pkl')
joblib.dump(scaler, '../data/imputed/min_max_scaler.pkl')
joblib.dump(X_train_scaled_5year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_scaled_noIOPrnfl5year_tte.pkl')
joblib.dump(X_test_scaled_5year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_scaled_noIOPrnfl5year_tte.pkl')

<br>

## 10 year IOP removed

In [None]:
# 80/20 train/test split for the 10-year outcome (IOP-removed run), done separately for
# cases and controls.
outcome_col = 'tte_10year'
split_col = 'tte_10year_80_20_split'
split_kwargs = dict(test_size=0.2, random_state=42)

# Cases and controls are selected by label; rows with any other value are left out.
cases = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Glaucoma']
controls = IOP_subcohort_df.loc[IOP_subcohort_df[outcome_col] == 'Control']

# Each group is split on its own; the stratify column holds a single class in each call.
case_train, case_test = train_test_split(cases, stratify=cases[outcome_col], **split_kwargs)
control_train, control_test = train_test_split(controls, stratify=controls[outcome_col], **split_kwargs)

# Stack cases on top of controls and tag every row with the split it belongs to.
train_df = pd.concat([case_train, control_train]).assign(**{split_col: 'train'})
test_df = pd.concat([case_test, control_test]).assign(**{split_col: 'test'})

# NOTE(review): this rebinds IOP_subcohort_df to the labelled rows only (train rows first,
# then test rows), so later cells see a reordered and possibly smaller frame.
IOP_subcohort_df = pd.concat([train_df, test_df])

# QC: overall split proportions and the case/control mix inside each split.
split_labels = IOP_subcohort_df[split_col]
print("Final Split Proportions:")
print(split_labels.value_counts(normalize=True))

print("\nCase/Control Distribution in Each Split:")
print(pd.crosstab(split_labels, IOP_subcohort_df[outcome_col], normalize='index'))

In [None]:
# Feature/label frames for the 10-year outcome, built by the project helper from the
# split column, the outcome column and the current ODSL feature names.
datasets_10year_tte = model_util.get_train_test_datasets(
    IOP_subcohort_df, 'tte_10year_80_20_split', 'tte_10year', odsl_feature_list
)
X_train_10year_tte, y_train_10year_tte, X_test_10year_tte, y_test_10year_tte = datasets_10year_tte

In [None]:
# Class counts for the 10-year training labels.
n_train_controls = (y_train_10year_tte == 0).sum()
n_train_cases = (y_train_10year_tte == 1).sum()

print("=== TRAINING DATA ===")
print(f"Total training samples: {len(y_train_10year_tte)}")
print(f"Training controls (y=0): {n_train_controls}")
print(f"Training cases (y=1): {n_train_cases}")

# Same summary for the held-out test labels.
n_test_controls = (y_test_10year_tte == 0).sum()
n_test_cases = (y_test_10year_tte == 1).sum()

print("\n=== TEST DATA ===")
print(f"Total test samples: {len(y_test_10year_tte)}")
print(f"Test controls (y=0): {n_test_controls}")
print(f"Test cases (y=1): {n_test_cases}")

In [None]:
# Imputation kernel built on the 10-year training features only (IOP-removed run).
kernel_settings = dict(
    num_datasets=1,
    random_state=2024,
    mean_match_strategy='normal',
    mean_match_candidates=10,
    save_all_iterations_data=True,
    imputation_order='descending',
)
imputation_kernel_10year_tte = mf.ImputationKernel(X_train_10year_tte, **kernel_settings)

In [None]:
# Run 13 MICE iterations on the 10-year kernel (IOP-removed run).
imputation_kernel_10year_tte.mice(
    verbose=True,
    iterations=13,

    # LGBM parameters
    n_estimators=200,
    max_bin=512,
    # max_depth=10,
    # num_leaves=1023,
    # learning_rate=0.1,
)

# Persist the fitted kernel next to the other imputation artefacts.
# The path previously started with './data/', unlike every other dump/load in this
# notebook (all '../data/imputed/'), so the kernel was written to a different directory
# or failed when that directory did not exist.
joblib.dump(imputation_kernel_10year_tte, '../data/imputed/imputation_kernel_13iter_noIOPrnfl10year_tte.pkl')

In [None]:
# Complete the training data with the fitted kernel, then apply the same kernel to the
# held-out test features.
X_train_imputed_10year_tte = imputation_kernel_10year_tte.complete_data().reset_index(drop=True)
X_test_imputed_10year_tte = imputation_kernel_10year_tte.impute_new_data(X_test_10year_tte).complete_data().reset_index(drop=True)

joblib.dump(X_train_imputed_10year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_noIOPrnfl10year_tte.pkl')
joblib.dump(X_test_imputed_10year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_noIOPrnfl10year_tte.pkl')

# Labels are saved with a fresh 0..n-1 index, matching the re-indexed feature frames.
joblib.dump(y_train_10year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_train_noIOPrnfl10year_tte.pkl')
joblib.dump(y_test_10year_tte.reset_index(drop=True), '../data/imputed/IOPsubcohort_y_test_noIOPrnfl10year_tte.pkl')

# Merged view: training rows first, then test rows.
X_merged_imputed_10year_tte = pd.concat((X_train_imputed_10year_tte, X_test_imputed_10year_tte), ignore_index=True)
y_merged_10year_tte = pd.concat((y_train_10year_tte, y_test_10year_tte), ignore_index=True)

joblib.dump(X_merged_imputed_10year_tte, '../data/imputed/IOPsubcohort_X_merged_imputed_noIOPrnfl10year_tte.pkl')
joblib.dump(y_merged_10year_tte, '../data/imputed/IOPsubcohort_y_merged_noIOPrnfl10year_tte.pkl')


# Apply scaling: the scaler is fitted on the training split only and reused for the test split.

scaler = MinMaxScaler()
scaler.fit(X_train_imputed_10year_tte)

# Column labels are taken from the frames actually passed to the scaler.
X_train_scaled_10year_tte = pd.DataFrame(scaler.transform(X_train_imputed_10year_tte), columns=X_train_imputed_10year_tte.columns)
X_test_scaled_10year_tte = pd.DataFrame(scaler.transform(X_test_imputed_10year_tte), columns=X_test_imputed_10year_tte.columns)

# Every section of this notebook writes min_max_scaler.pkl, so that file only ever holds
# the scaler of whichever section ran last. A dataset-specific copy is saved as well; the
# shared file is still written for anything that already loads it.
joblib.dump(scaler, '../data/imputed/min_max_scaler_noIOPrnfl10year_tte.pkl')
joblib.dump(scaler, '../data/imputed/min_max_scaler.pkl')
joblib.dump(X_train_scaled_10year_tte, '../data/imputed/IOPsubcohort_X_train_imputed_scaled_noIOPrnfl10year_tte.pkl')
joblib.dump(X_test_scaled_10year_tte, '../data/imputed/IOPsubcohort_X_test_imputed_scaled_noIOPrnfl10year_tte.pkl')

<br>

<br>

---
# hyperparameter tuning

<br>

#### RNFL + GCL + IOP

In [None]:
# Feature sets to tune on: each entry is the 'feature' column of the table
# feature_sets_rnfl.<name>_features, keyed by <name>.
# NOTE(review): the "RNFL + GCL" cell further down assigns the same variable names, so
# whichever of the two cells ran last decides what the studies below use.
feature_set_names = ('ophthalmic', 'demographic', 'systemic', 'lifestyle', 'OD', 'SL', 'ODSL', 'DSL')
model_feature_dict = {
    set_name: getattr(feature_sets_rnfl, f'{set_name}_features')['feature'].values
    for set_name in feature_set_names
}

# Shared Optuna settings.
scoring_metric = 'roc_auc'
n_cv_folds = 5
n_trials = 100

# Scaled, imputed training features and labels written by the imputation sections
# (files that include the IOP features).

# 3 year
X_train_imputed_scaled_3year_tte = load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_rnfl3year_tte.pkl')
y_train_3year_tte = load('../data/imputed/IOPsubcohort_y_train_rnfl3year_tte.pkl')

# 5 year
X_train_imputed_scaled_5year_tte = load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_rnfl5year_tte.pkl')
y_train_5year_tte = load('../data/imputed/IOPsubcohort_y_train_rnfl5year_tte.pkl')

# 10 year
X_train_imputed_scaled_10year_tte = load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_rnfl10year_tte.pkl')
y_train_10year_tte = load('../data/imputed/IOPsubcohort_y_train_rnfl10year_tte.pkl')

<br>

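#### RNFL + GCL (IOP removed) - run this cell instead of the RNFL + GCL + IOP cell above; both assign the same variables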

In [None]:
# The two IOP features that are filtered out of the sets listed as filtered below.
iop_feature_names = ['IOPg pre-treatment', 'IOPg pre-treatment inter-eye difference']


def _feature_names(set_name, drop_iop=False):
    """Return the 'feature' values of feature_sets_rnfl.<set_name>_features, optionally without the IOP features."""
    names = getattr(feature_sets_rnfl, f'{set_name}_features')['feature'].values
    if drop_iop:
        return names[~np.isin(names, iop_feature_names)]
    return names


# Feature sets to tune on; only ophthalmic, OD and ODSL are filtered for IOP.
# NOTE(review): the "RNFL + GCL + IOP" cell above assigns the same variable names, so
# whichever of the two cells ran last decides what the studies below use.
model_feature_dict = {
    'ophthalmic': _feature_names('ophthalmic', drop_iop=True),
    'demographic': _feature_names('demographic'),
    'systemic': _feature_names('systemic'),
    'lifestyle': _feature_names('lifestyle'),

    'OD': _feature_names('OD', drop_iop=True),
    'SL': _feature_names('SL'),
    'ODSL': _feature_names('ODSL', drop_iop=True),
    'DSL': _feature_names('DSL'),
}

# Shared Optuna settings.
scoring_metric = 'roc_auc'
n_cv_folds = 5
n_trials = 100

# Scaled, imputed training features and labels written by the IOP-removed imputation sections.

# 3 year
X_train_imputed_scaled_3year_tte = load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_noIOPrnfl3year_tte.pkl')
y_train_3year_tte = load('../data/imputed/IOPsubcohort_y_train_noIOPrnfl3year_tte.pkl')

# 5 year
X_train_imputed_scaled_5year_tte = load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_noIOPrnfl5year_tte.pkl')
y_train_5year_tte = load('../data/imputed/IOPsubcohort_y_train_noIOPrnfl5year_tte.pkl')

# 10 year
X_train_imputed_scaled_10year_tte = load('../data/imputed/IOPsubcohort_X_train_imputed_scaled_noIOPrnfl10year_tte.pkl')
y_train_10year_tte = load('../data/imputed/IOPsubcohort_y_train_noIOPrnfl10year_tte.pkl')

---

## RNFL + GCL + IOP

<br>

### 3 year

In [None]:
# LightGBM: hyperparameter search on the 3-year training data for the feature sets
# in model_feature_dict
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_rnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: hyperparameter search on the 3-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_rnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: hyperparameter search on the 3-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_rnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: hyperparameter search on the 3-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_rnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector classifier: hyperparameter search on the 3-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_rnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD): hyperparameter search on the 3-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_rnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Refit every algorithm with its best Optuna hyperparameters on the whole 3-year
# training set, once per feature set, and save the fitted model together with the
# hyperparameters that were used.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Optuna results sub-directory -> estimator class to refit
algorithms = {
    'logistic_regression_sgd_rnfl3year_tte': sklearn.linear_model.SGDClassifier,
    'svm_rnfl3year_tte': sklearn.svm.SVC,
    'knn_rnfl3year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_rnfl3year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_rnfl3year_tte': xgb.XGBClassifier,
    'lightgbm_rnfl3year_tte': lgb.LGBMClassifier}

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok replaces the separate existence check (no race, no error on re-run)
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_3year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # 'all_params' is read from the best trial's user attributes (presumably stored
        # there by the optuna_util objective) and used as the full constructor kwargs.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt, one "name: value" line per hyperparameter
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_3year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# best model is logistic regression with OD

### minimal feature model

In [None]:
# RFECV on the OD feature set with an L2 logistic regression as the ranking estimator
# (LR with OD is the best 3-year predictor as of model_evaluation.ipynb).
od_feature_names = model_feature_dict['OD']

rfe_obj_3year_tte = RFECV(
    estimator=LogisticRegression(
        penalty='l2', C=1.0, solver='lbfgs', max_iter=1000, random_state=2024, n_jobs=-1
    ),
    step=1,
    min_features_to_select=5,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)
rfe_obj_3year_tte.fit(X_train_imputed_scaled_3year_tte[od_feature_names], y_train_3year_tte)

# Report what was kept (change the feature set here if a different one is used above)
print(f"Optimal number of features: {rfe_obj_3year_tte.n_features_}")
print(f"Selected features: {np.array(od_feature_names)[rfe_obj_3year_tte.support_]}")

dump(rfe_obj_3year_tte, './rfecv_fitted_rnfl3year_tte.pkl')

In [None]:
# OD features kept by the fitted 3-year RFECV, picked out with its support_ mask
od_feature_array = np.array(model_feature_dict['OD'])
minimal_features = od_feature_array[rfe_obj_3year_tte.support_]
minimal_features

In [None]:
# Before running: copy the features selected above into feature_sets_rnfl as
# minimal_features_rfecv_rnfl3year_tte, then restart the kernel.
feature_dict = {
    'minimal_features_rfecv_rnfl3year_tte':
        feature_sets_rnfl.minimal_features_rfecv_rnfl3year_tte['feature'].values,
}
feature_dict

In [None]:
# Longer (1000-trial) search restricted to the RFECV-selected minimal feature set.
# The chosen 3-year model is logistic regression and the results are written to the
# logistic-regression study directory, so the logistic-regression objective is used
# (a LightGBM objective here would store LightGBM trials under the wrong algorithm).
run_optuna_studies(
    X = X_train_imputed_scaled_3year_tte,
    y = y_train_3year_tte,
    feature_dict = {'minimal_features_rfecv_rnfl3year_tte': feature_sets_rnfl.minimal_features_rfecv_rnfl3year_tte['feature'].values},
    n_trials = 1000,

    objective_class = optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir = './optuna_results/logistic_regression_sgd_rnfl3year_tte',

    # kwargs passed to study class
    n_cv_folds = n_cv_folds,
    scoring_metric = scoring_metric,
)

In [None]:
# Collect the best hyperparameters of every 3-year study into one table
# (rows: algorithm + hyperparameter, one column per feature set) and save it as TSV.
algorithms = {
    'logistic_regression_sgd_rnfl3year_tte': sklearn.linear_model.SGDClassifier,
    'svm_rnfl3year_tte': sklearn.svm.SVC,
    'knn_rnfl3year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_rnfl3year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_rnfl3year_tte': xgb.XGBClassifier,
    'lightgbm_rnfl3year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

for model_name in model_feature_dict:
    # Rows are gathered in a list and turned into a frame once per feature set
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params

        for k, v in best_params.items():
            if v is None or isinstance(v, (str, bool)):
                # Written verbatim: the numeric formatting below would raise on None
                # and render True/False as 1.0/0.0.
                x = str(v)
            else:
                # Numbers are rounded to 4 significant figures
                x = '%s' % float('%.4g' % v)
            rows.append([algorithm, k, x])

    feature_set_hyperparam_df = pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    feature_set_dfs.append(feature_set_hyperparam_df)

# Combine and save results
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
# Order the rows by algorithm, following the order of the `algorithms` dict
combo_df = combo_df.set_index('Algorithm').loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_rnfl3year_tte.tsv', sep='\t', index=True)

<br>

### 5 year

In [None]:
# LightGBM: hyperparameter search on the 5-year training data for the feature sets
# in model_feature_dict
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_rnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: hyperparameter search on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_rnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: hyperparameter search on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_rnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: hyperparameter search on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_rnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector classifier: hyperparameter search on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_rnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD): hyperparameter search on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_rnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Refit every algorithm with its best Optuna hyperparameters on the whole 5-year
# training set, once per feature set, and save the fitted model together with the
# hyperparameters that were used.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Optuna results sub-directory -> estimator class to refit
algorithms = {
    'logistic_regression_sgd_rnfl5year_tte': sklearn.linear_model.SGDClassifier,
    'svm_rnfl5year_tte': sklearn.svm.SVC,
    'knn_rnfl5year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_rnfl5year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_rnfl5year_tte': xgb.XGBClassifier,
    'lightgbm_rnfl5year_tte': lgb.LGBMClassifier}

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok replaces the separate existence check (no race, no error on re-run)
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_5year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # 'all_params' is read from the best trial's user attributes (presumably stored
        # there by the optuna_util objective) and used as the full constructor kwargs.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt, one "name: value" line per hyperparameter
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_5year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# best model is XGBoost with ODSL

<br>

### minimal feature model

In [None]:
# RFECV on the ODSL feature set with XGBoost as the ranking estimator
# (XGBoost with ODSL is the best 5-year predictor as of model_evaluation.ipynb).
odsl_feature_names = model_feature_dict['ODSL']

rfe_obj_5year_tte = RFECV(
    estimator=xgb.XGBClassifier(
        objective='binary:logistic', random_state=2024, n_jobs=-1, verbosity=0, tree_method='hist'
    ),
    step=1,
    min_features_to_select=5,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)
rfe_obj_5year_tte.fit(X_train_imputed_scaled_5year_tte[odsl_feature_names], y_train_5year_tte)

# Report what was kept (change the feature set here if a different one is used above)
print(f"Optimal number of features: {rfe_obj_5year_tte.n_features_}")
print(f"Selected features: {np.array(odsl_feature_names)[rfe_obj_5year_tte.support_]}")

dump(rfe_obj_5year_tte, './rfecv_fitted_rnfl5year_tte.pkl')

In [None]:
# ODSL features kept by the fitted 5-year RFECV, picked out with its support_ mask
odsl_feature_array = np.array(model_feature_dict['ODSL'])
minimal_features = odsl_feature_array[rfe_obj_5year_tte.support_]
minimal_features

In [None]:
# Before running: copy the features selected above into feature_sets_rnfl as
# minimal_features_rfecv_rnfl5year_tte, then restart the kernel.
feature_dict = {
    'minimal_features_rfecv_rnfl5year_tte':
        feature_sets_rnfl.minimal_features_rfecv_rnfl5year_tte['feature'].values,
}
feature_dict

In [None]:
# Class balance of the 5-year training labels
n_train_5year = len(y_train_5year_tte)
n_controls_5year = sum(y_train_5year_tte == 0)
n_cases_5year = sum(y_train_5year_tte == 1)

print("=== TRAINING DATA ===")
print(f"Total training samples: {n_train_5year}")
print(f"Training controls (y=0): {n_controls_5year}")
print(f"Training cases (y=1): {n_cases_5year}")
print(f"Training case prevalence: {n_cases_5year/n_train_5year:.3%}")


In [None]:
# Longer (1000-trial) XGBoost search restricted to the RFECV-selected minimal
# feature set of the 5-year model
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict={
        'minimal_features_rfecv_rnfl5year_tte':
            feature_sets_rnfl.minimal_features_rfecv_rnfl5year_tte['feature'].values,
    },
    n_trials=1000,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_rnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Collect the best hyperparameters of every 5-year study into one table
# (rows: algorithm + hyperparameter, one column per feature set) and save it as TSV.
algorithms = {
    'logistic_regression_sgd_rnfl5year_tte': sklearn.linear_model.SGDClassifier,
    'svm_rnfl5year_tte': sklearn.svm.SVC,
    'knn_rnfl5year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_rnfl5year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_rnfl5year_tte': xgb.XGBClassifier,
    'lightgbm_rnfl5year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

for model_name in model_feature_dict:
    # Rows are gathered in a list and turned into a frame once per feature set
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params

        for k, v in best_params.items():
            if v is None or isinstance(v, (str, bool)):
                # Written verbatim: the numeric formatting below would raise on None
                # and render True/False as 1.0/0.0.
                x = str(v)
            else:
                # Numbers are rounded to 4 significant figures
                x = '%s' % float('%.4g' % v)
            rows.append([algorithm, k, x])

    feature_set_hyperparam_df = pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    feature_set_dfs.append(feature_set_hyperparam_df)

# Combine and save results
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
# Order the rows by algorithm, following the order of the `algorithms` dict
combo_df = combo_df.set_index('Algorithm').loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_rnfl5year_tte.tsv', sep='\t', index=True)

<br>

### 10 year

In [None]:
# LightGBM: hyperparameter search on the 10-year training data for the feature sets
# in model_feature_dict
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_rnfl10year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: hyperparameter search on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_rnfl10year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: hyperparameter search on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_rnfl10year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: hyperparameter search on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_rnfl10year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector classifier: hyperparameter search on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_rnfl10year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD): hyperparameter search on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_rnfl10year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Refit every algorithm with its best Optuna hyperparameters on the whole 10-year
# training set, once per feature set, and save the fitted model together with the
# hyperparameters that were used.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Optuna results sub-directory -> estimator class to refit
algorithms = {
    'logistic_regression_sgd_rnfl10year_tte': sklearn.linear_model.SGDClassifier,
    'svm_rnfl10year_tte': sklearn.svm.SVC,
    'knn_rnfl10year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_rnfl10year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_rnfl10year_tte': xgb.XGBClassifier,
    'lightgbm_rnfl10year_tte': lgb.LGBMClassifier}

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok replaces the separate existence check (no race, no error on re-run)
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_10year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # 'all_params' is read from the best trial's user attributes (presumably stored
        # there by the optuna_util objective) and used as the full constructor kwargs.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt, one "name: value" line per hyperparameter
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_10year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# best model is lgbm with odsl

### minimal feature model

In [None]:
# RFECV on the ODSL feature set with LightGBM as the ranking estimator
# (LightGBM with ODSL is noted as the best 10-year model).
odsl_feature_names = model_feature_dict['ODSL']

rfe_obj_10year_tte = RFECV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        random_state=2024,
        n_jobs=-1,
        verbose=-1,  # keep LightGBM quiet
    ),
    step=1,
    min_features_to_select=5,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)
rfe_obj_10year_tte.fit(X_train_imputed_scaled_10year_tte[odsl_feature_names], y_train_10year_tte)

# Report what was kept (change the feature set here if a different one is used above)
print(f"Optimal number of features: {rfe_obj_10year_tte.n_features_}")
print(f"Selected features: {np.array(odsl_feature_names)[rfe_obj_10year_tte.support_]}")

dump(rfe_obj_10year_tte, './rfecv_fitted_rnfl10year_tte.pkl')

In [None]:
# ODSL features kept by the fitted 10-year RFECV, picked out with its support_ mask
odsl_feature_array = np.array(model_feature_dict['ODSL'])
minimal_features = odsl_feature_array[rfe_obj_10year_tte.support_]
minimal_features

In [None]:
# Before running: copy the features selected above into feature_sets_rnfl as
# minimal_features_rfecv_rnfl10year_tte, then restart the kernel.
feature_dict = {
    'minimal_features_rfecv_rnfl10year_tte':
        feature_sets_rnfl.minimal_features_rfecv_rnfl10year_tte['feature'].values,
}
feature_dict

In [None]:
# Class balance of the 10-year training labels
n_train_10year = len(y_train_10year_tte)
n_controls_10year = sum(y_train_10year_tte == 0)
n_cases_10year = sum(y_train_10year_tte == 1)

print("=== TRAINING DATA ===")
print(f"Total training samples: {n_train_10year}")
print(f"Training controls (y=0): {n_controls_10year}")
print(f"Training cases (y=1): {n_cases_10year}")


In [None]:
# Longer (1000-trial) LightGBM search restricted to the RFECV-selected minimal
# feature set of the 10-year model. Results belong in the 10-year LightGBM study
# directory; writing them to the 5-year directory would mix the two horizons.
run_optuna_studies(
    X = X_train_imputed_scaled_10year_tte,
    y = y_train_10year_tte,
    feature_dict = {'minimal_features_rfecv_rnfl10year_tte': feature_sets_rnfl.minimal_features_rfecv_rnfl10year_tte['feature'].values},
    n_trials = 1000,

    objective_class = optuna_util.LGBM_OptunaObjective,
    save_dir = './optuna_results/lightgbm_rnfl10year_tte',

    # kwargs passed to study class
    n_cv_folds = n_cv_folds,
    scoring_metric = scoring_metric,
)

In [None]:
# Collect the best hyperparameters of every 10-year study into one table
# (rows: algorithm + hyperparameter, one column per feature set) and save it as TSV.
algorithms = {
    'logistic_regression_sgd_rnfl10year_tte': sklearn.linear_model.SGDClassifier,
    'svm_rnfl10year_tte': sklearn.svm.SVC,
    'knn_rnfl10year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_rnfl10year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_rnfl10year_tte': xgb.XGBClassifier,
    'lightgbm_rnfl10year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

for model_name in model_feature_dict:
    # Rows are gathered in a list and turned into a frame once per feature set
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params

        for k, v in best_params.items():
            if v is None or isinstance(v, (str, bool)):
                # Written verbatim: the numeric formatting below would raise on None
                # and render True/False as 1.0/0.0.
                x = str(v)
            else:
                # Numbers are rounded to 4 significant figures
                x = '%s' % float('%.4g' % v)
            rows.append([algorithm, k, x])

    feature_set_hyperparam_df = pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    feature_set_dfs.append(feature_set_hyperparam_df)

# Combine and save results
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
# Order the rows by algorithm, following the order of the `algorithms` dict
combo_df = combo_df.set_index('Algorithm').loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_rnfl10year_tte.tsv', sep='\t', index=True)

---

<br>

## RNFL + GCL

### 3 year

In [None]:
# LightGBM: 3-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_noIOPrnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: 3-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_noIOPrnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: 3-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_noIOPrnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: 3-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_noIOPrnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector classifier: 3-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_noIOPrnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD): 3-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_3year_tte,
    y=y_train_3year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_noIOPrnfl3year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Refit every algorithm with the best hyperparameters of its no-IOP 3-year study on the
# whole 3-year training set, once per feature set, and save the fitted model together
# with the hyperparameters that were used.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Optuna results sub-directory -> estimator class to refit
algorithms = {
    'logistic_regression_sgd_noIOPrnfl3year_tte': sklearn.linear_model.SGDClassifier,
    'svm_noIOPrnfl3year_tte': sklearn.svm.SVC,
    'knn_noIOPrnfl3year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_noIOPrnfl3year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_noIOPrnfl3year_tte': xgb.XGBClassifier,
    'lightgbm_noIOPrnfl3year_tte': lgb.LGBMClassifier}

for model_name, feature_set in model_feature_dict.items():
    model_save_dir = os.path.join(fitted_models_dir, model_name)
    # exist_ok replaces the separate existence check (no race, no error on re-run)
    os.makedirs(model_save_dir, exist_ok=True)

    X = X_train_imputed_scaled_3year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        # 'all_params' is read from the best trial's user attributes (presumably stored
        # there by the optuna_util objective) and used as the full constructor kwargs.
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt, one "name: value" line per hyperparameter
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit and save model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_3year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# best model is lightgbm with OD

### minimal feature model

In [None]:
# RFECV on the OD feature set with LightGBM as the ranking estimator
# (change the estimator and feature set here if a different model is chosen).
od_feature_names = model_feature_dict['OD']

rfe_obj_3year_tte = RFECV(
    estimator=lgb.LGBMClassifier(
        boosting_type='gbdt', objective='binary', random_state=2024, n_jobs=-1
    ),
    step=1,
    min_features_to_select=5,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)
rfe_obj_3year_tte.fit(X_train_imputed_scaled_3year_tte[od_feature_names], y_train_3year_tte)

# Report what was kept
print(f"Optimal number of features: {rfe_obj_3year_tte.n_features_}")
print(f"Selected features: {np.array(od_feature_names)[rfe_obj_3year_tte.support_]}")

dump(rfe_obj_3year_tte, './rfecv_fitted_noIOPrnfl3year_tte.pkl')

# Features selected by RFECV, shown as the cell output
minimal_features = np.array(od_feature_names)[rfe_obj_3year_tte.support_]
minimal_features

In [None]:
# Before running: copy the features selected above into feature_sets_rnfl as
# minimal_features_rfecv_noIOPrnfl3year_tte, then restart the kernel.
feature_dict = {
    'minimal_features_rfecv_noIOPrnfl3year_tte':
        feature_sets_rnfl.minimal_features_rfecv_noIOPrnfl3year_tte['feature'].values,
}
feature_dict

In [None]:
# Longer (1000-trial) LightGBM search restricted to the RFECV-selected minimal
# feature set of the no-IOP 3-year model. The objective is LightGBM, so the results
# are saved in the LightGBM study directory rather than the logistic-regression one.
run_optuna_studies(
    X = X_train_imputed_scaled_3year_tte,
    y = y_train_3year_tte,
    feature_dict = {'minimal_features_rfecv_noIOPrnfl3year_tte': feature_sets_rnfl.minimal_features_rfecv_noIOPrnfl3year_tte['feature'].values},
    n_trials = 1000,

    objective_class = optuna_util.LGBM_OptunaObjective,
    save_dir = './optuna_results/lightgbm_noIOPrnfl3year_tte',

    # kwargs passed to study class
    n_cv_folds = n_cv_folds,
    scoring_metric = scoring_metric,
)

In [None]:
# Collect the best hyperparameters of every no-IOP 3-year study into one table
# (rows: algorithm + hyperparameter, one column per feature set) and save it as TSV.
algorithms = {
    'logistic_regression_sgd_noIOPrnfl3year_tte': sklearn.linear_model.SGDClassifier,
    'svm_noIOPrnfl3year_tte': sklearn.svm.SVC,
    'knn_noIOPrnfl3year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_noIOPrnfl3year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_noIOPrnfl3year_tte': xgb.XGBClassifier,
    'lightgbm_noIOPrnfl3year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

for model_name in model_feature_dict:
    # Rows are gathered in a list and turned into a frame once per feature set
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params

        for k, v in best_params.items():
            if v is None or isinstance(v, (str, bool)):
                # Written verbatim: the numeric formatting below would raise on None
                # and render True/False as 1.0/0.0.
                x = str(v)
            else:
                # Numbers are rounded to 4 significant figures
                x = '%s' % float('%.4g' % v)
            rows.append([algorithm, k, x])

    feature_set_hyperparam_df = pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    feature_set_dfs.append(feature_set_hyperparam_df)

# Combine and save results
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
# Order the rows by algorithm, following the order of the `algorithms` dict
combo_df = combo_df.set_index('Algorithm').loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_noIOPrnfl3year_tte.tsv', sep='\t', index=True)

<br>

### 5 year

In [None]:
# LightGBM: 5-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_noIOPrnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: 5-year hyperparameter search, saved under the no-IOP study directory
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_noIOPrnfl5year_tte',
    # cross-validation settings, forwarded as extra keyword arguments
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: Optuna studies on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_noIOPrnfl5year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: Optuna studies on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_noIOPrnfl5year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector machine: Optuna studies on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_noIOPrnfl5year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD objective): Optuna studies on the 5-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_5year_tte,
    y=y_train_5year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_noIOPrnfl5year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Refit every algorithm on the full 5-year training set with the best
# hyperparameters stored in its Optuna study, then save the fitted model
# together with a plain-text record of the parameters that were used.
optuna_results_dir = './optuna_results'
fitted_models_dir = './best_hyperparams_fitted/'

# Key: name of the algorithm's sub-directory under `optuna_results_dir`
# Value: estimator class instantiated with the best hyperparameters
algorithms = {
    'logistic_regression_sgd_noIOPrnfl5year_tte': sklearn.linear_model.SGDClassifier,
    'svm_noIOPrnfl5year_tte': sklearn.svm.SVC,
    'knn_noIOPrnfl5year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_noIOPrnfl5year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_noIOPrnfl5year_tte': xgb.XGBClassifier,
    'lightgbm_noIOPrnfl5year_tte': lgb.LGBMClassifier}


for model_name, feature_set in model_feature_dict.items():
    print(feature_set)
    model_save_dir = f'{fitted_models_dir}/{model_name}'
    # exist_ok=True replaces the check-then-create pattern, so re-running the
    # cell is safe and there is no window between the check and the creation.
    os.makedirs(model_save_dir, exist_ok=True)

    # Restrict the training matrix to the columns of this feature set
    X = X_train_imputed_scaled_5year_tte[feature_set]

    for algorithm, estimator_class in algorithms.items():
        print(f'Fitting {model_name} {algorithm}')

        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # study files that were written by this project.
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)

        # The full constructor kwargs are read from the best trial's
        # 'all_params' user attribute (not from `best_trial.params`).
        best_params = study.best_trial.user_attrs['all_params']
        best_params_str = '\n'.join(f'{key}: {val}' for key, val in best_params.items())

        # Save params as txt (one "name: value" pair per line)
        with open(f'{model_save_dir}/{algorithm}_best_params.txt', 'w') as txt:
            txt.write(best_params_str)

        # Fit on the whole training set and save the model
        estimator = estimator_class(**best_params)
        estimator.fit(X, y_train_5year_tte)
        dump(estimator, f'{model_save_dir}/{algorithm}.pkl')

In [None]:
# ODSL with LightGBM

### minimal feature model

In [None]:
# Recursive feature elimination with cross-validation on the ODSL feature set,
# using a LightGBM classifier as the estimator.
# Change the estimator and the feature set below to the applicable ones.
rfe_obj_5year_tte = RFECV(
    estimator=lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', random_state=2024, n_jobs=-1),
    min_features_to_select=5,
    step=1,
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=2024),
    scoring='roc_auc',
    n_jobs=1,
    verbose=100,
)
rfe_obj_5year_tte.fit(X_train_imputed_scaled_5year_tte[model_feature_dict['ODSL']], y_train_5year_tte)

# Minimal feature set: the ODSL features selected by the `support_` mask of the fitted RFECV
minimal_features = np.array(model_feature_dict['ODSL'])[rfe_obj_5year_tte.support_]

print(f"Optimal number of features: {rfe_obj_5year_tte.n_features_}")
print(f"Selected features: {minimal_features}")

# Persist the fitted selector
dump(rfe_obj_5year_tte, './rfecv_fitted_noIOPrnfl5year_tte.pkl')

minimal_features

In [None]:
# Manual step: copy the features selected above into the feature_sets module,
# then restart the kernel so the updated feature set is picked up here.
feature_dict = {
    'minimal_features_rfecv_noIOPrnfl5year_tte':
        feature_sets_rnfl.minimal_features_rfecv_noIOPrnfl5year_tte['feature'].values,
}
feature_dict

In [None]:
# LightGBM Optuna study restricted to the RFECV-selected minimal feature set (5-year data)
run_optuna_studies(
    X = X_train_imputed_scaled_5year_tte,
    y = y_train_5year_tte,
    feature_dict = {'minimal_features_rfecv_noIOPrnfl5year_tte': feature_sets_rnfl.minimal_features_rfecv_noIOPrnfl5year_tte['feature'].values},
    # trial count is set explicitly here rather than taken from the shared `n_trials`
    n_trials = 1000,
    
    objective_class = optuna_util.LGBM_OptunaObjective,
    # Fixed copy-paste error: this LightGBM study used to be written under the
    # logistic-regression results directory. It now goes to the LightGBM one.
    # NOTE(review): update any cell that loads this study from the old location.
    save_dir = './optuna_results/lightgbm_noIOPrnfl5year_tte',

    # kwargs passed to study class
    n_cv_folds = n_cv_folds,
    scoring_metric = scoring_metric,
)

In [None]:
# Collect the best-trial hyperparameters of every algorithm / feature-set
# combination (5-year models) into one table and save it as a TSV.

# Key: name of the algorithm's sub-directory under `optuna_results_dir`
algorithms = {
    'logistic_regression_sgd_noIOPrnfl5year_tte': sklearn.linear_model.SGDClassifier,
    'svm_noIOPrnfl5year_tte': sklearn.svm.SVC,
    'knn_noIOPrnfl5year_tte': sklearn.neighbors.KNeighborsClassifier,
    'randomforest_noIOPrnfl5year_tte': sklearn.ensemble.RandomForestClassifier,
    'xgboost_noIOPrnfl5year_tte': xgb.XGBClassifier,
    'lightgbm_noIOPrnfl5year_tte': lgb.LGBMClassifier,
}

feature_set_dfs = []
optuna_results_dir = './optuna_results'

for model_name in model_feature_dict:
    # One [algorithm, hyperparameter, formatted value] record per tuned parameter.
    # Records are gathered in a list and turned into a DataFrame once, instead
    # of appending to the frame row by row.
    rows = []

    for algorithm in algorithms:
        study_dir = f'{optuna_results_dir}/{algorithm}/{model_name}/optuna_study_{model_name}.pkl'
        study = load(study_dir)
        best_params = study.best_trial.params
        #best_params = study.best_trial.user_attrs['all_params']

        for k, v in best_params.items():
            if isinstance(v, str):
                x = v
            else:
                try:
                    # Round numeric values to 4 significant figures
                    # (bools are formatted as numbers: 1.0 / 0.0)
                    x = '%s' % float('%.4g' % v)
                except TypeError:
                    # Values that cannot be formatted as a number (e.g. None)
                    # used to abort the whole cell; record their text form.
                    x = str(v)
            rows.append([algorithm, k, x])

    feature_set_dfs.append(
        pd.DataFrame(rows, columns=['Algorithm', 'Hyperparameter', model_name])
    )

# Combine and save results: one value column per feature set, rows ordered as in `algorithms`
combo_df = reduce(lambda df1, df2: pd.merge(df1, df2, on=['Algorithm', 'Hyperparameter'], how='outer'), feature_set_dfs)
combo_df = combo_df.set_index('Algorithm')
combo_df = combo_df.loc[list(algorithms.keys())]
combo_df.to_csv('./optuna_results/best_hyperparameter_results_noIOPrnfl5year_tte.tsv', sep='\t', index=True)

<br>

## 10 year

In [None]:
# LightGBM: Optuna studies on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LGBM_OptunaObjective,
    save_dir='./optuna_results/lightgbm_noIOPrnfl10year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# XGBoost: Optuna studies on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.XGBoost_OptunaObjective,
    save_dir='./optuna_results/xgboost_noIOPrnfl10year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Random forest: Optuna studies on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.RF_OptunaObjective,
    save_dir='./optuna_results/randomforest_noIOPrnfl10year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# K-nearest neighbours: Optuna studies on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.KNN_OptunaObjective,
    save_dir='./optuna_results/knn_noIOPrnfl10year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Support vector machine: Optuna studies on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.SVC_OptunaObjective,
    save_dir='./optuna_results/svm_noIOPrnfl10year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)

In [None]:
# Logistic regression (SGD objective): Optuna studies on the 10-year training data
run_optuna_studies(
    X=X_train_imputed_scaled_10year_tte,
    y=y_train_10year_tte,
    feature_dict=model_feature_dict,
    n_trials=n_trials,
    objective_class=optuna_util.LogisticRegressionSGD_OptunaObjective,
    save_dir='./optuna_results/logistic_regression_sgd_noIOPrnfl10year_tte',
    # extra kwargs, passed on to the study class
    n_cv_folds=n_cv_folds,
    scoring_metric=scoring_metric,
)