In [1]:
import pandas as pd
import os
import re
import sys
import numpy as np
from merf import MERF
import matplotlib.pyplot as plt
import seaborn as sns
import itertools 
sns.set_context("poster")
from sklearn.ensemble import RandomForestRegressor
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (11,8)
from merf.merf import MERF
from sklearn.model_selection import train_test_split, KFold
from merf.viz import plot_merf_training_stats

current_dir = os.getcwd() # Get the current working directory
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
sys.path.append(parent_dir)
from em_utils import * # import the utils

Useful Functions 

In [None]:
# Create output directory if it doesn't exist
output_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_plots"
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs/"
os.makedirs(output_dir, exist_ok=True)

print("---------- Read metadata ----------")
m1_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2"
test = read_data(m1_dir, "a2_test_samples_standard_clinical_feb20.csv")
train = read_data(m1_dir, "a2_train_samples_standard_clinical_feb20.csv")
full = read_data(m1_dir, "a2_meta_Transformed_standard_clinical_feb20.csv")
full_raw = read_data(m1_dir, "a2_meta_not_Transformed_standard_clinical_feb20.csv")

print(full_raw.columns.to_list() == train.columns.to_list())
print(train.columns.to_list() == test.columns.to_list())

In [6]:
# Print dimensions of each DataFrame
print("Dimensions of train_long:", train.shape)
print("Dimensions of test_long:", test.shape)
print("Dimensions of full_long:", full.shape)

Dimensions of train_long: (97, 29)
Dimensions of test_long: (24, 29)
Dimensions of full_long: (121, 29)


In [4]:
print("Train columns :", train.columns.to_list())
print("Test columns:", test.columns.to_list())

Train columns : ['Unnamed: 0', 'record_id', 'subject_id', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'outcome_BMI_fnl_BL', 'Glucose_BL', 'HOMA_IR_BL', 'Insulin_endo_BL', 'HDL_Total_Direct_lipid_BL', 'LDL_Calculated_BL', 'Triglyceride_lipid_BL', 'outcome_BMI_fnl_6m', 'Glucose_6m', 'HOMA_IR_6m', 'Insulin_endo_6m', 'HDL_Total_Direct_lipid_6m', 'LDL_Calculated_6m', 'Triglyceride_lipid_6m', 'outcome_BMI_fnl_12m', 'Glucose_12m', 'HOMA_IR_12m', 'Insulin_endo_12m', 'HDL_Total_Direct_lipid_12m', 'LDL_Calculated_12m', 'Triglyceride_lipid_12m']
Test columns: ['Unnamed: 0', 'record_id', 'subject_id', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'outcome_BMI_fnl_BL', 'Glucose_BL', 'HOMA_IR_BL', 'Insulin_endo_BL', 'HDL_Total_Direct_lipid_BL', 'LDL_Calculated_BL', 'Triglyceride_lipid_BL', 'outcome_BMI_fnl_6m', 'Glucose_6m', 'HOMA_IR_6m', 'Insulin_endo_6m', 'HDL_Total_Direct_lipid_6m', 'LDL_Calculated_6m', 'Triglyceride_lipid_6m', 'outcome_BMI_fnl_12m', 'Glucose_12m', 'HO

### Make long format

In [13]:
# Step 1: Identify columns to melt
value_vars = [col for col in train.columns if col.endswith(('_BL', '_6m', '_12m'))]
# Identify the original columns to keep
original_columns = [col for col in train.columns if col not in value_vars]
print("Train columns :", train.columns.to_list())
try:
   long_format = pd.melt(train, id_vars=train['subject_id'] + original_columns, value_vars=value_vars, 
                             var_name='variable', value_name='value')
except KeyError as e:
    print(f"KeyError: {e}")
    print("Available columns:", train.columns.tolist())

# Step 2: Melt the DataFrame
long_format = pd.melt(train, id_vars=train['subject_id'] + original_columns, value_vars=value_vars, 
                      var_name='variable', value_name='value')

# Step 3: Extract time information and prefix
long_format['time'] = long_format['variable'].str.extract(r'_(BL|6m|12m)')[0]
long_format['time'] = long_format['time'].replace({'BL': 0, '6m': 6, '12m': 12})

# Step 4: Extract the prefix (remove the suffix)
long_format['variable'] = long_format['variable'].str.replace(r'_(BL|6m|12m)', '', regex=True)

# Step 5: Pivot the DataFrame to get the desired format
final_long_format = long_format.pivot_table(index=['subject_id', 'time'] + original_columns, 
                                             columns='variable', 
                                             values='value').reset_index()

# Optional: Rename the columns to flatten the MultiIndex
final_long_format.columns.name = None  # Remove the name of the columns
final_long_format = final_long_format.rename_axis(None, axis=1)  # Remove the name of the index

# Display the final long format DataFrame
print(final_long_format)

Train columns : ['Unnamed: 0', 'record_id', 'subject_id', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'outcome_BMI_fnl_BL', 'Glucose_BL', 'HOMA_IR_BL', 'Insulin_endo_BL', 'HDL_Total_Direct_lipid_BL', 'LDL_Calculated_BL', 'Triglyceride_lipid_BL', 'outcome_BMI_fnl_6m', 'Glucose_6m', 'HOMA_IR_6m', 'Insulin_endo_6m', 'HDL_Total_Direct_lipid_6m', 'LDL_Calculated_6m', 'Triglyceride_lipid_6m', 'outcome_BMI_fnl_12m', 'Glucose_12m', 'HOMA_IR_12m', 'Insulin_endo_12m', 'HDL_Total_Direct_lipid_12m', 'LDL_Calculated_12m', 'Triglyceride_lipid_12m']


ValueError: operands could not be broadcast together with shapes (97,) (8,) 

In [9]:
# Step 1: Identify columns to melt
value_vars = [col for col in train.columns if col.endswith(('_BL', '_6m', '_12m'))]

# Step 2: Melt the DataFrame
long_format = pd.melt(train, id_vars=['subject_id'], value_vars=value_vars, 
                      var_name='variable', value_name='value')

# Step 3: Extract time information and prefix
long_format['time'] = long_format['variable'].str.extract(r'_(BL|6m|12m)')[0]
long_format['time'] = long_format['time'].replace({'BL': 0, '6m': 6, '12m': 12})

# Step 4: Extract the prefix (remove the suffix)
long_format['variable'] = long_format['variable'].str.replace(r'_(BL|6m|12m)', '', regex=True)

# Step 5: Pivot the DataFrame to get the desired format
final_long_format = long_format.pivot_table(index=['subject_id', 'time'], 
                                             columns='variable', 
                                             values='value').reset_index()

# Optional: Rename the columns to flatten the MultiIndex
final_long_format.columns.name = None  # Remove the name of the columns
final_long_format = final_long_format.rename_axis(None, axis=1)  # Remove the name of the index

# Display the final long format DataFrame
print(final_long_format)
print("Dimensions of full_long:", final_long_format.shape)

    subject_id  time    Glucose  HDL_Total_Direct_lipid   HOMA_IR  \
0      ACO-053     0  11.565222                6.636147  1.299463   
1      ACO-053     6  10.663916                6.611455  1.193159   
2      ACO-053    12  10.712033                7.619975  0.603652   
3      ADA-105     0  10.076629                4.136819  0.990680   
4      ADA-105     6  12.013779                4.166122  1.351743   
..         ...   ...        ...                     ...       ...   
286    YOR-103     6  12.013779                4.166122  1.351743   
287    YOR-103    12  10.952753                4.498539  0.540065   
288    YSU-097     0  11.794237                3.705900  1.325195   
289    YSU-097     6  12.958683                4.166122  1.242784   
290    YSU-097    12  12.517432                3.672277  0.617218   

     Insulin_endo  LDL_Calculated  Triglyceride_lipid  outcome_BMI_fnl  
0        1.311064        3.100593            1.467331         7.907706  
1        1.509470        

  long_format['time'] = long_format['time'].replace({'BL': 0, '6m': 6, '12m': 12})


In [7]:
# Step 1: Identify columns to melt
value_vars = [col for col in train.columns if col.endswith(('_BL', '_6m', '_12m'))]

# Step 2: Melt the DataFrame
long_format_train = pd.melt(train, id_vars=['subject_id'], value_vars=value_vars, 
                      var_name='variable', value_name='value')

# Step 3: Extract time information and prefix
long_format_train['time'] = long_format_train['variable'].str.extract(r'_(BL|6m|12m)')[0]
long_format_train['time'] = long_format_train['time'].replace({'BL': 0, '6m': 6, '12m': 12})

# Step 4: Extract the prefix (remove the suffix)
long_format_train['variable'] = long_format_train['variable'].str.replace(r'_(BL|6m|12m)', '', regex=True)

# Optional: Drop the original 'variable' column if not needed
# long_format = long_format.drop(columns=['variable'])

# Display the long format DataFrame
print(long_format_train)

     subject_id            variable     value  time
0       LFI-003     outcome_BMI_fnl  7.833650     0
1       ROL-006     outcome_BMI_fnl  8.042152     0
2       AKE-009     outcome_BMI_fnl  6.549259     0
3       HGI-010     outcome_BMI_fnl  6.708535     0
4       AKI-011     outcome_BMI_fnl  7.772516     0
...         ...                 ...       ...   ...
2032    SKA-195  Triglyceride_lipid  1.565815    12
2033    KHU-196  Triglyceride_lipid  1.275251    12
2034    LPF-198  Triglyceride_lipid  1.307536    12
2035    KEL-199  Triglyceride_lipid  1.985518    12
2036    BIN-201  Triglyceride_lipid  1.275251    12

[2037 rows x 4 columns]


  long_format_train['time'] = long_format_train['time'].replace({'BL': 0, '6m': 6, '12m': 12})


In [5]:
# Print subject_id values from train_long DataFrame
print("Subject IDs in train_long:", train['subject_id'].to_list())

# Print subject_id values from test_long DataFrame
print("Subject IDs in test_long:", test['subject_id'].to_list())

# Print subject_id values from full_long DataFrame
print("Subject IDs in full_long:", full['subject_id'].to_list())

Subject IDs in train_long: ['LFI-003', 'ROL-006', 'AKE-009', 'HGI-010', 'AKI-011', 'CSH-012', 'ASO-013', 'LBU-015', 'TFA-016', 'LVA-017', 'SBO-020', 'NTA-021', 'RHP-023', 'MST-025', 'CED-026', 'JFU-027', 'SSH-028', 'KGI-029', 'JUT-032', 'KCE-034', 'VCA-041', 'SCA-043', 'EKR-045', 'LBL-047', 'NBI-048', 'CLE-049', 'AWA-052', 'ACO-053', 'AHE-055', 'TSL-056', 'CAM-057', 'SGA-062', 'NDI-067', 'MES-068', 'NBI-069', 'AGA-071', 'KRI-072', 'MGA-076', 'SEG-080', 'AWA-083', 'KBU-085', 'TDU-086', 'MCA-088', 'MSH-091', 'EJO-092', 'YSU-097', 'YOR-103', 'BKN-104', 'ADA-105', 'JCA-109', 'LMC-111', 'MWE-112', 'TRO-113', 'RAE-114', 'TBU-115', 'JUG-116', 'MHO-117', 'MFB-118', 'MAR-119', 'KWA-122', 'JKN-127', 'AME-128', 'ATA-129', 'EBE-130', 'RLA-132', 'MWO-133', 'EKA-135', 'AKO-139', 'KWA-141', 'TSH-146', 'LEL-147', 'LDO-148', 'NPO-149', 'SDA-150', 'MWY-152', 'BMI-156', 'KBR-162', 'ALO-163', 'BMO-164', 'QNG-166', 'KHE-170', 'NCO-171', 'TFA-172', 'BSA-174', 'MBA-176', 'EHI-177', 'SLO-178', 'EPO-182', 'EVO

Make long format 

In [None]:
# Apply the function to each meta dataset
print("---------- Convert metadata to long format ----------")
full_long = make_long(full_raw)
full_long['x_t'] = full_long['subject_id'].astype(str) + '.' + full_long['time'].astype(str)

train_long = make_long(train)
train_long['x_t'] = train_long['subject_id'].astype(str) + '.' + train_long['time'].astype(str)

test_long = make_long(test)
test_long['x_t'] = test_long['subject_id'].astype(str) + '.' + test_long['time'].astype(str)

In [None]:
print("train data outcome_BMI_fnl values:", train_long['outcome_BMI_fnl'])
print("Full columns after transformation:", full_long.columns.to_list())
print("Train columns after transformation:", train_long.columns.to_list())
print("Test columns after transformation:", test_long.columns.to_list())

Drop columns only if they exist in the DataFrame (since some may not be present after merge)

In [5]:
columns_to_drop = ['Unnamed: 0', 'cohort_number', 'record_id', 'x_t']
full_long = full_long.drop([col for col in columns_to_drop if col in full.columns], axis=1)
train_long = train_long.drop([col for col in columns_to_drop if col in train_long.columns], axis=1)
test_long = test_long.drop([col for col in columns_to_drop if col in test_long.columns], axis=1)

In [None]:
# Check the final columns
print("Final columns after drop:", full_long.columns.to_list())
print("Final test columns after drop:", test_long.columns.to_list())

In [7]:
# Drop NA 
test_long = test_long.dropna()
train_long = train_long.dropna()
full_long = full_long.dropna()
raw_train = full_long[full_long['subject_id'].isin(train_long['subject_id'])]
raw_test = full_long[full_long['subject_id'].isin(test_long['subject_id'])]

In [None]:
print("---------- raw train and tax ----------")
print("raw_train shape = ", raw_train.shape)
print("raw_test shape = ", raw_test.shape)
print("---------- preprocessed train and tax ----------")
print("test_long shape = ", test_long.shape)
print("train_long shape = ", train_long.shape)

In [None]:
print("---------- Select predictors for Basic Raw training set ----------")
train_set = raw_train
X = train_set.drop(['outcome_BMI_fnl', 'subject_id'], axis=1)
#X = X.drop(columns=['Unnamed: 0_tax', 'x_t'], errors='ignore')
Y = train_set[['outcome_BMI_fnl']]
Y = Y['outcome_BMI_fnl'].to_numpy() # Convert Y to numeric array
clusters_train = train_set['subject_id'].to_numpy() # Get ID variables
Z = np.ones((train_set.shape[0], 1)) # Create random effects matrix with ones
time = train_set['time'].astype(float).to_numpy() # Get time values as numeric array 

# Check the final columns
print("Final columns after drop:", X.columns.to_list())
print("X train values:", train_set['outcome_BMI_fnl'])

print("---------- Select predictors for Basic Raw test set ----------")
test_set = raw_test
X_new = test_set.drop(['outcome_BMI_fnl', 'subject_id'], axis=1)
X_new = X_new[X.columns]  # Reorder and select columns to match training set
X_new = X_new.astype(X.dtypes)  # Ensure data types match

Y_new = test_set['outcome_BMI_fnl'].to_numpy()  # Convert Y to numeric array
clusters_new = pd.Series(test_set['subject_id'])  # Convert to pandas Series
# Create random effects matrix with ones
Z_new = np.ones((len(X_new), 1))
time_new = test_set['time'].astype(float).to_numpy()  # Convert time values to numeric array

In [None]:
print("---------- RUN MERF preprocessed with participant RE 🌱🌸 ----------")
 # Mixed Effects Random Forest Training with participant RE and time cluster 
train_set = train_long 
X_train = train_set.drop(['outcome_BMI_fnl', 'subject_id', 'time'], axis=1).to_numpy()
Z_train = np.array((np.ones(len(train_set)), train_set['subject_id'].apply(lambda s: int(s[-3:])))).T
clusters_train = pd.Series(train_set['subject_id'].apply(lambda s: int(s[-3:]))).astype(float)  # Convert to float if necessary
y_train = train_set[['outcome_BMI_fnl']]
y_train = y_train['outcome_BMI_fnl'].to_numpy() # Convert Y to numeric array

In [None]:
print(f"Dimensions of X_train: {X_train.shape}")
print(f"Dimensions of Z_train: {Z_train.shape}")
print(f"Number of unique inputs for clusters_train: {clusters_train.nunique()}")
print(f"Inputs to clusters_train: {clusters_train}")
print(f"Dimensions of y_train: {y_train.shape}")

In Mixed Effects Random Forests (MERF), the Generalized Log-Likelihood (GLL) is used to evaluate the quality of the model at each iteration

Test MERF if RE on test data

In [12]:
# Test data (repeat similar steps)
test_set = test_long
X_test = test_set.drop(['outcome_BMI_fnl', 'subject_id', 'time'], axis=1).to_numpy()
Z_test = np.array((np.ones(len(test_set)), test_set['subject_id'].apply(lambda s: int(s[-3:])))).T
clusters_test = pd.Series(test_set['subject_id'].apply(lambda s: int(s[-3:]))).astype(float)  # Convert to float if necessary

Fine Tuning wth PREV and PTEV.
ptev is calculated as the ratio of fixed effect variance to total variance, while prev is calculated as the ratio of random effect variance to total variance.

A higher PTEV is often preferable because it suggests that your model explains a larger portion of the observed variability. However, excessively high PTEV could signal overfitting, where the model memorizes rather than generalizes.

The optimal PREV depends on your data and goals. If your model is designed to capture group- or subject-level variability, a higher PREV might be desirable. However, in most cases, excessively high PREV might indicate the model isn't capturing key fixed effects, potentially leading to underfitting.

In [None]:
print("---------- MERF with finetuning RE 🌱 ----------")
import itertools 
import pandas as pd

print("---------- Select predictors for Basic Raw training set ----------")
train_set = raw_train
X = train_set.drop(['outcome_BMI_fnl', 'subject_id', 'x_t'], axis=1)
print(f"X dimensions: {X.shape}")
y = train_set[['outcome_BMI_fnl']]
y = y['outcome_BMI_fnl'].to_numpy() # Convert Y to numeric array
print(f"y dimensions: {y.shape}")
clusters = train_set['subject_id'].to_numpy() # Get ID variables
print(f"clusters dimensions: {clusters.shape}")
z = np.ones((train_set.shape[0], 1)) # Create random effects matrix with ones
print(f"z dimensions: {z.shape}")

# Hyperparameters to tune
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [None],
    'min_samples_split': [0.05, 0.1, 0.15],
    'max_iter': [2, 10],
    'n_splits': [3, 5, 10]  # Added n_splits for cross-validation
}

best_score = float('inf')
best_params = {}

# Initialize a list to store the results of each iteration
results = []

# Loop through all possible combinations of parameters
for params in itertools.product(*param_grid.values()):
        n_estimators, max_depth, min_samples_split, max_iter, n_splits = params
        print(f"Combination: {params}\n")
        scores = []
        prev = []
        ptev = []
        oob_scores = []  # Initialize a list to store OOB scores

        # K-fold cross-validation with variable n_splits
        kf = KFold(n_splits=n_splits)
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]  # Use .iloc for row selection
            y_train, y_test = y[train_index], y[test_index]
            clusters_train, clusters_test = clusters[train_index], pd.Series(clusters[test_index])
            z_train, z_test = z[train_index], z[test_index]
            model = MERF(
                # Specify the fixed effects model as a Random Forest Regressor
                fixed_effects_model=RandomForestRegressor(
                    n_estimators=n_estimators,  # Number of trees in the forest
                    max_depth=max_depth,  # Maximum depth of each tree
                    min_samples_split=min_samples_split,  # Minimum samples required to split an internal node
                    n_jobs=1,  # Number of jobs to run in parallel
                    oob_score=True  # Whether to use out-of-bag samples to estimate the R^2 on unseen data
                ),
                # Generalized Linear Model (GLM) early stopping threshold
                gll_early_stop_threshold=None,  # No early stopping threshold set
                # Maximum number of iterations for the MERF algorithm
                max_iterations=max_iter  # Maximum number of iterations to run the MERF algorithm
            )
            model.fit(X_train.select_dtypes(include=[np.number]), z_train, pd.Series(clusters_train), y_train)
            y_pred = model.predict(X_test, z_test, clusters_test)
            scores.append(np.mean((y_pred - y_test) ** 2)) # MSE
            
            # Calculate ptev and prev
            total_variance = np.var(y_test) #calculates the total variance of the predicted values
            random_effect_variance = np.var(y_test - y_pred)  # Variance of residuals
            fixed_effect_variance = total_variance - random_effect_variance

            ptev.append(np.mean(fixed_effect_variance / total_variance if total_variance > 0 else 0))
            prev.append(np.mean(random_effect_variance / total_variance if total_variance > 0 else 0))

            # Calculate OOB score
            forest = model.trained_fe_model
            oob_score = round(forest.oob_score_*100, 1)  # percent variation
            oob_scores.append(oob_score)  # Append OOB score to the list

            # Print ptev, prev, and OOB score for the current iteration
            print(f"Combination, ptev: {np.mean(ptev):.4f}, prev: {np.mean(prev):.4f}, OOB Score: {oob_score:.4f}")

        # Calculate the mean of the scores for the current combination of parameters
        mean_score = np.mean(scores)
        mean_prev = np.mean(prev)
        mean_ptev = np.mean(ptev)
        mean_oob_score = np.mean(oob_scores)  # Calculate the mean of OOB scores
        if mean_score < best_score:
            best_score = mean_score
            best_params = params

        # Append the results of the current iteration to the results list
        # Create a result dictionary with individual scores and mean scores
        result_dict = {
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'max_iter': max_iter,
            'n_splits': n_splits,  # Added n_splits to the result dictionary
            'mse_score_1': scores[0] if len(scores) > 0 else None,
            'mse_score_2': scores[1] if len(scores) > 1 else None,
            'mse_score_3': scores[2] if len(scores) > 2 else None,
            'mse_score_4': scores[3] if len(scores) > 3 else None,
            'mse_score_5': scores[4] if len(scores) > 4 else None,
            'mse_score_6': scores[5] if len(scores) > 5 else None,
            'mse_score_7': scores[6] if len(scores) > 6 else None,
            'mse_score_8': scores[7] if len(scores) > 7 else None,
            'mse_score_9': scores[8] if len(scores) > 8 else None,
            'mse_score_10': scores[9] if len(scores) > 9 else None,
            'mean_mse_score': mean_score,
            'prev_1': prev[0] if len(prev) > 0 else None,
            'prev_2': prev[1] if len(prev) > 1 else None,
            'prev_3': prev[2] if len(prev) > 2 else None,
            'prev_4': prev[3] if len(prev) > 3 else None,
            'prev_5': prev[4] if len(prev) > 4 else None,
            'prev_6': prev[5] if len(prev) > 5 else None,
            'prev_7': prev[6] if len(prev) > 6 else None,
            'prev_8': prev[7] if len(prev) > 7 else None,
            'prev_9': prev[8] if len(prev) > 8 else None,
            'prev_10': prev[9] if len(prev) > 9 else None,
            'mean_prev': mean_prev,
            'ptev_1': ptev[0] if len(ptev) > 0 else None,
            'ptev_2': ptev[1] if len(ptev) > 1 else None,
            'ptev_3': ptev[2] if len(ptev) > 2 else None,
            'ptev_4': ptev[3] if len(ptev) > 3 else None,
            'ptev_5': ptev[4] if len(ptev) > 4 else None,
            'ptev_6': ptev[5] if len(ptev) > 5 else None,
            'ptev_7': ptev[6] if len(ptev) > 6 else None,
            'ptev_8': ptev[7] if len(ptev) > 7 else None,
            'ptev_9': ptev[8] if len(ptev) > 8 else None,
            'ptev_10': ptev[9] if len(ptev) > 9 else None,
            'mean_ptev': mean_ptev,
            'oob_1': oob_scores[0] if len(oob_scores) > 0 else None,
            'oob_2': oob_scores[1] if len(oob_scores) > 1 else None,
            'oob_3': oob_scores[2] if len(oob_scores) > 2 else None,
            'oob_4': oob_scores[3] if len(oob_scores) > 3 else None,
            'oob_5': oob_scores[4] if len(oob_scores) > 4 else None,
            'oob_6': oob_scores[5] if len(oob_scores) > 5 else None,
            'oob_7': oob_scores[6] if len(oob_scores) > 6 else None,
            'oob_8': oob_scores[7] if len(oob_scores) > 7 else None,
            'oob_9': oob_scores[8] if len(oob_scores) > 8 else None,
            'oob_10': oob_scores[9] if len(oob_scores) > 9 else None,
            'oob_score': mean_oob_score
        }
        # Append the result dictionary to the results list
        results.append(result_dict)

print("Best parameters:", best_params)
print("Best score:", best_score)

# Convert the results list to a DataFrame and save it to a CSV file
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs/1.clinical"
results_df = pd.DataFrame(results)
results_df.to_csv(f'{df_dir}/dec5_maggie_params_tuning_raw_clinical_oob.csv', index=False)

In [None]:
print("Best parameters:", best_params)
print("Best score:", best_score)