In [None]:
import pandas as pd
import os
import re
import numpy as np
from merf import MERF
import matplotlib.pyplot as plt
import seaborn as sns
import itertools 
sns.set_context("poster")
from sklearn.ensemble import RandomForestRegressor
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (11,8)
from merf.merf import MERF
from sklearn.model_selection import train_test_split, KFold
from merf.viz import plot_merf_training_stats
from em_utils import *
# ---- Paths -----------------------------------------------------------------
# Root of the project checkout. Defaults to the original author's location;
# set the MULTI_OMIC_ROOT environment variable to run on another machine.
PROJECT_ROOT = os.environ.get(
    "MULTI_OMIC_ROOT",
    "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions",
)
MERF_DIR = f"{PROJECT_ROOT}/play_scripts/2.models/merf_python"

# Create output directories if they don't exist. df_dir receives the tuning
# results CSV later in the notebook, so it has to exist as well.
output_dir = f"{MERF_DIR}/merf_plots/2.taxa"
df_dir = f"{MERF_DIR}/merf_dfs/2.taxa"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(df_dir, exist_ok=True)

# ---- Taxonomy (CLR-transformed genus tables) --------------------------------
print("---------- Read taxonomy data ---------- ")
t_dir = f"{PROJECT_ROOT}/data/taxa/aim2_transformed/"
tax_test = read_data(t_dir, "genus/aim2_clr_testing.csv")
tax_train = read_data(t_dir, "genus/aim2_clr_training.csv")
tax_full = read_data(t_dir, "genus/clr_taxa_all.csv")

# ---- Clinical metadata -------------------------------------------------------
print("---------- Read metadata ----------")
m1_dir = f"{PROJECT_ROOT}/data/clinical/transformed/aim2"
test = read_data(m1_dir, "a2_test_samples_standard_clinical.csv")
train = read_data(m1_dir, "a2_train_samples_standard_clinical.csv")
full = read_data(m1_dir, "a2_meta_Transformed_standard_clinical.csv")
full_raw = read_data(m1_dir, "a2_meta_not_Transformed_standard_clinical.csv")

In [None]:
# ---- Taxa input: FULL dataset ----
# Column that holds the combined sample label; a later cell splits it on '.'
# into character_id and timepoint.
X_LABEL = 'Unnamed: 0'

# Work on a copy so the frame read from disk is left untouched.
tax_full_t = tax_full.copy()

print("---------- Split X column into character_id and timepoint ----------")
sample_labels = tax_full_t[X_LABEL]
print(sample_labels)

In [3]:
# Break every sample label at '.' and store the two pieces as separate columns.
# NOTE: a label containing more than one '.' would produce extra columns and
# make the two-column assignment below fail.
label_parts = tax_full_t[X_LABEL].str.split('.', expand=True)
tax_full_t[['character_id', 'timepoint']] = label_parts

In [None]:
print("---------- Create time column ----------")
# create_t_column is not defined in this notebook (presumably provided by the
# em_utils star import); its result is stored as the time label 't'.
time_labels = create_t_column(tax_full_t)
tax_full_t['t'] = time_labels

print("---------- Create x_t column combining character_id and t ----------")
# x_t = "<character_id>.<t>" is the key used later to join taxa with metadata.
id_part = tax_full_t['character_id']
tax_full_t['x_t'] = id_part + '.' + tax_full_t['t']

In [None]:
print("---------- Filter and select columns ----------")
# Leave out the rows labelled with time '3' or '18', then remove the
# bookkeeping columns so only x_t and the remaining data columns are kept.
excluded_times = ['3', '18']
keep_rows = ~tax_full_t['t'].isin(excluded_times)
bookkeeping_cols = ['t', 'timepoint', 'character_id', X_LABEL]
tax = tax_full_t.loc[keep_rows].drop(columns=bookkeeping_cols)

In [None]:
print("---------- Build training dataset ----------")
train_t = tax_train.copy()

# Split the combined sample label into character_id / timepoint.
train_label_parts = train_t[X_LABEL].str.split('.', expand=True)
train_t[['character_id', 'timepoint']] = train_label_parts

# Time label and the "<character_id>.<t>" join key.
train_t['t'] = create_t_column(train_t)
train_t['x_t'] = train_t['character_id'] + '.' + train_t['t']

# Discard the rows labelled with time '3' or '18'.
train_keep = ~train_t['t'].isin(['3', '18'])
train_t = train_t.loc[train_keep]


In [None]:
print("---------- Build testing dataset ----------")
test_t = tax_test.copy()

# Derive the character_id / timepoint pair from the combined sample label.
test_label_parts = test_t[X_LABEL].str.split('.', expand=True)
test_t[['character_id', 'timepoint']] = test_label_parts

# Time label, then the "<character_id>.<t>" key used for merging.
test_t['t'] = create_t_column(test_t)
test_t['x_t'] = test_t['character_id'] + '.' + test_t['t']

# Rows labelled with time '3' or '18' are not used.
dropped_times = ['3', '18']
test_t = test_t.loc[~test_t['t'].isin(dropped_times)]

In [None]:
# Sanity check: print whether the metadata frames share the same column layout
# (raw vs. train, then train vs. test).
raw_cols = full_raw.columns.to_list()
train_cols = train.columns.to_list()
test_cols = test.columns.to_list()
print(raw_cols == train_cols)
print(train_cols == test_cols)

In [None]:
def _long_with_key(wide_meta):
    """Reshape a metadata frame with make_long and add the 'x_t' join key.

    The key is "<subject_id>.<time>", both parts cast to string.
    """
    long_meta = make_long(wide_meta)
    subject = long_meta['subject_id'].astype(str)
    timepoint = long_meta['time'].astype(str)
    long_meta['x_t'] = subject + '.' + timepoint
    return long_meta


# Convert each metadata frame (raw full, train, test) in turn.
print("---------- Convert metadata to long format ----------")
full_long = _long_with_key(full_raw)
train_long = _long_with_key(train)
test_long = _long_with_key(test)

In [None]:
print("---------- Select and prepare metadata for merging ----------")
# Only the join key and the outcome are carried over from the metadata.
meta_cols = ['x_t', 'outcome_BMI_fnl']
full_meta = full_long[meta_cols]
test_meta = test_long[meta_cols]
train_meta = train_long[meta_cols]
print("full_meta shape = ", full_meta.shape)

# Identifier columns removed from the train_tax / test_tax frames after merging.
id_cols = ['x_t', X_LABEL, 'character_id', 'timepoint']

print("---------- Merge training data ----------")
# full_train_tax: training taxa joined to the outcome from the full (raw) metadata.
full_train_tax = pd.merge(train_t, full_meta, on='x_t', how='inner')
# train_tax: training taxa joined to the outcome from the training metadata.
train_tax = pd.merge(train_t, train_meta, on='x_t').drop(columns=id_cols)
print("train_tax shape = ", train_tax.shape)
print("full_train_tax shape = ", full_train_tax.shape)

print("---------- Merge testing data ----------")
# Same two joins for the testing taxa.
full_test_tax = pd.merge(test_t, full_meta, on='x_t', how='inner')
test_tax = pd.merge(test_t, test_meta, on='x_t').drop(columns=id_cols)
print("test_tax shape = ", test_tax.shape)
print("full_test_tax shape = ", full_test_tax.shape)


In [None]:
# Inspect the columns of the tax_full_t DataFrame
print(tax_full_t.columns.to_list())

# Left-join the long metadata onto the taxa frame. Any non-key column present in
# both frames is renamed with the suffixes below instead of pandas' default
# '_x' / '_y'.
print("---------- Merge full dataset ----------")
full = tax_full_t.merge(full_long, on='x_t', how='left', suffixes=('_tax', '_long'))
print("Columns after merge:", full.columns.to_list())

# Columns to remove after the merge. Because of the suffixes above, an
# 'Unnamed: 0' column that exists in both inputs shows up as
# 'Unnamed: 0_tax' / 'Unnamed: 0_long', so those names are listed too.
columns_to_drop = ['Unnamed: 0', 'Unnamed: 0_tax', 'Unnamed: 0_long', 'X.y', 'X.x',
                   'randomized_group', 'cohort_number', 'record_id',
                   'subject_id', 'character_id', 'age', 'race', 'sex',
                   'time', 'timepoint', 'HOMA_IR', 'Insulin_endo', 'HDL_Total_Direct_lipid',
                   'Glucose', 'LDL_Calculated', 'Triglyceride_lipid']

# errors='ignore' skips names that are not present after the merge.
full = full.drop(columns=columns_to_drop, errors='ignore')
print("Final columns after drop:", full.columns.to_list())

In [None]:
print("---------- Remove NAs and filter by time ----------")
# Complete-case version of the merged full frame.
full_no_na = full.dropna()

# Show the rows of the merged frame whose time label 't' is missing.
missing_t = full['t'].isna()
nan_rows = full.loc[missing_t]
print("\nRows with NaN in 't' column:")
print(nan_rows)
print(f"\nTotal rows with NaN in 't': {len(nan_rows)}")

# Complete-case versions of the four modelling frames.
test_tax_no_na = test_tax.dropna()
train_tax_no_na = train_tax.dropna()
full_train_tax = full_train_tax.dropna()
full_test_tax = full_test_tax.dropna()

print("test_tax_no_na shape = ", test_tax_no_na.shape)
print("train_tax_no_na shape = ", train_tax_no_na.shape)
print("---------- raw train and tax ----------")
print("full_train_tax shape = ", full_train_tax.shape)
print("full_test_tax shape = ", full_test_tax.shape)

print("---------- Create demo datasets filtered by time ----------")
# Time-based split of the complete cases: t < 12 for training, t == 12 for testing.
t_as_int = full_no_na['t'].astype(int)
demo_train = full_no_na.loc[t_as_int < 12]
demo_test = full_no_na.loc[t_as_int == 12]
print("demo_test shape = ", demo_test.shape)


In [13]:
# Disabled export: uncomment the lines below to write the NA-free / merged
# train and test frames to CSV files under data_dir.
#data_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/taxa/aim2_transformed/genus/merf_ready_sets/"
#test_tax_no_na.to_csv(os.path.join(data_dir, 'test_tax_no_na.csv'), index=False)  # Save test_tax_no_na
#train_tax_no_na.to_csv(os.path.join(data_dir, 'train_tax_no_na.csv'), index=False)  # Save train_tax_no_na
#full_train_tax.to_csv(os.path.join(data_dir, 'full_train_tax.csv'), index=False)  # Save full_train_tax
#full_test_tax.to_csv(os.path.join(data_dir, 'full_test_tax.csv'), index=False) 

Make train set

In [None]:
print("---------- Select predictors for training set ----------")
train_set = full_train_tax

# Columns that must exist but are not predictors (time, outcome, cluster id).
non_predictors = ['t', 'outcome_BMI_fnl', 'all_samples']
# Identifier / label columns that train_set may still carry from the taxa frame.
# They are strings, not features, so they are removed when present.
id_columns = ['Unnamed: 0_tax', 'x_t', 'Unnamed: 0', 'character_id', 'timepoint']

X = (
    train_set
    .drop(columns=non_predictors)                # raises if a required column is missing
    .drop(columns=id_columns, errors='ignore')   # optional columns: skip if absent
)
Y = train_set['outcome_BMI_fnl'].to_numpy()  # Outcome as a numpy array
clusters_train = train_set['all_samples'].to_numpy()  # Get ID variables
Z = np.ones((train_set.shape[0], 1))  # Create random effects matrix with ones
time = train_set['t'].astype(float).to_numpy()  # Get time values as numeric array

Make test set

In [None]:
print("---------- Select predictors for test set ----------")
test_set = full_test_tax

# Identifier / label columns that are never used as predictors.
id_columns = ['Unnamed: 0_tax', 'x_t', 'Unnamed: 0', 'character_id', 'timepoint']
# Feature columns = the training predictors (X, built in the training-set cell)
# minus any identifier columns, in training order.
feature_columns = [col for col in X.columns if col not in id_columns]

# Drop the non-predictor columns once, then select, order and type the test
# features to match the training features.
X_new = test_set.drop(columns=['t', 'outcome_BMI_fnl', 'all_samples'])
X_new = X_new[feature_columns].astype(X.dtypes[feature_columns])

Y_new = test_set['outcome_BMI_fnl'].to_numpy()  # Convert Y to numeric array
clusters_new = pd.Series(test_set['all_samples'])  # Convert to pandas Series
# Create random effects matrix with ones
Z_new = np.ones((len(X_new), 1))
time_new = test_set['t'].astype(float).to_numpy()  # Convert time values to numeric array

In [None]:
print("---------- MERF with finetuning RE 🤞🏼 ----------")

# Fixed seed for the random forests so the tuning table is reproducible.
RANDOM_STATE = 42

train_set = full_train_tax
columns_to_drop = ['t', 'outcome_BMI_fnl', 'all_samples',
                   'Unnamed: 0_tax', 'x_t', 'character_id',
                   'timepoint', 'Unnamed: 0']

# Create training features. Numeric columns are selected once, up front, so that
# fit() and predict() always receive exactly the same feature set.
X = train_set.drop(columns=columns_to_drop, errors='ignore')
X = X.select_dtypes(include=[np.number])
y = train_set['outcome_BMI_fnl'].to_numpy()  # Outcome as a numpy array
clusters = train_set['all_samples'].to_numpy()  # Get ID variables
z = np.ones((train_set.shape[0], 1))  # Create random effects matrix with ones

# Hyperparameters to tune
param_grid = {
    'n_estimators': [100, 500, 1000],
    'max_depth': [None],
    'min_samples_split': [0.05, 0.1, 0.15],
    'max_iter': [2, 10],
    'n_splits': [3, 5, 10]  # Added n_splits for cross-validation
}

# Number of per-fold columns in the results table: one per fold of the largest CV.
max_folds = max(param_grid['n_splits'])


def _fold_columns(prefix, values, width):
    """Spread per-fold values over `width` columns named '<prefix>_1'..'<prefix>_<width>'.

    Folds that were not run for the current n_splits are filled with None.
    """
    return {
        f'{prefix}_{i + 1}': (values[i] if i < len(values) else None)
        for i in range(width)
    }


best_score = float('inf')
best_params = {}

# One result row per parameter combination
results = []

# Loop through all possible combinations of parameters
for params in itertools.product(*param_grid.values()):
    n_estimators, max_depth, min_samples_split, max_iter, n_splits = params
    print(f"Combination: {params}\n")
    scores = []      # per-fold MSE
    prev = []        # per-fold Var(residual) / Var(y_test)
    ptev = []        # per-fold 1 - Var(residual) / Var(y_test)
    oob_scores = []  # per-fold out-of-bag score of the forest, in percent

    # K-fold cross-validation with variable n_splits.
    # NOTE(review): KFold splits rows, not clusters, so samples from one cluster
    # can land in both the training and the test fold. Consider GroupKFold if the
    # goal is to evaluate on unseen clusters.
    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]  # Use .iloc for row selection
        y_train, y_test = y[train_index], y[test_index]
        clusters_train, clusters_test = clusters[train_index], pd.Series(clusters[test_index])
        z_train, z_test = z[train_index], z[test_index]
        model = MERF(
            # Fixed-effects learner: a random forest regressor
            fixed_effects_model=RandomForestRegressor(
                n_estimators=n_estimators,  # Number of trees in the forest
                max_depth=max_depth,  # Maximum depth of each tree
                min_samples_split=min_samples_split,  # Float => fraction of samples needed to split a node
                n_jobs=1,  # Number of jobs to run in parallel
                oob_score=True,  # Keep the out-of-bag R^2 estimate
                random_state=RANDOM_STATE,  # Reproducible bootstrap / feature sampling
            ),
            # No early stopping on the generalized log-likelihood (GLL)
            gll_early_stop_threshold=None,
            # Maximum number of iterations of the MERF algorithm
            max_iterations=max_iter,
        )
        model.fit(X_train, z_train, pd.Series(clusters_train), y_train)
        y_pred = model.predict(X_test, z_test, clusters_test)
        scores.append(np.mean((y_pred - y_test) ** 2))  # MSE

        # Residual-based variance ratios on the held-out fold.
        # NOTE(review): despite the variable names, these are computed from the
        # observed outcomes and the prediction residuals, not from the fitted
        # random-effect covariance; confirm this is the intended definition.
        total_variance = np.var(y_test)  # variance of the observed outcomes
        random_effect_variance = np.var(y_test - y_pred)  # variance of residuals
        fixed_effect_variance = total_variance - random_effect_variance

        if total_variance > 0:
            ptev.append(fixed_effect_variance / total_variance)
            prev.append(random_effect_variance / total_variance)
        else:
            # Constant outcome in this fold: ratios are undefined, record 0.
            ptev.append(0.0)
            prev.append(0.0)

        # Out-of-bag score of the forest trained inside MERF
        forest = model.trained_fe_model
        oob_score = round(forest.oob_score_*100, 1)  # percent variation
        oob_scores.append(oob_score)

        # Running means of ptev / prev so far, plus this fold's OOB score
        print(f"Combination, ptev: {np.mean(ptev):.4f}, prev: {np.mean(prev):.4f}, OOB Score: {oob_score:.4f}")

    # Aggregate the folds for the current combination of parameters
    mean_score = np.mean(scores)
    mean_prev = np.mean(prev)
    mean_ptev = np.mean(ptev)
    mean_oob_score = np.mean(oob_scores)
    if mean_score < best_score:
        best_score = mean_score
        # Keep the winning combination keyed by parameter name
        best_params = dict(zip(param_grid, params))

    # One row per combination: parameters, per-fold values, and their means.
    # Key order determines the column order of the CSV.
    result_dict = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'max_iter': max_iter,
        'n_splits': n_splits,
        **_fold_columns('mse_score', scores, max_folds),
        'mean_mse_score': mean_score,
        **_fold_columns('prev', prev, max_folds),
        'mean_prev': mean_prev,
        **_fold_columns('ptev', ptev, max_folds),
        'mean_ptev': mean_ptev,
        **_fold_columns('oob', oob_scores, max_folds),
        'oob_score': mean_oob_score,
    }
    results.append(result_dict)

print("Best parameters:", best_params)
print("Best score:", best_score)

# Convert the results list to a DataFrame and save it to a CSV file.
# df_dir is defined in the setup cell; make sure it exists before writing.
os.makedirs(df_dir, exist_ok=True)
results_df = pd.DataFrame(results)
results_df.to_csv(f'{df_dir}/dec6_maggie_params_tuning_raw_taxa_oob.csv', index=False)

# this took 130 minutes to run 

In [None]:
# Report the best combination found by the tuning loop and its mean MSE.
print(f"Best parameters: {best_params!s}\n")
print(f"Best score: {best_score!s}\n")

    'n_estimators': [100],
    'max_depth': [100],
    'min_samples_split': [2],
    'max_iter': [2, 3, 10, 20]

Once fit, the model can be used to predict on new samples given X, Z, and the cluster IDs. The predict code decides whether to apply the random-effect correction based on whether the ID of the new sample was seen during training.

TODO: Look up longitudinal R-squared values.