In [1]:
import pandas as pd
import os
import re
import numpy as np
from merf import MERF
import matplotlib.pyplot as plt
import seaborn as sns
import itertools 
sns.set_context("poster")
from sklearn.ensemble import RandomForestRegressor
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (11,8)
from merf.merf import MERF
from sklearn.model_selection import train_test_split, KFold
from merf.viz import plot_merf_training_stats
from em_utils import *
# Output locations for this notebook: figures go to `output_dir`, result tables to `df_dir`.
# NOTE(review): these are absolute, machine-specific paths; consider deriving them from a
# configurable project root so the notebook runs on other machines.
output_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_plots/3.functional"
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs/3.functional"
# Create BOTH directories up front. The tuning results are written into `df_dir` at the very
# end of the notebook, so a missing directory would only surface after the full grid search.
for _required_dir in (output_dir, df_dir):
    os.makedirs(_required_dir, exist_ok=True)

In [None]:
# Load the functional (CLR) feature tables and the clinical metadata, then dump a few
# identifier columns so the join keys can be eyeballed before merging.
# `read_data` comes from the star import of `em_utils`.
print("---------- Read data ----------")
func_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/functional/aim2"
test, train = [
    read_data(func_dir, file_name)
    for file_name in ("all_clr_testing.csv", "all_clr_training.csv")
]

print("---------- Read metadata ----------")
m1_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2"
met_test, met_train, met_full, met_full_raw = [
    read_data(m1_dir, file_name)
    for file_name in (
        "a2_test_samples_standard_clinical.csv",
        "a2_train_samples_standard_clinical.csv",
        "a2_meta_Transformed_standard_clinical.csv",
        "a2_meta_not_Transformed_standard_clinical.csv",
    )
]

# --- metadata: column names, then one identifier column at a time (test first, then train)
print("---------- Columns in meta data ----------")
for meta_frame in (met_test, met_train):
    print(meta_frame.columns.to_list())

for banner, col_name in (
    ("---------- Columns in meta unnamed ----------", 'Unnamed: 0'),
    ("---------- meta subject_ids ----------", 'subject_id'),
    ("---------- Meta record_id ----------", 'record_id'),
):
    print(banner)
    for meta_frame in (met_test, met_train):
        print(meta_frame[col_name])

# --- functional data: column names and whether the two splits share the same layout
print("---------- Columns in functional data ----------")
test_columns = test.columns.to_list()
train_columns = train.columns.to_list()
print(test_columns)
print(train_columns)
print("---------- Do test and train data have the same columns? ----------")
# The comparison is printed in both directions, as before (list equality is symmetric).
print(test_columns == train_columns)
print(train_columns == test_columns)

# --- functional data: identifier columns (train first, then test)
for banner, col_name in (
    ("---------- WTF is the unnammed ----------", 'Unnamed: 0'),
    ("---------- Sample _ IDs ----------", 'SampleID'),
    ("---------- All Samples ----------", 'all_samples'),
):
    print(banner)
    for func_frame in (train, test):
        print(func_frame[col_name])



In [None]:
# Split SampleID ("<character_id>.<timepoint>") into its two parts.
# With expand=True the split must yield exactly two columns, i.e. at most one '.' per SampleID;
# otherwise the two-column assignment below raises.
print("---------- Split SampleID column into character_id and timepoint ---------")
train[['character_id', 'timepoint']] = train['SampleID'].str.split('.', expand=True)
test[['character_id', 'timepoint']] = test['SampleID'].str.split('.', expand=True)

print(test.columns.to_list())
print(train.columns.to_list())

# Remove the 3- and 18-month visits.
# The later recoding step maps 'BL'/'6m'/'12m', so visit labels appear to carry an 'm' suffix;
# filtering only on '3'/'18' would then match nothing. Both spellings are excluded here.
# NOTE(review): confirm the actual label format in SampleID.
excluded_timepoints = ['3', '18', '3m', '18m']
# .copy() makes each result an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
train = train[~train['timepoint'].isin(excluded_timepoints)].copy()
test = test[~test['timepoint'].isin(excluded_timepoints)].copy()


In [None]:
# Show the column layout of both splits (test first, then train).
for split_frame in (test, train):
    print(split_frame.columns.to_list())

Edit metadata prior to merge

In [None]:
# Reshape each metadata table with `make_long` (from em_utils) and attach the merge key
# x_t = "<subject_id>.<time>".
# Note: the "full" long table is built from `met_full_raw` (read from the "not_Transformed"
# file), not from `met_full`.
print("---------- Convert metadata to long format ----------")
long_tables = []
for wide_meta in (met_full_raw, met_train, met_test):
    long_meta = make_long(wide_meta)
    subject_part = long_meta['subject_id'].astype(str)
    time_part = long_meta['time'].astype(str)
    long_meta['x_t'] = subject_part + '.' + time_part
    long_tables.append(long_meta)

met_full_long, met_train_long, met_test_long = long_tables


In [None]:
# Preview the identifiers on both sides of the upcoming merge.
print("taxa character_id", train['character_id'])
print("meta x_t", met_full_long['x_t'])

# Visit label -> value written to the `time` column. Labels not listed here pass through
# unchanged. NOTE(review): presumably these match the metadata `time` values; confirm.
TIMEPOINT_TO_TIME = {'BL': '0', '12m': '12', '6m': '6'}
# Visits that are excluded from the analysis.
EXCLUDED_VISITS = ['3m', '18m']


def _add_time_and_key(frame):
    """Return a new frame with `time` and `x_t` columns added and excluded visits removed.

    `time` is `timepoint` recoded through TIMEPOINT_TO_TIME; rows whose `time` is in
    EXCLUDED_VISITS are dropped; `x_t` is "<character_id>.<time>". The input frame is not
    modified, so no column is ever assigned onto a filtered slice (which is what raised
    SettingWithCopyWarning before).
    """
    recoded = frame.assign(time=frame['timepoint'].replace(TIMEPOINT_TO_TIME))
    kept = recoded.loc[~recoded['time'].isin(EXCLUDED_VISITS)].copy()
    kept['x_t'] = kept['character_id'].astype(str) + '.' + kept['time'].astype(str)
    return kept


test = _add_time_and_key(test)
train = _add_time_and_key(train)


In [None]:
# Preview the merge key built for the test samples.
print("test x_t", test.loc[:, 'x_t'])

In [None]:
# Attach the BMI outcome to the feature tables by joining on `x_t`.
# Two variants are built per split: one joined against the full long metadata
# (full_*_tax, all columns kept) and one joined against the split's own metadata
# (*_tax, with the helper columns x_t / character_id / timepoint removed; `time` is kept).
print("---------- Select and prepare metadata for merging ----------")
outcome_cols = ['x_t', 'outcome_BMI_fnl']
met_full_long = met_full_long[outcome_cols]
met_test_long = met_test_long[outcome_cols]
met_train_long = met_train_long[outcome_cols]
print("full_meta shape = ", met_full_long.shape)

helper_cols = ['x_t', 'character_id', 'timepoint']

print("---------- Merge training data ----------")
full_train_tax = pd.merge(train, met_full_long, on='x_t', how='inner')
train_tax = pd.merge(train, met_train_long, on='x_t').drop(columns=helper_cols)
print("train_tax shape = ", train_tax.shape)
print("full_train_tax shape = ", full_train_tax.shape)

print("---------- Merge testing data ----------")
full_test_tax = pd.merge(test, met_full_long, on='x_t', how='inner')
test_tax = pd.merge(test, met_test_long, on='x_t').drop(columns=helper_cols)
print("test_tax shape = ", test_tax.shape)
print("full_test_tax shape = ", full_test_tax.shape)


In [9]:
# Keep only complete rows (any missing value drops the row).
# The split-specific tables get new *_no_na names; the full_* tables are replaced in place.
test_tax_no_na = test_tax.dropna(axis=0, how='any')
train_tax_no_na = train_tax.dropna(axis=0, how='any')
full_train_tax = full_train_tax.dropna(axis=0, how='any')
full_test_tax = full_test_tax.dropna(axis=0, how='any')


In [10]:
#data_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/functional/aim2/"
#test_tax_no_na.to_csv(os.path.join(data_dir, 'test_functional_no_na.csv'), index=False)  # Save test_tax_no_na
#train_tax_no_na.to_csv(os.path.join(data_dir, 'train_functional_no_na.csv'), index=False)  # Save train_tax_no_na
#full_train_tax.to_csv(os.path.join(data_dir, 'full_train_functional.csv'), index=False)  # Save full_train_tax
#full_test_tax.to_csv(os.path.join(data_dir, 'full_test_functional.csv'), index=False) 

Make Train Set

In [None]:
# List every column of the merged training table before choosing predictors.
full_train_columns = full_train_tax.columns.to_list()
print("Columns full :", full_train_columns)

In [None]:
# Assemble the MERF inputs for the training split:
#   X              - predictor columns (identifier, key and outcome columns removed)
#   Y              - outcome_BMI_fnl as a numpy array
#   clusters_train - the `all_samples` column as a numpy array, used as the cluster id
#                    (NOTE(review): confirm this column identifies the grouping unit intended)
#   Z              - a single column of ones (random intercept only)
print("---------- Select predictors for training set ----------")
train_set = full_train_tax
non_predictor_cols = [
    'Unnamed: 0', 'SampleID', 'all_samples', 'character_id',
    'timepoint', 'time', 'x_t', 'outcome_BMI_fnl',
]
X = train_set.drop(columns=non_predictor_cols)
Y = train_set['outcome_BMI_fnl'].to_numpy()
clusters_train = train_set['all_samples'].to_numpy()
Z = np.ones((len(train_set), 1))


In [None]:
# Assemble the MERF inputs for the test split, aligned to the training predictors `X`
# (same columns, same order, same dtypes).
print("---------- Select predictors for test set ----------")
test_set = full_test_tax
non_predictor_cols_test = [
    'Unnamed: 0', 'SampleID', 'all_samples', 'character_id',
    'timepoint', 'time', 'x_t', 'outcome_BMI_fnl',
]
X_new = (
    test_set
    .drop(columns=non_predictor_cols_test)
    .loc[:, X.columns]      # reorder / select columns to match the training set
    .astype(X.dtypes)       # match the training dtypes column by column
    # Defensive removal of leftover identifier columns; missing names are ignored.
    .drop(
        columns=['Unnamed: 0_tax', 'x_t', 'Unnamed: 0', 'character_id', 'timepoint'],
        errors='ignore',
    )
)
Y_new = test_set['outcome_BMI_fnl'].to_numpy()
clusters_new = pd.Series(test_set['all_samples'])
# Random-effects design: one column of ones, one row per test sample.
Z_new = np.ones((X_new.shape[0], 1))


In [14]:
# Search space for the tuning loop.
# Key order is significant: the loop unpacks itertools.product(*param_grid.values())
# positionally as (n_estimators, max_depth, min_samples_split, max_iter, n_splits).
param_grid = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[None],
    min_samples_split=[0.05, 0.1, 0.15],
    max_iter=[2, 10],
    n_splits=[3, 5, 10],  # number of cross-validation folds
)

# The tuning loop itself runs in the next cell.

In [None]:
# Grid search over `param_grid` for a MERF model (random-forest fixed effects plus a random
# intercept per cluster), scored by K-fold cross-validated MSE on the training table.
# One row per parameter combination is collected in `results` and written to CSV at the end.

RANDOM_SEED = 42         # fixed seed so the forests (and therefore the search) are reproducible
MAX_FOLDS_RECORDED = 10  # the results table has per-fold columns <metric>_1 .. <metric>_10


def _per_fold_columns(prefix, values, width=MAX_FOLDS_RECORDED):
    """Return {"<prefix>_<i>": values[i-1]} for i in 1..width; folds that were not run are None."""
    return {
        f"{prefix}_{fold + 1}": (values[fold] if fold < len(values) else None)
        for fold in range(width)
    }


train_set = full_train_tax
columns_to_drop = ['Unnamed: 0', 'SampleID','all_samples', 'character_id', 'timepoint', 'time', 'x_t', 'outcome_BMI_fnl']

# Create training features
X = train_set.drop(columns=columns_to_drop, errors='ignore')
# The forest is trained on numeric columns only. Select them ONCE so that fit and predict see
# exactly the same features (previously fit used the numeric subset but predict got every column).
X_numeric = X.select_dtypes(include=[np.number])
y = train_set['outcome_BMI_fnl'].to_numpy()       # outcome as a numeric array
clusters = train_set['all_samples'].to_numpy()    # cluster id for each row
z = np.ones((train_set.shape[0], 1))              # random-effects design: intercept only


best_score = float('inf')
best_params = {}   # replaced by the winning parameter tuple (same order as param_grid keys)
results = []       # one dict per parameter combination

# Loop through all possible combinations of parameters
for params in itertools.product(*param_grid.values()):
    n_estimators, max_depth, min_samples_split, max_iter, n_splits = params
    print(f"Combination: {params}\n")
    scores = []      # per-fold MSE
    prev = []        # per-fold residual variance / outcome variance
    ptev = []        # per-fold 1 - (residual variance / outcome variance)
    oob_scores = []  # per-fold out-of-bag score of the forest, in percent

    # K-fold cross-validation with variable n_splits.
    # NOTE(review): KFold splits by row, so rows sharing a cluster id can fall into both the
    # training and the held-out part of a fold; consider GroupKFold if that is not intended.
    kf = KFold(n_splits=n_splits)
    for train_index, test_index in kf.split(X_numeric):
        X_train, X_test = X_numeric.iloc[train_index], X_numeric.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # Fold-local names, so the notebook-level `clusters_train` built for the full
        # training set is not overwritten by the last fold.
        fold_clusters_train = pd.Series(clusters[train_index])
        fold_clusters_test = pd.Series(clusters[test_index])
        z_train, z_test = z[train_index], z[test_index]

        model = MERF(
            # Fixed-effects learner
            fixed_effects_model=RandomForestRegressor(
                n_estimators=n_estimators,            # number of trees
                max_depth=max_depth,                  # None = grow until the split rule stops
                min_samples_split=min_samples_split,  # float = fraction of samples needed to split
                n_jobs=1,
                oob_score=True,                       # needed for forest.oob_score_ below
                random_state=RANDOM_SEED,
            ),
            # Early stopping on the generalized log-likelihood (GLL) is disabled,
            # so every fit runs the full `max_iterations` iterations.
            gll_early_stop_threshold=None,
            max_iterations=max_iter,
        )
        model.fit(X_train, z_train, fold_clusters_train, y_train)
        y_pred = model.predict(X_test, z_test, fold_clusters_test)
        scores.append(np.mean((y_pred - y_test) ** 2))  # MSE

        # Variance decomposition on the held-out fold.
        total_variance = np.var(y_test)                   # variance of the OBSERVED outcome
        random_effect_variance = np.var(y_test - y_pred)  # variance of the residuals
        fixed_effect_variance = total_variance - random_effect_variance

        if total_variance > 0:
            ptev.append(fixed_effect_variance / total_variance)
            prev.append(random_effect_variance / total_variance)
        else:
            # Constant outcome in this fold: ratios are undefined, record 0 as before.
            ptev.append(0.0)
            prev.append(0.0)

        # Out-of-bag R^2 of the fitted forest, expressed in percent
        forest = model.trained_fe_model
        oob_score = round(forest.oob_score_*100, 1)
        oob_scores.append(oob_score)

        # Running means of ptev/prev over the folds so far, plus this fold's OOB score
        print(f"Combination, ptev: {np.mean(ptev):.4f}, prev: {np.mean(prev):.4f}, OOB Score: {oob_score:.4f}")

    # Aggregate over folds for this parameter combination
    mean_score = np.mean(scores)
    mean_prev = np.mean(prev)
    mean_ptev = np.mean(ptev)
    mean_oob_score = np.mean(oob_scores)
    if mean_score < best_score:
        best_score = mean_score
        best_params = params

    # Column order matches the previously written CSV layout.
    result_dict = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'max_iter': max_iter,
        'n_splits': n_splits,
        **_per_fold_columns('mse_score', scores),
        'mean_mse_score': mean_score,
        **_per_fold_columns('prev', prev),
        'mean_prev': mean_prev,
        **_per_fold_columns('ptev', ptev),
        'mean_ptev': mean_ptev,
        **_per_fold_columns('oob', oob_scores),
        'oob_score': mean_oob_score,
    }
    results.append(result_dict)

print("Best parameters:", best_params)
print("Best score:", best_score)

# Convert the results list to a DataFrame and save it to a CSV file
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs/3.functional"
# Make sure the target directory exists so the results of a long search are not lost on save.
os.makedirs(df_dir, exist_ok=True)
results_df = pd.DataFrame(results)
results_df.to_csv(f'{df_dir}/dec6_magggie_params_fine_tuning_results_raw_functional.csv', index=False)