In [1]:
import pandas as pd
import os
import re
import numpy as np
from merf import MERF
import matplotlib.pyplot as plt
import seaborn as sns
import itertools 
sns.set_context("poster")
from sklearn.ensemble import RandomForestRegressor
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (11,8)
from merf.merf import MERF
from sklearn.model_selection import train_test_split, KFold
from merf.viz import plot_merf_training_stats
from em_utils import *
# Output locations: `output_dir` receives the figures saved below.
# `df_dir` is not written to in the cells visible here; it is presumably the
# target for exported tables (TODO confirm), so it is created as well to keep
# any later write from failing on a missing directory.
output_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_plots/2.taxa"
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs/2.taxa"
for _result_dir in (output_dir, df_dir):
    # exist_ok=True makes this cell safe to re-run.
    os.makedirs(_result_dir, exist_ok=True)

In [None]:
# Folder containing the MERF-ready genus-level taxa tables
data_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/taxa/aim2_transformed/genus/merf_ready_sets/"

# Load every table into a dict keyed by its file stem (read order matches the
# listing below), then expose each one under its usual notebook name.
_table_stems = ('test_tax_no_na', 'train_tax_no_na', 'full_train_tax', 'full_test_tax')
_tables = {
    stem: pd.read_csv(os.path.join(data_dir, stem + '.csv'))
    for stem in _table_stems
}
test_tax_no_na = _tables['test_tax_no_na']
train_tax_no_na = _tables['train_tax_no_na']
full_train_tax = _tables['full_train_tax']
full_test_tax = _tables['full_test_tax']

# Sanity check: report (rows, columns) for each table that was read
for _stem in _table_stems:
    print(f"{_stem} shape:", _tables[_stem].shape)

In [None]:
# Build the MERF inputs (predictors X, random-effects design Z, cluster ids,
# outcome Y) for the training and test sets.
#
# The same column rules are applied to both sets. Previously the
# 'Unnamed: 0' / 'character_id' / 'timepoint' drop was applied to the test set
# only, and the models were fitted on the numeric columns of X but asked to
# predict on every column of X_new, so predict() could receive a different
# feature set from fit().
outcome_col = 'outcome_BMI_fnl'
cluster_col = 'all_samples'
required_non_predictors = ['t', outcome_col, cluster_col]
# Bookkeeping columns that may or may not be present, depending on the export.
optional_non_predictors = ['Unnamed: 0_tax', 'x_t', 'Unnamed: 0', 'character_id', 'timepoint']

print("---------- Select predictors for training set ----------")
train_set = full_train_tax
X = (
    train_set
    .drop(columns=required_non_predictors)
    .drop(columns=optional_non_predictors, errors='ignore')
)
Y = train_set[outcome_col].to_numpy()  # Outcome as a numpy array
clusters_train = train_set[cluster_col].to_numpy()  # Get ID variables
Z = np.ones((train_set.shape[0], 1))  # Create random effects matrix with ones

print("---------- Select predictors for test set ----------")
test_set = full_test_tax
# The models are fitted on the numeric columns of X only, so the test
# predictors are restricted to exactly those columns, in the same order.
model_features = X.select_dtypes(include=[np.number]).columns
missing_in_test = model_features.difference(test_set.columns)
if len(missing_in_test) > 0:
    raise KeyError(
        f"Test set is missing {len(missing_in_test)} training feature(s): "
        f"{list(missing_in_test)}"
    )
# Cast to the training dtypes so both sets are represented identically.
X_new = test_set[model_features].astype(X.dtypes[model_features])

Y_new = test_set[outcome_col].to_numpy()  # Outcome as a numpy array
clusters_new = pd.Series(test_set[cluster_col])  # Cluster ids as a pandas Series
# Create random effects matrix with ones
Z_new = np.ones((len(X_new), 1))

In [None]:
# Hyperparameter sets for the four MERF models compared in this notebook.
# Every value is a one-element list; only the first element is used.
best_mse_param_grid = {
    'n_estimators': [300],
    'max_depth': [None],
    'min_samples_split': [5],
    'max_iter': [2]
}

lowest_prev_param_grid = {
    'n_estimators': [100],
    'max_depth': [None],
    'min_samples_split': [5],
    'max_iter': [3]
}

lowest_ptev_param_grid = {
    'n_estimators': [1],
    'max_depth': [1],
    'min_samples_split': [2],
    'max_iter': [3]
}

tuned_param_grid = {
    'n_estimators': [300],
    'max_depth': [100],
    'min_samples_split': [2],
    'max_iter': [3]
}

# Fixed seed so the random forests give the same result on every
# Restart & Run All.
RANDOM_STATE = 42


def build_merf(param_grid, random_state=RANDOM_STATE):
    """Create an unfitted MERF model from one of the parameter dictionaries above.

    Parameters
    ----------
    param_grid : dict
        Must contain 'n_estimators', 'max_depth', 'min_samples_split' and
        'max_iter', each mapped to a list whose first element is used.
    random_state : int or None
        Seed passed to the random forest.

    Returns
    -------
    MERF
        Model with a RandomForestRegressor as its fixed-effects learner,
        no early-stopping threshold, and 'max_iter' as its iteration cap.
    """
    forest = RandomForestRegressor(
        n_estimators=param_grid['n_estimators'][0],            # Number of trees in the forest
        max_depth=param_grid['max_depth'][0],                  # Maximum depth of each tree
        min_samples_split=param_grid['min_samples_split'][0],  # Minimum samples required to split a node
        n_jobs=1,
        oob_score=True,
        random_state=random_state,
    )
    return MERF(
        fixed_effects_model=forest,
        gll_early_stop_threshold=None,  # No early stopping threshold set
        max_iterations=param_grid['max_iter'][0],
    )


mse_mrf = build_merf(best_mse_param_grid)
prev_merf = build_merf(lowest_prev_param_grid)
ptev_merf = build_merf(lowest_ptev_param_grid)
tuned_model = build_merf(tuned_param_grid)

# The models are fitted once, in the next cell. This cell used to fit them as
# well, so every model was trained twice with identical arguments. That doubled
# the runtime, and MERF appears to append to its training histories on each
# fit() call (TODO confirm against the installed merf version), which would
# show both runs in the training-stat plots.

In [None]:
print("---------- RUN MERF 🌱 ----------")
# Every model is trained on the same inputs: the numeric predictors, the
# all-ones random-effects matrix, the cluster ids as a Series, and the outcome.
train_features = X.select_dtypes(include=[np.number])
train_clusters = pd.Series(clusters_train)

mrf_mse = mse_mrf.fit(train_features, Z, train_clusters, Y)
mrf_prev = prev_merf.fit(train_features, Z, train_clusters, Y)
mrf_ptev = ptev_merf.fit(train_features, Z, train_clusters, Y)
tuned_model = tuned_model.fit(train_features, Z, train_clusters, Y)

In [None]:
# Draw the MERF training statistics for the best-MSE model and save the figure
plot_merf_training_stats(mrf_mse)
mse_stats_path = os.path.join(output_dir, 'tuned_high_mse_merf_raw_metrics_tax.png')
plt.savefig(mse_stats_path, dpi=300, bbox_inches='tight')

In [None]:
# Draw the MERF training statistics for the "prev" model and save the figure
plot_merf_training_stats(mrf_prev)
prev_stats_path = os.path.join(output_dir, 'tuned_low_prev_merf_raw_metrics_taxa.png')
plt.savefig(prev_stats_path, dpi=300, bbox_inches='tight')

In [None]:
# Draw the MERF training statistics for the "ptev" model and save the figure
plot_merf_training_stats(mrf_ptev)
ptev_stats_path = os.path.join(output_dir, 'tuned_low_ptev_merf_raw_metrics_taxa.png')
plt.savefig(ptev_stats_path, dpi=300, bbox_inches='tight')

In [None]:
# Draw the MERF training statistics for the tuned model and save the figure
plot_merf_training_stats(tuned_model)
tuned_stats_path = os.path.join(output_dir, 'taxa_merf_raw_metrics_tuned_taxa.png')
plt.savefig(tuned_stats_path, dpi=300, bbox_inches='tight')

In [10]:
# Score the held-out test set with each fitted model.
# 'x_t' is dropped if present; the drop is a no-op when the column is absent.
X_new = X_new.drop(columns=['x_t'], errors='ignore')
y_hat_new_mse, y_hat_new_prev, y_hat_new_ptev, y_hat_new_tuned = (
    fitted_model.predict(X_new, Z_new, clusters_new)
    for fitted_model in (mrf_mse, mrf_prev, mrf_ptev, tuned_model)
)

In [None]:
# Predicted vs. actual outcome on the test set for the best-MSE model
mse_scatter_name = 'taxa_predicted_vs_actual_mse_tuned.png'
plot_predicted_vs_actual(
    y_hat_new_mse,
    Y_new,
    output_dir,
    mse_scatter_name,
    best_mse_param_grid,
    plot_color='#F88F79',
)

In [None]:
# Predicted vs. actual outcome on the test set for the "prev" model.
# The call now follows the same argument order as the sibling cells:
# (predictions, actuals, output_dir, filename, param_grid, colour).
# output_dir was missing before, which shifted every later argument one slot.
# The "prev" parameter grid is passed, as the other cells pass theirs.
plot_predicted_vs_actual(
    y_hat_new_prev,
    Y_new,
    output_dir,
    'taxa_predicted_vs_actual_highest_prev_tuned.png',
    lowest_prev_param_grid,
    '#F0F879',
)

In [None]:
# Predicted vs. actual outcome on the test set for the "ptev" model.
# The file prefix is 'taxa_' like the other taxa figures; it was 'clinical_',
# apparently carried over from another notebook (NOTE(review): update anything
# that still reads the old file name).
plot_predicted_vs_actual(
    y_hat_new_ptev,
    Y_new,
    output_dir,
    'taxa_predicted_vs_actual_highest_ptev_tuned.png',
    lowest_ptev_param_grid,
    '#ACF0F8',
)

In [None]:

# Predicted vs. actual outcome on the test set for the tuned model.
# The file prefix is 'taxa_' like the other taxa figures; it was 'clinical_',
# apparently carried over from another notebook (NOTE(review): update anything
# that still reads the old file name).
plot_predicted_vs_actual(
    y_hat_new_tuned,
    Y_new,
    output_dir,
    'taxa_predicted_vs_actual_tuned.png',
    tuned_param_grid,
    '#C9ACF8',
)

In [None]:
# mse: feature importances from the forest inside the fitted best-MSE MERF
mse_forest = mrf_mse.trained_fe_model
mse_feature_names, mse_feature_importances = (
    mse_forest.feature_names_in_,
    mse_forest.feature_importances_,
)
plot_top_20_feature_importances(
    mse_feature_names,
    mse_feature_importances,
    output_dir,
    'mse_feature_importances_taxa',
    '#F88F79',
)

In [None]:
# prev: feature importances from the forest inside the fitted "prev" MERF
prev_forest = prev_merf.trained_fe_model
prev_feature_names, prev_feature_importances = (
    prev_forest.feature_names_in_,
    prev_forest.feature_importances_,
)
plot_top_20_feature_importances(
    prev_feature_names,
    prev_feature_importances,
    output_dir,
    'prev_feature_importances_taxa',
    '#F0F879',
)

In [None]:
# ptev: feature importances from the forest inside the fitted "ptev" MERF
ptev_forest = ptev_merf.trained_fe_model
ptev_feature_names, ptev_feature_importances = (
    ptev_forest.feature_names_in_,
    ptev_forest.feature_importances_,
)
plot_top_20_feature_importances(
    ptev_feature_names,
    ptev_feature_importances,
    output_dir,
    'ptev_feature_importances_taxa',
    '#ACF0F8',
)

In [None]:
# Tuned: feature importances from the forest inside the fitted tuned MERF
forest = tuned_model.trained_fe_model
feature_names, feature_importances = (
    forest.feature_names_in_,
    forest.feature_importances_,
)
plot_top_20_feature_importances(
    feature_names,
    feature_importances,
    output_dir,
    'tuned_feature_importances_taxa',
    '#C9ACF8',
)