In [1]:
import pandas as pd
import os
import re
import numpy as np
from merf import MERF
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context("poster")
from sklearn.ensemble import RandomForestRegressor
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (11,8)
from merf.merf import MERF
from sklearn.model_selection import train_test_split, KFold
from merf.viz import plot_merf_training_stats


In [2]:
# Run configuration shared by the modelling cells.
SEED = 42  # fixed seed so the shuffled CV folds are identical on every Restart & Run All
max_iter = 200  # passed to MERF as max_iterations
# shuffle=True without random_state draws different folds on every run;
# pinning random_state makes the split reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)
# Empty score accumulators, one per model variant.
# NOTE(review): presumably filled with per-model error scores later; they are not
# populated in the cells visible here.
mse_rf = []
mse_rfplus = []
mse_mrf = []
mse_mrf_id = []
mse_mrf_id_both = []

In [None]:
# Output locations: figures go under output_dir, exported tables under df_dir.
output_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_plots"
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs"
# Create every directory this notebook writes into, not just output_dir:
# figures are saved into the "1.clinical" subfolder of output_dir and CSV exports
# go to df_dir, and neither savefig nor to_csv creates missing parent folders.
for _dir in (output_dir, os.path.join(output_dir, "1.clinical"), df_dir):
    os.makedirs(_dir, exist_ok=True)


def read_data(directory, filename):
    """
    Load a CSV file into a DataFrame.

    Args:
        directory (str): Folder that contains the file.
        filename (str): Name of the CSV file inside `directory`.

    Returns:
        pd.DataFrame: Contents of the file as parsed by `pd.read_csv` defaults.
    """
    return pd.read_csv(os.path.join(directory, filename))

print("---------- Read metadata ----------")
m1_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2"
# Load the four clinical tables (read order: test, train, transformed full, untransformed full).
test, train, full, full_raw = (
    read_data(m1_dir, csv_name)
    for csv_name in (
        "a2_test_samples_standard_clinical.csv",
        "a2_train_samples_standard_clinical.csv",
        "a2_meta_Transformed_standard_clinical.csv",
        "a2_meta_not_Transformed_standard_clinical.csv",
    )
)

# Sanity checks: the untransformed full table, the train split and the test split
# should all expose the same columns in the same order (prints True/False).
raw_cols, train_cols, test_cols = (
    frame.columns.to_list() for frame in (full_raw, train, test)
)
print(raw_cols == train_cols)
print(train_cols == test_cols)

Make long format 

In [None]:
# Process metadata to long format
def make_long(wide_data):
    """
    Reshape a wide table with per-timepoint columns into one row per
    (identifier columns, time) combination.

    Columns whose names end in `_BL`, `_6m` or `_12m` are treated as repeated
    measurements; every other column is treated as an identifier. The suffix is
    converted to a numeric `time` (BL -> 0, 6m -> 6, 12m -> 12) and the text
    before the suffix becomes the name of the output measurement column.

    Args:
        wide_data (pd.DataFrame): Input DataFrame in wide format.

    Returns:
        pd.DataFrame: Long-format frame with the identifier columns, `time`,
        and one column per measurement type. Column labels are plain strings.
    """
    # Partition the columns in a single pass: suffixed ones are measurements.
    timepoint_suffix = re.compile(r'_(BL|6m|12m)$')
    id_vars, value_vars = [], []
    for column in wide_data.columns:
        bucket = value_vars if timepoint_suffix.search(column) else id_vars
        bucket.append(column)

    # Stack every measurement column into (measurement_time, value) pairs.
    melted = wide_data.melt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name="measurement_time",
        value_name="value"
    )

    # Split e.g. "outcome_BMI_fnl_6m" into the measurement name and its timepoint,
    # then translate the timepoint label into months.
    name_parts = melted['measurement_time'].str.extract(r'(.+)_(BL|6m|12m)')
    melted['measurement_type'] = name_parts[0]
    melted['time'] = name_parts[1].map({'BL': 0, '6m': 6, '12m': 12})
    melted = melted.drop(columns=['measurement_time'])

    # Spread measurement types back out into columns. pivot_table uses its default
    # aggregation, so duplicate (ids, time, type) entries are combined by it.
    reshaped = melted.pivot_table(
        index=id_vars + ['time'],
        columns='measurement_type',
        values='value'
    ).reset_index()

    # Remove the leftover columns-axis name and force plain string labels.
    reshaped.columns.name = None
    reshaped.columns = [str(label) for label in reshaped.columns]

    return reshaped


# Reshape each metadata table to long format, then tag every row with a
# "<subject_id>.<time>" key stored in the x_t column.
print("---------- Convert metadata to long format ----------")
full_long, train_long, test_long = (
    make_long(wide_frame) for wide_frame in (full_raw, train, test)
)
for long_frame in (full_long, train_long, test_long):
    subject_text = long_frame['subject_id'].astype(str)
    time_text = long_frame['time'].astype(str)
    long_frame['x_t'] = subject_text + '.' + time_text

print("train data outcome_BMI_fnl values:", train_long['outcome_BMI_fnl'])
for label, long_frame in (
    ("Full columns after transformation:", full_long),
    ("Test columns after transformation:", test_long),
):
    print(label, long_frame.columns.to_list())

Define columns to drop if necessary 

In [5]:
# Bookkeeping columns that must not reach the models.
columns_to_drop = ['Unnamed: 0', 'cohort_number', 'record_id', 'x_t']
# errors='ignore' skips names that are absent from a given frame, so each frame is
# checked against its OWN columns. The previous filter for full_long tested
# membership in the wide `full` table, which never contains 'x_t', so 'x_t' was
# silently left in full_long.
full_long = full_long.drop(columns=columns_to_drop, errors='ignore')
train_long = train_long.drop(columns=columns_to_drop, errors='ignore')
test_long = test_long.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Show which columns remain in the full and test long tables.
for label, long_frame in (
    ("Final columns after drop:", full_long),
    ("Final test columns after drop:", test_long),
):
    print(label, long_frame.columns.to_list())

In [None]:
# Remove every row that contains a missing value from each long table.
test_long, train_long, full_long = (
    long_frame.dropna() for long_frame in (test_long, train_long, full_long)
)

# Split the full long table by subject: rows whose subject appears in the
# train split vs. rows whose subject appears in the test split.
in_train_split = full_long['subject_id'].isin(train_long['subject_id'])
in_test_split = full_long['subject_id'].isin(test_long['subject_id'])
raw_train = full_long.loc[in_train_split]
raw_test = full_long.loc[in_test_split]

print("raw_train shape = ", raw_train.shape)
print("raw_test shape = ", raw_test.shape)

print("test_long shape = ", test_long.shape)
print("train_long shape = ", train_long.shape)

print("---------- raw train and tax ----------")
print("full shape = ", full_long.shape)

In [None]:
print("---------- Select predictors for training set ----------")
train_set = raw_train
n_train_rows = train_set.shape[0]

# Fixed-effect predictors: everything except the outcome and the subject identifier.
X = train_set.drop(['outcome_BMI_fnl', 'subject_id'], axis=1)
# Outcome as a 1-D numpy array.
Y = train_set['outcome_BMI_fnl'].to_numpy()
# Cluster label per row (the subject identifier).
clusters_train = train_set['subject_id'].to_numpy()
# Random-effects design matrix: a single column of ones (random intercept only).
Z = np.ones((n_train_rows, 1))
# Time values as floats.
time = train_set['time'].astype(float).to_numpy()

# Check the final columns
print("Final columns after drop:", X.columns.to_list())

In [None]:
print("---------- Select predictors for test set ----------")
test_set = raw_test

# Build the test predictors with the same columns, column order and dtypes as X.
test_predictors = test_set.drop(['outcome_BMI_fnl', 'subject_id'], axis=1)
X_new = test_predictors[X.columns].astype(X.dtypes)

# Outcome as a 1-D numpy array.
Y_new = test_set['outcome_BMI_fnl'].to_numpy()
# Cluster labels as a pandas Series.
clusters_new = pd.Series(test_set['subject_id'])
# Random-effects design matrix: a single column of ones, one row per test row.
Z_new = np.ones((X_new.shape[0], 1))
# Time values as floats.
time_new = test_set['time'].astype(float).to_numpy()

In [None]:
print("---------- RUN MERF BASIC🌱 ----------")
# Fixed-effects learner: a 100-tree random forest with out-of-bag scoring enabled.
basic_forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
# gll_early_stop_threshold=None together with max_iterations=max_iter is what
# this cell passes to control the length of the MERF fit.
mrf = MERF(
    fixed_effects_model=basic_forest,
    gll_early_stop_threshold=None,
    max_iterations=max_iter,
)

# Only numeric predictor columns are handed to the model.
X_numeric = X.select_dtypes(include=[np.number])
mrf.fit(X_numeric, Z, pd.Series(clusters_train), Y)

In [None]:
# Draw the training-statistics figure for the basic MERF model and save it as a
# 300-dpi PNG with tight bounding box.
# NOTE: savefig does not create missing folders; the "1.clinical" subfolder of
# output_dir has to exist before this cell runs.
plot_merf_training_stats(mrf)
plt.savefig(os.path.join(output_dir, '1.clinical/merf_raw_metrics.png'), dpi=300, bbox_inches='tight')


In [None]:
# Predict using the fitted model.
# Training selected predictors by numeric dtype, so dropping only 'x_t' here could
# leave the prediction frame with a different column set. Selecting exactly the
# columns the fitted forest recorded keeps train and predict features identical
# (a missing column raises a KeyError instead of a shape/name mismatch later).
# This selection is idempotent, so the cell can be re-run safely.
fitted_features = list(mrf.trained_fe_model.feature_names_in_)
X_new = X_new[fitted_features]
y_hat_new = mrf.predict(X_new, Z_new, clusters_new)
print(y_hat_new)

In [98]:
# Test-set metrics for the basic MERF: Pearson correlation, RMSE and R-squared.
residuals = Y_new - y_hat_new
rmse = np.sqrt(np.mean(residuals**2))
correlation = np.corrcoef(Y_new, y_hat_new)[0,1]
print(f"Correlation between actual and predicted values: {correlation:.4f}")
# R-squared as 1 - (residual sum of squares / total sum of squares); it can be
# negative when predictions are worse than predicting the mean of Y_new.
ss_residual = np.sum(residuals**2)
ss_total = np.sum((Y_new - np.mean(Y_new))**2)
r2 = 1 - (ss_residual / ss_total)
print(f"Root Mean Squared Error: {rmse:.4f}")
print(f"R-squared Score: {r2:.4f}")

# Out-of-bag score of the fitted fixed-effects forest, as a percentage string.
forest = mrf.trained_fe_model
oob_percent = round(forest.oob_score_*100, 1)
oob = str(oob_percent)  # percent variation
print(f" % Variation in training set: {oob}")

Correlation between actual and predicted values: 0.2777
Root Mean Squared Error: 5.1978
R-squared Score: -0.0847
 % Variation: 65.8


In [92]:
# Save all the components of forest to a csv file
# forest.__dict__ maps attribute names to values; orient='index' makes each
# attribute name a row label, so the frame has one row per attribute (not a single row).
df = pd.DataFrame.from_dict(forest.__dict__, orient='index')
# df_dir is re-declared here with the same value used in the setup cell.
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs"
# NOTE: to_csv does not create df_dir; the folder must already exist.
df.to_csv(os.path.join(df_dir,'merf_basic_raw_forest_components.csv'))

R-squared analogue for the mixed model:
possibly PREV (see the PTEV/PREV formulas in the metrics cell below).

In [None]:
# Extract feature names and importances from `forest` (the fitted fixed-effects model
# assigned in an earlier cell).
feature_names = forest.feature_names_in_
feature_importances = forest.feature_importances_

# Plotting: bar chart of importances, most important feature first.
plt.figure(figsize=(10, 8))
# argsort is ascending; reversing it orders features from highest to lowest importance.
sorted_indices = np.argsort(feature_importances)[::-1]
plt.bar(np.array(feature_names)[sorted_indices], np.array(feature_importances)[sorted_indices], color='skyblue')
plt.xlabel('Feature Names')
plt.ylabel('Feature Importances')
plt.title('Feature Importances')
plt.xticks(rotation=90)  # vertical tick labels
plt.tight_layout()
# Save before plt.show(); the "1.clinical" subfolder of output_dir must already exist.
plt.savefig(os.path.join(output_dir, '1.clinical/merf_raw_feature_importances.png'), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ~~~~~~~~~ Metrics Generation ~~~~~~~~~~ #
# compute the ptev and prev
# NOTE(review): `self` and `sigma_g` are not defined in any cell visible in this
# notebook, so this cell raises NameError when run at notebook scope. It reads like
# a fragment copied from inside a class method — TODO confirm where m, sigma_g,
# sigma_b and sigma_e are meant to come from before relying on it.
# As written:
#   sigma_fixed = m * sigma_g
#   ptev = 100 * (sigma_fixed^2 + sigma_b^2) / (sigma_fixed^2 + sigma_b^2 + sigma_e^2)
#   prev = 100 * sigma_b^2 / (sigma_fixed^2 + sigma_b^2)
sigma_fixed = self.m * sigma_g
ptev = 100 * ((sigma_fixed ** 2 + self.sigma_b ** 2) / (sigma_fixed ** 2 + self.sigma_b ** 2 + self.sigma_e ** 2))
prev = 100 * (self.sigma_b ** 2 / (sigma_fixed ** 2 + self.sigma_b ** 2))


In [None]:
# Create scatter plot of predicted vs actual values (actual on x, predicted on y)
plt.figure(figsize=(8, 6))
plt.scatter(Y_new, y_hat_new, alpha=0.5)
plt.xlabel('Actual BMI Values')
plt.ylabel('Clinical Predicted Values')
plt.title('Predicted vs Actual Values with Trend Line')

# Add trend line: degree-1 polynomial fit of predicted on actual values,
# drawn as a red dashed line evaluated at the actual values.
z = np.polyfit(Y_new, y_hat_new, 1)
p = np.poly1d(z)
plt.plot(Y_new, p(Y_new), "r--", alpha=0.8)
plt.grid(True, alpha=0.3)

# Save plot as PNG (only a PNG is written here; no PDF is produced).
# The "1.clinical" subfolder of output_dir must already exist.
plt.savefig(os.path.join(output_dir, '1.clinical/clinical_predicted_vs_actual.png'), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
print("---------- RUN MERF with participant RE 🌱🌸 ----------")
# Training inputs for the MERF variant that uses the participant number in both
# the cluster labels and the random-effects design matrix.
train_set = train_long
# Integer formed from the last three characters of each subject_id.
# NOTE(review): assumes subject_id is a string ending in three digits that are
# unique per participant — TODO confirm.
subject_number = train_set['subject_id'].apply(lambda s: int(s[-3:]))

# Fixed-effect predictors: everything except the outcome, subject id and time.
X_train = train_set.drop(['outcome_BMI_fnl', 'subject_id', 'time'], axis=1).to_numpy()
# Two-column random-effects design: intercept (ones) and the participant number.
# NOTE(review): this makes the numeric ID a random-slope covariate — confirm that
# is intended rather than e.g. time.
Z_train = np.array((np.ones(len(train_set)), subject_number)).T
# Cluster labels: the participant number cast to float.
clusters_train = pd.Series(subject_number).astype(float)
# Outcome as a 1-D numpy array.
y_train = train_set['outcome_BMI_fnl'].to_numpy()

In [None]:
# Report the shapes and cluster labels that are about to be passed to MERF.fit.
for summary_line in (
    f"Dimensions of X_train: {X_train.shape}",
    f"Dimensions of Z_train: {Z_train.shape}",
    f"Number of unique inputs for clusters_train: {clusters_train.nunique()}",
    f"Inputs to clusters_train: {clusters_train}",
    f"Dimensions of y_train: {y_train.shape}",
):
    print(summary_line)

In [None]:
# Participant-RE MERF: 100-tree forest with out-of-bag scoring as the fixed-effects
# learner, configured with gll_early_stop_threshold=None and max_iterations=max_iter.
id_forest = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
mrf_id_fe = MERF(
    fixed_effects_model=id_forest,
    gll_early_stop_threshold=None,
    max_iterations=max_iter,
)
mrf_id_fe.fit(X_train, Z_train, clusters_train, y_train)

In [58]:
# Fixed-effects forest of the participant-RE MERF model.
forest = mrf_id_fe.trained_fe_model
# Out-of-bag score as a percentage string; computed here but not printed in this cell.
oob = str(round(forest.oob_score_*100, 1))  # percent variation
# Save all the components of forest to a csv file
# forest.__dict__ maps attribute names to values; orient='index' makes each
# attribute name a row label, so the frame has one row per attribute (not a single row).
df = pd.DataFrame.from_dict(forest.__dict__, orient='index')
# df_dir is re-declared here with the same value used in the setup cell.
df_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_dfs"
# NOTE: to_csv does not create df_dir; the folder must already exist.
df.to_csv(os.path.join(df_dir,'merf_id_forest_components.csv'))

In Mixed Effects Random Forests (MERF), the Generalized Log-Likelihood (GLL) is used to evaluate the quality of the model at each iteration; the plot below shows the training statistics of the fit.

In [None]:
# Draw the training-statistics figure for the participant-RE model, with
# num_clusters_to_plot set to 100. The figure is displayed only, not saved.
plot_merf_training_stats(mrf_id_fe, num_clusters_to_plot=100)

Test the MERF with participant random effects (RE) on the test data

In [None]:
# Test data: build the same inputs as for training (predictors, random-effects
# design matrix, cluster labels) from the test split.
test_set = test_long
# Integer formed from the last three characters of each subject_id.
subject_number_test = test_set['subject_id'].apply(lambda s: int(s[-3:]))
X_test = test_set.drop(['outcome_BMI_fnl', 'subject_id', 'time'], axis=1).to_numpy()
Z_test = np.array((np.ones(len(test_set)), subject_number_test)).T
clusters_test = pd.Series(subject_number_test).astype(float)

# Make predictions
yhat_merf = mrf_id_fe.predict(X_test, Z_test, clusters_test)

# Evaluate performance
y_test = test_set['outcome_BMI_fnl'].to_numpy()
residuals_merf = y_test - yhat_merf

# This quantity is the ROOT mean squared error (sqrt of the mean squared residual);
# it was previously printed under the label "Mean Squared Error".
rmse_merf = np.sqrt(np.mean(residuals_merf**2))
mse_merf = rmse_merf  # legacy name kept so existing references keep the same value (an RMSE)
mae_merf = np.mean(np.abs(residuals_merf))
corr_merf = np.corrcoef(y_test, yhat_merf)[0, 1]
# R-squared as 1 - SS_res/SS_tot, the same definition used for the basic MERF
# model earlier in this notebook, so the two models are comparable. The squared
# Pearson correlation that was reported here before can never be negative and
# overstates out-of-sample fit; it is still recoverable as corr_merf**2.
r2_merf = 1 - (np.sum(residuals_merf**2) / np.sum((y_test - np.mean(y_test))**2))

print(f"Root Mean Squared Error (MERF): {rmse_merf}")
print(f"Mean Absolute Error (MERF): {mae_merf}")
print(f"Correlation between actual and predicted values (MERF): {corr_merf}")
print(f"R-Squared (MERF): {r2_merf}")

In [None]:
print("---------- RUN MERF with time RE 🌱🌸🌱🌸 ----------")
train_set = train_long
# Fixed-effects learner for this variant: a 100-tree forest (no out-of-bag scoring requested).
time_forest = RandomForestRegressor(n_estimators=100, n_jobs=-1)
mrf_time_fe = MERF(
    fixed_effects_model=time_forest,
    gll_early_stop_threshold=None,
    max_iterations=max_iter,
)

# Fixed effects (X), outcome (y) and cluster labels (subject_id, unmodified).
predictor_frame = train_set.drop(['outcome_BMI_fnl', 'subject_id', 'time'], axis=1)
X_train = predictor_frame.to_numpy()
y_train = train_set['outcome_BMI_fnl'].to_numpy()
clusters_train = train_set['subject_id']
# Random-effects design matrix: an intercept column of ones next to the time column.
intercept_column = np.ones(len(train_set))
Z_train = np.column_stack((intercept_column, train_set['time']))

for summary_line in (
    f"Dimensions of X_train: {X_train.shape}",
    f"Dimensions of Z_train: {Z_train.shape}",
    f"Dimensions of clusters_train: {clusters_train.shape}",
    f"Dimensions of y_train: {y_train.shape}",
):
    print(summary_line)

# Fit MERF model
mrf_time_fe.fit(X_train, Z_train, clusters_train, y_train)


In [None]:
# Draw the training-statistics figure for the time-RE model, with
# num_clusters_to_plot set to 100. The figure is displayed only, not saved.
plot_merf_training_stats(mrf_time_fe, num_clusters_to_plot=100)