In [1]:
import pandas as pd
import os
import numpy as np
from merf import MERF

def read_data(directory, filename):
    """Read CSV data from specified directory and filename"""
    filepath = os.path.join(directory, filename)
    return pd.read_csv(filepath)

print("---------- Read taxonomy data ---------- ")
t_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/taxa/aim2_transformed/"
tax_test = read_data(t_dir, "genus/aim2_clr_testing.csv")
tax_train = read_data(t_dir, "genus/aim2_clr_training.csv") 
tax_full = read_data(t_dir, "genus/clr_taxa_all.csv")

print("---------- Read metadata ----------")
m1_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/data/clinical/transformed/aim2"
test = read_data(m1_dir, "a2_test_samples_standard_clinical.csv")
train = read_data(m1_dir, "a2_train_samples_standard_clinical.csv")
full = read_data(m1_dir, "a2_meta_Transformed_standard_clinical.csv")
full_raw = read_data(m1_dir, "a2_meta_not_Transformed_standard_clinical.csv")

---------- Read taxonomy data ---------- 
---------- Read metadata ----------


In [2]:
# Process Taxa Input data
# FULL dataset
# Split X column into character_id and timepoint
print("---------- Split X column into character_id and timepoint ----------")
tax_full_t = tax_full.copy()
X_LABEL = 'Unnamed: 0'
print(tax_full_t[X_LABEL])

---------- Split X column into character_id and timepoint ----------
0      AAL-144.12m
1       AAL-144.3m
2       AAL-144.6m
3       AAL-144.BL
4       ABR-079.3m
          ...     
590     YSU-097.6m
591     YSU-097.BL
592     ZVU-096.3m
593     ZVU-096.6m
594     ZVU-096.BL
Name: Unnamed: 0, Length: 595, dtype: object


In [3]:
tax_full_t[['character_id', 'timepoint']] = tax_full_t[X_LABEL].str.split('.', expand=True)

In [4]:
# Create time column (assuming create_t_column functionality maps timepoints to numeric values)
def create_t_column(df):
    # Map timepoints to numeric values
    time_map = {'BL': '0', 'V1': '1', 'V2': '2', 'V3': '3', 
                'V4': '6', 'V5': '12', 'V6': '18'}
    return df['timepoint'].map(time_map)

print("---------- Create time column ----------")
tax_full_t['t'] = create_t_column(tax_full_t)

print("---------- Create x_t column combining character_id and t ----------")
tax_full_t['x_t'] = tax_full_t['character_id'] + '.' + tax_full_t['t']

---------- Create time column ----------
---------- Create x_t column combining character_id and t ----------


In [5]:
print("---------- Filter and select columns ----------")
tax = tax_full_t[~tax_full_t['t'].isin(['3', '18'])]
tax = tax.drop(['t', 'timepoint', 'character_id', X_LABEL], axis=1)

---------- Filter and select columns ----------


In [7]:
print("---------- Build training dataset ----------")
train_t = tax_train.copy()
train_t[['character_id', 'timepoint']] = train_t[X_LABEL].str.split('.', expand=True)
train_t['t'] = create_t_column(train_t)
train_t['x_t'] = train_t['character_id'] + '.' + train_t['t']
train_t = train_t[~train_t['t'].isin(['3', '18'])]


---------- Build training dataset ----------


In [8]:
print("---------- Build testing dataset ----------")
test_t = tax_test.copy()
test_t[['character_id', 'timepoint']] = test_t[X_LABEL].str.split('.', expand=True)
test_t['t'] = create_t_column(test_t)
test_t['x_t'] = test_t['character_id'] + '.' + test_t['t']
test_t = test_t[~test_t['t'].isin(['3', '18'])]

---------- Build testing dataset ----------


In [9]:
print("---------- Clean up ----------")
del tax_test, tax_train, tax, tax_full

---------- Clean up ----------


In [10]:
print(full_raw.columns.to_list())
print("---")
print(train.columns.to_list())
print("---")
print(test.columns.to_list())

['Unnamed: 0', 'record_id', 'subject_id', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'outcome_BMI_fnl_BL', 'Glucose_BL', 'HOMA_IR_BL', 'Insulin_endo_BL', 'HDL_Total_Direct_lipid_BL', 'LDL_Calculated_BL', 'Triglyceride_lipid_BL', 'outcome_BMI_fnl_6m', 'Glucose_6m', 'HOMA_IR_6m', 'Insulin_endo_6m', 'HDL_Total_Direct_lipid_6m', 'LDL_Calculated_6m', 'Triglyceride_lipid_6m', 'outcome_BMI_fnl_12m', 'Glucose_12m', 'HOMA_IR_12m', 'Insulin_endo_12m', 'HDL_Total_Direct_lipid_12m', 'LDL_Calculated_12m', 'Triglyceride_lipid_12m']
---
['Unnamed: 0', 'record_id', 'subject_id', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'outcome_BMI_fnl_BL', 'Glucose_BL', 'HOMA_IR_BL', 'Insulin_endo_BL', 'HDL_Total_Direct_lipid_BL', 'LDL_Calculated_BL', 'Triglyceride_lipid_BL', 'outcome_BMI_fnl_6m', 'Glucose_6m', 'HOMA_IR_6m', 'Insulin_endo_6m', 'HDL_Total_Direct_lipid_6m', 'LDL_Calculated_6m', 'Triglyceride_lipid_6m', 'outcome_BMI_fnl_12m', 'Glucose_12m', 'HOMA_IR_12m', 'Insulin_endo_

In [11]:
print(full_raw.columns.to_list() == train.columns.to_list())
print(train.columns.to_list() == test.columns.to_list())

True
True


In [12]:
import pandas as pd
import re

# Process metadata to long format
def make_long(wide_data):
    """
    Converts a wide-format DataFrame into a long-format DataFrame,
    aligning with the structure produced by the R transformation.
    
    Args:
        wide_data (pd.DataFrame): Input DataFrame in wide format.
    
    Returns:
        pd.DataFrame: Transformed DataFrame in long format.
    """
    # Extract measurement columns and id columns
    id_vars = [col for col in wide_data.columns if not re.search(r'_(BL|6m|12m)$', col)]
    value_vars = [col for col in wide_data.columns if re.search(r'_(BL|6m|12m)$', col)]

    # Melt the DataFrame to long format
    long_data = wide_data.melt(
        id_vars=id_vars,
        value_vars=value_vars,
        var_name="measurement_time",
        value_name="value"
    )

    # Extract measurement type and time from the variable name
    long_data[['measurement_type', 'time']] = long_data['measurement_time'].str.extract(r'(.+)_(BL|6m|12m)')

    # Map time values
    time_mapping = {'BL': 0, '6m': 6, '12m': 12}
    long_data['time'] = long_data['time'].map(time_mapping)

    # Drop the original melted column
    long_data = long_data.drop(columns=['measurement_time'])

    # Pivot the data back to wide format for measurements
    long_data = long_data.pivot_table(
        index=id_vars + ['time'], 
        columns='measurement_type', 
        values='value'
    ).reset_index()

    # Flatten the column MultiIndex from pivot_table
    long_data.columns.name = None
    long_data.columns = [str(col) for col in long_data.columns]

    return long_data


# Apply the function to each dataset
print("---------- Convert metadata to long format ----------")
full_long = make_long(full_raw)
full_long['x_t'] = full_long['subject_id'].astype(str) + '.' + full_long['time'].astype(str)

train_long = make_long(train)
train_long['x_t'] = train_long['subject_id'].astype(str) + '.' + train_long['time'].astype(str)

test_long = make_long(test)
test_long['x_t'] = test_long['subject_id'].astype(str) + '.' + test_long['time'].astype(str)

# Inspect the final column names for test_long
print("Columns after transformation:", test_long.columns.to_list())

---------- Convert metadata to long format ----------
Columns after transformation: ['Unnamed: 0', 'record_id', 'subject_id', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'time', 'Glucose', 'HDL_Total_Direct_lipid', 'HOMA_IR', 'Insulin_endo', 'LDL_Calculated', 'Triglyceride_lipid', 'outcome_BMI_fnl', 'x_t']


In [13]:
print(test_long.columns.to_list())


['Unnamed: 0', 'record_id', 'subject_id', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'time', 'Glucose', 'HDL_Total_Direct_lipid', 'HOMA_IR', 'Insulin_endo', 'LDL_Calculated', 'Triglyceride_lipid', 'outcome_BMI_fnl', 'x_t']


In [14]:
print("---------- Clean up ----------")
del test, train, full_raw, full

print("---------- Select and prepare metadata for merging ----------")
test_meta = test_long[['x_t', 'outcome_BMI_fnl']]
train_meta = train_long[['x_t', 'outcome_BMI_fnl']]

print("---------- Merge training data ----------")
train_tax = train_t.merge(train_meta, on='x_t')
train_tax = train_tax.drop(['x_t', X_LABEL, 'character_id', 'timepoint'], axis=1)

print("---------- Merge testing data ----------")
test_tax = test_t.merge(test_meta, on='x_t')
test_tax = test_tax.drop(['x_t', X_LABEL, 'character_id', 'timepoint'], axis=1)


---------- Clean up ----------
---------- Select and prepare metadata for merging ----------
---------- Merge training data ----------
---------- Merge testing data ----------


In [17]:
# Inspect the columns of the tax_full_t DataFrame
print(tax_full_t.columns.to_list())

# Perform the merge with custom suffixes to prevent 'X.x' and 'X.y'
print("---------- Merge full dataset ----------")
full = tax_full_t.merge(full_long, on='x_t', how='left', suffixes=('_tax', '_long'))

# Check the columns of the merged DataFrame
print("Columns after merge:", full.columns.to_list())

# Define columns to drop after merge (including Unnamed: 0 if present)
columns_to_drop = ['Unnamed: 0', 'X.y', 'X.x', 'randomized_group', 'cohort_number', 'record_id',
                  'subject_id', 'character_id', 'cohort_number', 'age', 'race', 'sex', 
                  'time', 'timepoint', 'HOMA_IR', 'Insulin_endo', 'HDL_Total_Direct_lipid',
                  'Glucose', 'LDL_Calculated', 'Triglyceride_lipid']

# Drop columns only if they exist in the DataFrame (since some may not be present after merge)
full = full.drop([col for col in columns_to_drop if col in full.columns], axis=1)

# Check the final columns
print("Final columns after drop:", full.columns.to_list())

# Clean up unnecessary variables
print("---------- Clean up ----------")
del train_meta, test_meta, test_t, train_t, test_long, train_long, full_long, tax_full_t

['Unnamed: 0', 'g__Parabacteroides_B_862066', 'g__Coprenecus', 'g__Butyricimonas', 'g__Odoribacter_865974', 'g__Alistipes_A_871404', 'g__Paramuribaculum', 'g__Alistipes_A_871400', 'g__Barnesiella', 'g__Coprobacter', 'g__Phocaeicola_A_858004', 'g__Bacteroides_H', 'g__Prevotella', 'g__Paraprevotella', 'g__Methanobrevibacter_A', 'g__DTU012', 'g__Escherichia_710834', 'g__Parasutterella', 'g__Sutterella', 'g__Haemophilus_D_735815', 'g__Enterobacter_B_713587', 'g__Akkermansia', 'g__Eubacterium_O_258270', 'g__Anaerofustis', 'g__Peptococcus', 'g__QAMH01', 'g__Senegalimassilia', 'g__Adlercreutzia_404257', 'g__Slackia_A', 'g__Eggerthella', 'g__CAG.1427', 'g__Gordonibacter', 'g__Collinsella', 'g__Holdemania', 'g__Longibaculum', 'g__Catenibacterium', 'g__Erysipelatoclostridium', 'g__Faecalibacillus', 'g___2', 'g__Holdemanella', 'g__Merdibacter', 'g__Clostridium_AQ', 'g__Amedibacillus', 'g__Longicatena', 'g__Dielma', 'g__Pauljensenia', 'g__Bifidobacterium_388775', 'g__Acidaminococcus', 'g__Phascola

In [18]:
print("---------- Remove NAs and filter by time ----------")
full_no_na = full.dropna()
test_tax_no_na = test_tax.dropna()
train_tax_no_na = train_tax.dropna()

print("---------- Create demo datasets filtered by time ----------")
demo_train = full_no_na[full_no_na['t'].astype(int) < 12]
demo_test = full_no_na[full_no_na['t'].astype(int) == 12]

---------- Remove NAs and filter by time ----------
---------- Create demo datasets filtered by time ----------


In [20]:
print("---------- Select predictors for training set ----------")
train_set = demo_train
X = train_set.drop(['t', 'outcome_BMI_fnl', 'all_samples'], axis=1)
Y = train_set[['outcome_BMI_fnl']]
Y = Y['outcome_BMI_fnl'].to_numpy() # Convert Y to numeric array
clusters_train = train_set['all_samples'].to_numpy() # Get ID variables
Z = np.ones((train_set.shape[0], 1)) # Create random effects matrix with ones
time = train_set['t'].astype(float).to_numpy() # Get time values as numeric array 

---------- Select predictors for training set ----------


In [30]:
print("---------- 🥰🥰🥰🥰 RUN MERF 🥰🥰🥰🥰 ----------")
mrf = MERF()
mrf.fit(X.select_dtypes(include=[np.number]), Z, pd.Series(clusters_train), Y)

---------- 🥰🥰🥰🥰 RUN MERF 🥰🥰🥰🥰 ----------


INFO     [merf.py:307] Training GLL is 168.59335666183713 at iteration 1.
INFO     [merf.py:307] Training GLL is 268.99765560219475 at iteration 2.
INFO     [merf.py:307] Training GLL is 306.1788402597862 at iteration 3.
INFO     [merf.py:307] Training GLL is 323.4259416169389 at iteration 4.
INFO     [merf.py:307] Training GLL is 331.5438569670572 at iteration 5.
INFO     [merf.py:307] Training GLL is 333.6258751864621 at iteration 6.
INFO     [merf.py:307] Training GLL is 337.0102529050606 at iteration 7.
INFO     [merf.py:307] Training GLL is 341.6669153390822 at iteration 8.
INFO     [merf.py:307] Training GLL is 340.46053931375917 at iteration 9.
INFO     [merf.py:307] Training GLL is 340.96475290141143 at iteration 10.
INFO     [merf.py:307] Training GLL is 341.3903734639695 at iteration 11.
INFO     [merf.py:307] Training GLL is 340.4433590299796 at iteration 12.
INFO     [merf.py:307] Training GLL is 343.5584397651649 at iteration 13.
INFO     [merf.py:307] Training GLL is 341.

<merf.merf.MERF at 0x135c52c80>