## Description:
Run a series of models examining:

1. BMI and omic variables longitudinally
2. BMI and variable changes from BL to 6m and from 6m to 12m

In [1]:
import pandas as pd
import os
import re
import sys
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import matplotlib as mpl
from merf.merf import MERF
from sklearn.model_selection import train_test_split, KFold
from merf.viz import plot_merf_training_stats
from joblib import dump
# Plot styling: seaborn "poster" context and an 11x8 default figure size.
sns.set_context("poster")
mpl.rcParams.update({'figure.figsize': (11, 8)})

# Put the parent of the working directory on sys.path so that em_utils,
# imported right below, can be found.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
sys.path.append(parent_dir)
from em_utils import *

# All input locations live under one project folder; build them from two
# shared roots instead of repeating the full absolute prefix each time.
project_root = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions"
merf_dfs_root = os.path.join(project_root, "play_scripts", "2.models", "merf_python", "merf_dfs")

# Test and train set directories for input data.
# The trailing "" keeps the final path separator that the original strings had.
longitudinal_dir = os.path.join(merf_dfs_root, "5.combined", "")
delta_dir = os.path.join(project_root, "zachs_rerun", "drift_fs", "csv",
                         "all_omic_processed_data", "deltas", "")
clin_dir = os.path.join(project_root, "data", "clinical", "transformed", "aim2", "merf_ready")

# Where the hyperparameter training results are stored (one folder per data type).
delta_df_dir = os.path.join(merf_dfs_root, "6.two_timepoints_deltas")
long_df_dir = os.path.join(merf_dfs_root, "5.combined")
func_df_dir = os.path.join(merf_dfs_root, "3.functional")
taxa_long_df_dir = os.path.join(merf_dfs_root, "2.taxa")
clinical_df_dir = os.path.join(merf_dfs_root, "1.clinical")
micom_df_dir = os.path.join(merf_dfs_root, "4.micom")

In [2]:
# Longitudinal test/train tables read from longitudinal_dir.
test_all = pd.read_csv(os.path.join(longitudinal_dir, 'test_merged_all_omics_raw_meta.csv'))
train_all = pd.read_csv(os.path.join(longitudinal_dir, 'training_merged_all_omics_raw_meta.csv'))
print("test long shape = ", test_all.shape)
print("train long shape = ", train_all.shape)
print("test longitudinal: ", test_all.columns)
print("train longitudinal: ", train_all.columns)

# Delta test/train tables, loaded through the em_utils read_data helper.
test_delta = read_data(delta_dir, "jan30_all_delta_test_imp_varcheck.csv")
train_delta = read_data(delta_dir, "jan30_all_delta_train_imp_varcheck.csv")
print("test delta shape = ", test_delta.shape)
print("train delta shape = ", train_delta.shape)
print("test delta: ", test_delta.columns)
print("train delta: ", train_delta.columns)

# Hyperparameter tuning results, one table per data type.
delta_df = pd.read_csv(os.path.join(delta_df_dir, 'jan30_2times_all_omic_deltas_BMI_remove_time_var_imputed_swapped.csv'))
long_df = pd.read_csv(os.path.join(long_df_dir, 'jan13_params_fine_tuning_results_all_omics.csv'))
func_df = pd.read_csv(os.path.join(func_df_dir, 'dec6_magggie_params_fine_tuning_results_raw_functional.csv'))
taxa_df = pd.read_csv(os.path.join(taxa_long_df_dir, 'dec6_maggie_params_tuning_raw_taxa_oob.csv'))
micom_df = pd.read_csv(os.path.join(micom_df_dir, 'dec6_maggie_params_fine_tuning_results_raw_micom.csv'))
# Use clinical_df_dir (same folder as the previously hard-coded absolute path)
# so every tuning table is located through its directory constant.
meta_df = pd.read_csv(os.path.join(clinical_df_dir, 'fine_tuning_results_raw_clinical.csv'))

test long shape =  (66, 635)
train long shape =  (268, 635)
test longitudinal:  Index(['Unnamed: 0_test_long', 'record_id', 'subject_id', 'randomized_group',
       'cohort_number', 'sex', 'race', 'age', 'time', 'Glucose',
       ...
       'outcome_BMI_fnl_merged_data', 'Unnamed: 0_merged_data', 'bmi_prs',
       'old_or_new', 'predicted_BL_BMI', 'differences_BL_BMI',
       'diff_BMI_quartile', 'diff_BMI_std', 'methyl_bmi_rs',
       'methyl_bmi_rs_standardized'],
      dtype='object', length=635)
train longitudinal:  Index(['Unnamed: 0_train_long', 'record_id', 'subject_id', 'randomized_group',
       'cohort_number', 'sex', 'race', 'age', 'time', 'Glucose',
       ...
       'outcome_BMI_fnl_merged_data', 'Unnamed: 0_merged_data', 'bmi_prs',
       'old_or_new', 'predicted_BL_BMI', 'differences_BL_BMI',
       'diff_BMI_quartile', 'diff_BMI_std', 'methyl_bmi_rs',
       'methyl_bmi_rs_standardized'],
      dtype='object', length=635)
test delta shape =  (62, 920)
train delta shape 

### Longitudinal BMI first 

In [3]:
# Columns removed from both splits before modelling. The two splits share
# every label below; each additionally has its own 'Unnamed: 0_*_long' column.
shared_drop_cols = [
    'Unnamed: 0_y', 'Unnamed: 0_x', 'Unnamed: 0_merged_data',
    'sample_id', 'subject_id', 'all_samples_x', 'all_samples_y', 'record_id', 'SampleID',
    'time_x', 'time_y', 'x_t', 't', 'time',
    'outcome_BMI_fnl_x', 'outcome_BMI_fnl_y', 'outcome_BMI_fnl_merged_data',
    'old_or_new', 'predicted_BL_BMI', 'differences_BL_BMI',
    'diff_BMI_quartile', 'diff_BMI_std', 'methyl_bmi_rs', 'methyl_bmi_rs_standardized',
]

train_set = train_all.drop(columns=shared_drop_cols + ['Unnamed: 0_train_long'])
test_set = test_all.drop(columns=shared_drop_cols + ['Unnamed: 0_test_long'])

# Report the resulting shapes and the full list of remaining columns.
print("test long shape = ", test_set.shape)
print("train long shape = ", train_set.shape)
print("test longitudinal: ", list(test_set.columns))
print("train longitudinal: ", list(train_set.columns))

test long shape =  (66, 610)
train long shape =  (268, 610)
test longitudinal:  ['randomized_group', 'cohort_number', 'sex', 'race', 'age', 'Glucose', 'HDL_Total_Direct_lipid', 'HOMA_IR', 'Insulin_endo', 'LDL_Calculated', 'Triglyceride_lipid', 'outcome_BMI_fnl_test_long', 'proton', 'Diacetyl', 'Butyrate', 'Formate', 'acetone', 'X2.methylbutyrate', 'Valeric.Acid', 'Methane', 'X1.4.Dihydroxy.2.naphthoate', 'X2.Oxoglutarate', 'Pyruvate', 'D.lactate', 'L.lactate', 'laurate', 'Phenol', 'L.Carnitine', 'Isopropanol', 'p.Cresol', 'O2', 'X1.Propanol', 'butanol', 'Propanal', 'X4.hydroxyphenylacetate', 'X.S..propane.1.2.diol', 'Propane.1.3.diol', 'R_R__2_3_Butanediol', 'Oxaloacetate', 'sulfate', 'octadecenoate..n.C18.1.', 'o.Succinylbenzoate', 'Thymidine', 'Reduced.riboflavin', 'Riboflavin', 'X2.Hydroxyphenylacetate', 'Sulfite', 'thiosulfate.2..', 'X.3.4.dihydroxyphenyl.acetate', 'Diphosphate', 'trimethylamine', 'glycine.betaine', 'Choline', 'Trimethylamine.N.oxide', 'Isochorismate', 'Inosine', '

In [4]:
# Look up the positions of "proton" and "Carbon.dioxide" in train_set; the
# inclusive span between them is sliced when the predictor groups are built.
boundary_positions = {
    label: train_set.columns.get_loc(label)
    for label in ("proton", "Carbon.dioxide")
}
proton_column = boundary_positions["proton"]
carbon_dioxide_column = boundary_positions["Carbon.dioxide"]
print("Column number for 'proton': ", proton_column)
print("Column number for 'Carbon.dioxide': ", carbon_dioxide_column)


Column number for 'proton':  12
Column number for 'Carbon.dioxide':  102


In [5]:
# Predictor groups, built cumulatively. Each "KEEP" list extends the previous one.

# Columns to KEEP for only meta
meta_keep = ['all_samples','outcome_BMI_fnl_train_long', 'randomized_group', 'cohort_number', 'sex', 'race', 'age', 'Glucose', 'HDL_Total_Direct_lipid', 'HOMA_IR', 'Insulin_endo', 'LDL_Calculated', 'Triglyceride_lipid']
# Columns to KEEP for only meta + GRS (same as meta_keep plus the 'bmi_prs' score)
meta_grs = meta_keep + ['bmi_prs']
# Columns to KEEP for only meta + GRS + taxa (taxa columns are those prefixed "g__")
meta_grs_taxa = meta_grs + [col for col in train_set.columns if col.startswith("g__")]
# Columns to KEEP for only meta + GRS + taxa + pathway/functional:
# everything outside the proton..Carbon.dioxide span (inclusive).
# Names already present in meta_grs_taxa are skipped; previously they were
# appended a second time, which yields duplicated columns when the list is
# used to select from a DataFrame.
_excluded_span_cols = set(train_set.columns[proton_column:carbon_dioxide_column + 1])
_already_listed = set(meta_grs_taxa)
meta_grs_taxa_functional = meta_grs_taxa + [
    col for col in train_set.columns
    if col not in _excluded_span_cols and col not in _already_listed
]
# Columns to DROP for only GRS + taxa + pathway/functional + micom (no meta)
all_but_meta = ['randomized_group', 'cohort_number', 'sex', 'race', 'age', 'Glucose', 'HDL_Total_Direct_lipid', 'HOMA_IR', 'Insulin_endo', 'LDL_Calculated', 'Triglyceride_lipid']

#### 1.1 MERF using only the meta (clinical) variables, longitudinal

In [6]:
# Clinical-only longitudinal MERF run: load the clinical train/test tables,
# drop incomplete rows, build the MERF inputs and hand them to run_merf_analysis.
train = pd.read_csv(os.path.join(clin_dir, 'raw__train_clinical.csv'))
test = pd.read_csv(os.path.join(clin_dir, 'raw_test_clinical.csv'))
print("test longitudinal: ", list(test.columns))
print("train longitudinal: ", list(train.columns))

# Output folder and file names passed to run_merf_analysis.
output_dir = "/Users/emily/projects/research/Stanislawski/comps/mutli-omic-predictions/play_scripts/2.models/merf_python/merf_plots/long_combined"
r2_out = 'r2_only_meta_merf_bmi_long_feb5.png'
feature_imp_out = 'fi_only_meta_merf_bmi_long_feb5.png'

# Keep complete cases only.
test = test.dropna()
train = train.dropna()
print("test shape = ", test.shape)
print("train shape = ", train.shape)

# Outcome, cluster id and 'x_t' are excluded from the fixed-effect predictors.
non_predictor_cols = ['outcome_BMI_fnl', 'subject_id', 'x_t']

print("---------- Select predictors for training set ----------")
# NOTE: this rebinds train_set (and test_set below), replacing the all-omics
# frames created by the earlier column-drop cell.
train_set = train
X = train_set.drop(columns=non_predictor_cols)
Y = train_set['outcome_BMI_fnl'].to_numpy()
clusters_train = train_set['subject_id'].to_numpy()
Z = np.ones((len(train_set), 1))  # single column of ones, one row per training sample

print("---------- Select predictors for test set ----------")
test_set = test
# Align the test predictors to the training column order and dtypes.
X_new = test_set.drop(columns=non_predictor_cols)[X.columns].astype(X.dtypes)
Y_new = test_set['outcome_BMI_fnl'].to_numpy()
clusters_new = pd.Series(test_set['subject_id'])
Z_new = np.ones((len(X_new), 1))

print("---------- Run MERF models ----------")
# NOTE(review): the recorded output of this cell ends in sklearn's
# InvalidParameterError ("'min_samples_split' ... Got 2.0 instead"). Presumably
# run_merf_analysis passes float hyperparameters taken from meta_df straight to
# RandomForestRegressor - confirm in em_utils and cast them to int there.
run_merf_analysis(X, Y, Z, clusters_train,
                  X_new, Y_new, Z_new, clusters_new,
                  meta_df,
                  output_dir, r2_out, feature_imp_out)

test longitudinal:  ['subject_id', 'randomized_group', 'sex', 'race', 'age', 'time', 'Glucose', 'HDL_Total_Direct_lipid', 'HOMA_IR', 'Insulin_endo', 'LDL_Calculated', 'Triglyceride_lipid', 'outcome_BMI_fnl', 'x_t']
train longitudinal:  ['subject_id', 'randomized_group', 'sex', 'race', 'age', 'time', 'Glucose', 'HDL_Total_Direct_lipid', 'HOMA_IR', 'Insulin_endo', 'LDL_Calculated', 'Triglyceride_lipid', 'outcome_BMI_fnl', 'x_t']
test shape =  (67, 14)
train shape =  (275, 14)
---------- Select predictors for training set ----------
---------- Select predictors for test set ----------
---------- Run MERF models ----------
First 5 columns for the lowest mean_mse_score:
n_estimators          5.0
max_depth            50.0
min_samples_split     2.0
max_iter             20.0
n_splits             10.0
Name: 155, dtype: float64
First 5 columns for the lowest mean_prev_score:
n_estimators         100.0
max_depth            100.0
min_samples_split      2.0
max_iter              10.0
n_splits      

InvalidParameterError: The 'min_samples_split' parameter of RandomForestRegressor must be an int in the range [2, inf) or a float in the range (0.0, 1.0]. Got 2.0 instead.