In [None]:
import datetime as dt
import csv
import logging
import os
import pickle
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr, zscore
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# RABIT model training

In [None]:
# Input 1: EHR representation table (one 'patient_ids' row per representation).
df_reprs_final = pd.read_csv(
    '/path/to/motor_representations/ehr_representations_random.csv'
)
# Input 2: measured proteomics table; protein columns are named '<protein>_protein'
# and patients are keyed by 'eid'.
prot = pd.read_csv(
    '/path/to/measured_proteomics/from/uk_biobank/measured_proteomics_random.csv'
)

In [None]:
# Protein names = every '<name>_protein' column of the proteomics table, suffix stripped.
poi_list = [
    column[:-len('_protein')]
    for column in prot.columns
    if column.endswith('_protein')
]
poi_list

In [None]:
# Route DEBUG-and-above messages of the root logger to a log file.
logger = logging.getLogger()
_log_path = '/path/to/outputdir/mylog.log'

# Re-running this cell must not stack a second FileHandler on the root logger;
# otherwise every message is written once per execution of the cell.
for _stale in [h for h in logger.handlers
               if isinstance(h, logging.FileHandler)
               and h.baseFilename == os.path.abspath(_log_path)]:
    logger.removeHandler(_stale)
    _stale.close()

fhandler = logging.FileHandler(filename=_log_path, mode='w')  # 'w' truncates the file on each run; use 'a' to append
formatter = logging.Formatter('%(asctime)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

In [None]:
# Empty results table: one row per protein is appended by the training loop.
_metric_columns = [
    "protein",
    'pearson_correlation',
    'pearson_pvalue',
    "spearman_correlation",
    "spearman_pvalue",
    "best_train_rmse",
    "best_val_rmse",
    "best_test_rmse",
]
data = {column: [] for column in _metric_columns}
df = pd.DataFrame(data)


In [None]:
# Accumulator for predictions: starts with one 'sample_eid' column copied from prot['eid'];
# a '<protein>_protein' column is merged in for every protein processed.
predvalues_df = prot[['eid']].rename(columns={'eid': 'sample_eid'})

# function to populate the dataframe with new columns
def populate_dataframe(df, poi, predictions, y):
    """Left-merge the predictions for one protein into the accumulated prediction table.

    Parameters
    ----------
    df : pandas.DataFrame
        Accumulated table keyed by 'sample_eid'.
    poi : str
        Protein name; the new column is called ``poi + "_protein"``.
    predictions : sequence
        One value per row of ``y`` (positional), ``None`` allowed for missing.
    y : pandas.DataFrame
        Frame holding 'patient_ids' and the measured ``poi + "_protein"`` column.
        It is NOT modified: the measured column is replaced by ``predictions``
        on a copy.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the new prediction column; samples without a match get NaN.
    """
    column_name = poi + "_protein"
    # Work on a new frame: the caller's `y` is usually a slice of a larger frame, and
    # dropping/assigning on it in place both mutates the caller's data and triggers
    # pandas' SettingWithCopy behaviour.
    predicted = y.drop(columns=[column_name]).assign(**{column_name: predictions})
    merged = pd.merge(df, predicted, left_on='sample_eid', right_on='patient_ids', how='left')
    return merged.drop(columns=['patient_ids'])

In [None]:
def data_org(protein_name, latentdf, prot):
    """Build the feature and target frames for one protein.

    Patients without a measured value for the protein are dropped, the remaining
    measurements are inner-joined to the representation table
    (``latentdf.patient_ids == prot.eid``), and the 'eid' and 'labeling_time'
    columns are removed.

    Returns
    -------
    tuple
        ``(poi, X, y)`` where ``X`` is everything except the protein column
        (it still contains 'patient_ids') and ``y`` holds the protein column
        plus 'patient_ids'.
    """
    poi = protein_name
    target_column = poi + '_protein'
    print(target_column)

    # measurements for this protein only, restricted to patients that have one
    measured = prot[[target_column, 'eid']].dropna()

    # join onto the representations, then strip the join key and the extra column
    merged_clean = (
        latentdf
        .merge(measured, left_on='patient_ids', right_on='eid', how='inner')
        .drop(columns=['eid', 'labeling_time'])
    )

    X = merged_clean.drop(target_column, axis=1)
    y = merged_clean[[target_column, 'patient_ids']]
    return poi, X, y




class _BootstrapRegressionModel(nn.Module):
    """Single-hidden-layer regressor (input -> 32 -> ReLU -> 1) trained in each run.

    Attribute names (layer1 / layer2) define the state_dict keys of the saved
    weight files, so they must not be renamed.
    """

    def __init__(self, input_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 32)  # Input to hidden layer
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(32, 1)  # Hidden to output layer

    def forward(self, x):
        return self.layer2(self.relu(self.layer1(x)))


def _rows_for_patients(frame, ids):
    """Rows of `frame` whose 'patient_ids' occur in `ids`, with that column dropped."""
    return frame[frame['patient_ids'].isin(ids['patient_ids'])].drop(columns=['patient_ids'])


def _frames_to_loader(features, targets, shuffle):
    """Wrap a feature/target frame pair as float32 tensors in a batch-size-32 DataLoader."""
    dataset = TensorDataset(
        torch.tensor(features.values, dtype=torch.float32),
        torch.tensor(targets.values, dtype=torch.float32),
    )
    return DataLoader(dataset, batch_size=32, shuffle=shuffle)


def _run_epoch(model, loader, criterion, optimizer=None):
    """Sample-weighted mean loss over `loader`; takes optimizer steps when one is given."""
    total = 0.0
    for inputs, labels in loader:
        if optimizer is not None:
            optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        total += loss.item() * inputs.size(0)
    return total / len(loader.dataset)


def bootstrap_nn(poi, X, y, bootstrap, output_dir):
    """Train `bootstrap` small regression networks on repeated patient-level splits.

    For run ``i`` the unique patient ids are split 50/50 into train/test
    (``random_state=i``); the training half is split again 80/20 into
    train/validation. A network is trained with Adam (lr=0.001) and MSE loss for at
    most 500 epochs, stopping after 10 epochs without validation improvement. The
    weights of the best validation epoch are restored, saved to
    ``<output_dir>/models/<poi>_model_weights/bootstrap_model_<i>.pt`` and used to
    predict the test half.

    Parameters
    ----------
    poi : str
        Protein name, used for the weight directory name.
    X : pandas.DataFrame
        Features plus a 'patient_ids' column (which is not used as a feature).
    y : pandas.DataFrame
        Target column plus 'patient_ids'. Its index is expected to be 0..n-1,
        because predictions are keyed by ``str(index label)``.
    bootstrap : int
        Number of repeated splits / models.
    output_dir : str
        Directory under which the ``models`` sub-directory is created.

    Returns
    -------
    tuple
        ``(prediction_dict, train_error_dict, val_error_dict)``:
        ``prediction_dict`` maps ``str(row)`` to the list of test-time predictions
        (plain floats) collected for that row, or ``None`` if the row never landed
        in a test split; the error dicts map the run index to the per-epoch mean
        MSE lists.
    """
    train_error_dict = {}
    val_error_dict = {}
    prediction_dict = {str(row): None for row in range(y.shape[0])}

    # splits are drawn over patients, not rows, so one patient never spans two sets
    unique_patient_ids_df = pd.DataFrame(X['patient_ids'].unique(), columns=['patient_ids'])

    # <output_dir>/models/<poi>_model_weights (intermediate directories are created too)
    model_weights_dir = os.path.join(output_dir, "models", f"{poi}_model_weights")
    os.makedirs(model_weights_dir, exist_ok=True)

    input_size = X.shape[1] - 1  # 'patient_ids' is not a feature
    num_epochs = 500  # Maximum number of epochs
    patience = 10  # Number of epochs to wait for a validation improvement

    for i in range(bootstrap):
        # The data splits are already seeded with `i`; seed torch as well so weight
        # initialisation and batch shuffling are reproducible per run.
        torch.manual_seed(i)

        # 50/50 train/test split, then 80/20 train/validation split of the train half
        train_ids_initial, test_ids = train_test_split(unique_patient_ids_df, test_size=0.5, random_state=i)
        train_ids, val_ids = train_test_split(train_ids_initial, test_size=0.2, random_state=i)

        X_train, y_train = _rows_for_patients(X, train_ids), _rows_for_patients(y, train_ids)
        X_val, y_val = _rows_for_patients(X, val_ids), _rows_for_patients(y, val_ids)
        X_test, y_test = _rows_for_patients(X, test_ids), _rows_for_patients(y, test_ids)

        train_loader = _frames_to_loader(X_train, y_train, shuffle=True)
        val_loader = _frames_to_loader(X_val, y_val, shuffle=False)
        test_loader = _frames_to_loader(X_test, y_test, shuffle=False)

        model = _BootstrapRegressionModel(input_size)
        criterion = nn.MSELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)

        train_losses = []
        val_losses = []
        best_val_loss = float('inf')
        best_model_state = None
        best_epoch = 0
        epochs_no_improve = 0

        # training loop with early stopping
        for epoch in range(num_epochs):
            model.train()
            train_loss = _run_epoch(model, train_loader, criterion, optimizer)
            train_losses.append(train_loss)

            model.eval()
            with torch.no_grad():
                val_loss = _run_epoch(model, val_loader, criterion)
            val_losses.append(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                # state_dict() hands back references to the live parameter tensors, which
                # keep changing as training continues. Clone them, otherwise the "best"
                # snapshot silently becomes the last-epoch weights.
                best_model_state = {
                    name: tensor.detach().clone()
                    for name, tensor in model.state_dict().items()
                }
                best_epoch = epoch + 1
                epochs_no_improve = 0  # Reset the counter if there is an improvement
            else:
                epochs_no_improve += 1

            # early stopping condition
            if epochs_no_improve >= patience:
                print(f'Early stopping at epoch {epoch+1}')
                break

            print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')

        # store training and validation losses for plotting
        train_error_dict[i] = train_losses
        val_error_dict[i] = val_losses

        print(f'The best model was from epoch {best_epoch} with Validation Loss: {best_val_loss:.4f}')

        # restore the best-validation weights and persist them for this run
        model.load_state_dict(best_model_state)
        model_path = os.path.join(model_weights_dir, f'bootstrap_model_{i}.pt')
        torch.save(model.state_dict(), model_path)
        print(f"Saved model for bootstrap iteration {i} at {model_path}")

        # predict the held-out half; .tolist() yields plain Python floats so downstream
        # averaging/type checks do not depend on numpy scalar types
        predicted = []
        model.eval()
        with torch.no_grad():
            for inputs, _ in test_loader:
                predicted.extend(model(inputs).numpy().flatten().tolist())

        # test_loader is not shuffled, so predictions line up with y_test's row order
        for row_label, value in zip(y_test.index, predicted):
            key = str(row_label)
            if key in prediction_dict:
                if prediction_dict[key] is None:
                    prediction_dict[key] = []
                prediction_dict[key].append(value)

    return prediction_dict, train_error_dict, val_error_dict


def plot_correlation(poi, y, prediction_dict, savepath):
    """Average the per-sample predictions, correlate them with the measured values and plot.

    Parameters
    ----------
    poi : str
        Protein name (used in the title and, with '/' replaced by '_', in the file name).
    y : pandas.DataFrame
        Measured values plus a 'patient_ids' column; row order must match the order
        of ``prediction_dict``.
    prediction_dict : dict
        Maps a row key to a list of predictions, or ``None`` when the row was never
        predicted.
    savepath : str
        Output directory; the figure is written to
        ``<savepath>/correlation/<poi>_scorr.png``.

    Returns
    -------
    tuple
        ``(poi, pearson_r, pearson_p, spearman_r, spearman_p, y_avg_pred)`` where
        ``y_avg_pred`` has one entry per row of ``y`` (float, or ``None`` if the row
        has no prediction). Correlations are NaN when fewer than two rows have a
        prediction.
    """
    # pearsonr is not part of the file-level scipy import, so import it here
    from scipy.stats import pearsonr, spearmanr

    # mean prediction per sample as a plain float; None where nothing was predicted
    y_avg_pred = [
        float(np.mean(values, dtype=float)) if values else None
        for values in prediction_dict.values()
    ]

    y_np = y.drop(columns=['patient_ids']).to_numpy().reshape(-1)

    # Select rows by "has a prediction" instead of isinstance(x, (int, float)):
    # numpy float32 scalars are not Python floats, so the type test can discard
    # every prediction depending on the numpy promotion rules in use.
    has_prediction = np.array([avg is not None for avg in y_avg_pred], dtype=bool)
    filtered_y_np = y_np[has_prediction]
    filtered_y_avg_pred = np.array(
        [avg for avg in y_avg_pred if avg is not None], dtype=float
    )

    if len(filtered_y_np) > 1:
        pcorrelation, ppvalue = pearsonr(filtered_y_np, filtered_y_avg_pred)
        scorrelation, spvalue = spearmanr(filtered_y_np, filtered_y_avg_pred)
    else:
        pcorrelation, ppvalue = np.nan, np.nan
        scorrelation, spvalue = np.nan, np.nan

    # Use a dedicated figure so the plot size does not depend on whatever pyplot
    # figure happened to be current, and close it so figures do not pile up.
    fig, ax = plt.subplots()
    try:
        ax.scatter(filtered_y_np, filtered_y_avg_pred, alpha=0.5)
        ax.set_xlabel("Actual Values")
        ax.set_ylabel("Average Predicted Values")
        ax.set_title(f"{poi} Predicted vs. Actual\n(Spearman Correlation: {scorrelation:.2f}, Pvalue: {spvalue})")

        # same range on both axes plus the identity line; skipped when there is
        # nothing to plot (min()/max() of an empty array would raise)
        if len(filtered_y_np) > 0:
            min_val = min(filtered_y_np.min(), filtered_y_avg_pred.min())
            max_val = max(filtered_y_np.max(), filtered_y_avg_pred.max())
            ax.set_xlim(min_val, max_val)
            ax.set_ylim(min_val, max_val)
            ax.plot([min_val, max_val], [min_val, max_val], color='red', linestyle='--')

        save_directory = os.path.join(savepath, "correlation")
        os.makedirs(save_directory, exist_ok=True)
        modpoi = poi.replace("/", "_")  # '/' in a protein name would create a sub-directory
        fig.savefig(os.path.join(save_directory, f"{modpoi}_scorr.png"))
    finally:
        plt.close(fig)

    return poi, pcorrelation, ppvalue, scorrelation, spvalue, y_avg_pred

def pad_errors(error_dict):
    """Right-pad every list in `error_dict` with NaN up to the longest list's length.

    Returns a new dict with the same keys; the input lists are not modified.
    """
    longest = max(map(len, error_dict.values()))
    padded_errors = {}
    for run, losses in error_dict.items():
        missing = longest - len(losses)
        padded_errors[run] = losses + [np.nan] * missing
    return padded_errors

def plot_learning_curve(poi, train_error_dict, val_error_dict, savepath):
    """Plot the run-averaged training/validation loss per epoch and save the figure.

    Runs that stopped early are NaN-padded to the longest run, and the per-epoch
    mean ignores those NaNs. The figure is written to
    ``<savepath>/learning_curves/<poi>_learning_curve.png`` ('/' in ``poi`` is
    replaced by '_').

    Parameters
    ----------
    poi : str
        Protein name for the title and file name.
    train_error_dict, val_error_dict : dict
        Map a run index to that run's list of per-epoch losses.
    savepath : str
        Output directory.

    Returns
    -------
    tuple
        ``(best_train_rmse, best_val_rmse)``: square roots of the minimum of the
        averaged training and validation curves respectively.
    """
    train_errors = np.array(list(pad_errors(train_error_dict).values()))
    val_errors = np.array(list(pad_errors(val_error_dict).values()))

    # mean error per epoch across all bootstrap iterations
    mean_train_errors = np.nanmean(train_errors, axis=0)
    mean_val_errors = np.nanmean(val_errors, axis=0)
    epochs = np.arange(1, len(mean_train_errors) + 1)

    # plt.clf() only clears a figure and leaves it open, so one figure per protein
    # would accumulate; draw on an explicit figure and always close it.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.plot(epochs, mean_train_errors, label='Average Train Error', marker='o')
        ax.plot(epochs, mean_val_errors, label='Average Validation Error', marker='x')
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Error')
        ax.set_title(f'{poi} Learning Curve')
        ax.legend()
        ax.grid(True)

        save_directory = os.path.join(savepath, "learning_curves")
        os.makedirs(save_directory, exist_ok=True)
        plot_filename = os.path.join(save_directory, f"{poi.replace('/', '_')}_learning_curve.png")
        fig.savefig(plot_filename)
    finally:
        plt.close(fig)

    # minimum of each averaged curve, reported as a root (RMSE when the loss is MSE)
    best_train_rmse = np.sqrt(np.min(mean_train_errors))
    best_val_rmse = np.sqrt(np.min(mean_val_errors))
    return best_train_rmse, best_val_rmse
    

In [None]:
# loop through all proteins: train, evaluate, plot and checkpoint the results after each one
outputdir = "/path/to/outputdir/"
bootstrap = 10  # number of repeated train/test splits (models) per protein
for i, poi in enumerate(poi_list):
    logger.debug(f"i index {i}")
    print("i index:", i)
    start_time = time.time()
    logger.debug(f"Started {poi}")
    poi, X, y = data_org(poi, df_reprs_final, prot)

    predictions, train_error_dict, valid_error_dict = bootstrap_nn(poi, X, y, bootstrap, outputdir)
    poi, pcorrelation, ppvalue, scorrelation, spvalue, y_avg_pred = plot_correlation(poi, y, predictions, outputdir)
    best_train_rmse_error, best_val_rmse_error = plot_learning_curve(poi, train_error_dict, valid_error_dict, outputdir)

    # both CSVs are rewritten after every protein so partial results survive an interruption
    predvalues_df = populate_dataframe(predvalues_df, poi, y_avg_pred, y)
    predvalues_df.to_csv(os.path.join(outputdir, 'predvalues.csv'), index=False)

    # the last column (best_test_rmse) is not computed by this pipeline and stays empty
    df.loc[i] = [poi, pcorrelation, ppvalue, scorrelation, spvalue, best_train_rmse_error, best_val_rmse_error, None]
    df.to_csv(os.path.join(outputdir, 'output.csv'), index=False)

    elapsed_time = time.time() - start_time
    # no grid search is run here; report the time per protein
    print(f"Time spent on {poi}: {elapsed_time:.2f} seconds")
    logger.debug(f"Time spent protein: {elapsed_time:.2f} seconds")

In [None]:
# Reload the per-protein metrics written by the training loop; the p-values in it
# are still uncorrected at this point.
output = pd.read_csv(f"{outputdir}/output.csv")
output

In [None]:
# adjust p-values using Benjamini-Hochberg
def benjamini_hochberg(p_values, alpha=0.05):
    """Return Benjamini-Hochberg adjusted p-values in the original order.

    Parameters
    ----------
    p_values : array-like of float
        Raw p-values (1-D). NaN entries are treated as "no test performed": they are
        excluded from the number of tests and stay NaN in the result.
    alpha : float, optional
        Unused; kept for backward compatibility of the signature.

    Returns
    -------
    numpy.ndarray
        Adjusted p-values, same length and order as the input, each capped at 1.
    """
    p = np.asarray(p_values, dtype=float)
    adjusted = np.full(p.shape, np.nan)

    # NaNs must not inflate m, otherwise every valid p-value is over-corrected
    valid = ~np.isnan(p)
    m = int(valid.sum())
    if m == 0:
        return adjusted

    tested = p[valid]
    order = np.argsort(tested)
    scaled = tested[order] * m / np.arange(1, m + 1)
    # enforce monotonicity from the largest p-value downwards, then cap at 1
    scaled = np.minimum(np.minimum.accumulate(scaled[::-1])[::-1], 1.0)

    # undo the sort, then write back into the positions of the non-NaN inputs
    restored = np.empty(m, dtype=float)
    restored[order] = scaled
    adjusted[valid] = restored
    return adjusted

# Add one BH-corrected column per correlation type (spearman first, then pearson),
# then save and display the table.
for _corr in ('spearman', 'pearson'):
    output[f'bh_corrected_{_corr}_pvalue'] = benjamini_hochberg(output[f'{_corr}_pvalue'].values)
output.to_csv(outputdir+'/output_adjusted.csv', index=False)
output



# Using trained model to generate protein values

In [None]:
# poi_list (same as above, list of protein names that you are predicting)

In [None]:
def data_org(protein_name, latentdf):
    """Prepare the representation table for inference with the saved models.

    NOTE: this definition replaces the three-argument ``data_org(protein_name,
    latentdf, prot)`` defined earlier in the notebook; once this cell has run,
    the training loop can no longer call the earlier version.

    Prints ``'<protein_name>_protein'`` and returns ``(protein_name, frame)`` where
    ``frame`` is a copy of ``latentdf`` without the 'labeling_time' column.
    """
    poi = protein_name
    print(poi + '_protein')
    # only 'labeling_time' is removed; 'patient_ids' is kept for the output table
    latentdf_clean = latentdf.drop(columns=['labeling_time'])
    return poi, latentdf_clean



# Model architecture; must match what was used during training so the saved
# state_dict keys (layer1.*, layer2.*) and shapes line up.
class RegressionModel(nn.Module):
    """Feed-forward regressor: Linear(input_size, 32) -> ReLU -> Linear(32, 1)."""

    def __init__(self, input_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, 32)  # input -> hidden
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(32, 1)  # hidden -> single output

    def forward(self, x):
        hidden = self.relu(self.layer1(x))
        return self.layer2(hidden)

# Load every saved bootstrap model for a protein and average their predictions
def generate_predictions(poi, latentdf_clean, model_root="/path/to/saved_models/models", n_bootstrap=10):
    """Predict one protein for every row of `latentdf_clean` with the saved models.

    Parameters
    ----------
    poi : str
        Protein name; weights are read from ``<model_root>/<poi>_model_weights``.
    latentdf_clean : pandas.DataFrame
        Must contain 'patient_ids' and the feature columns, which are all columns
        whose name starts with "data_".
        NOTE(review): training used every column except 'patient_ids' as a feature;
        confirm that this equals the "data_" columns, otherwise loading the weights
        fails with a shape mismatch.
    model_root : str, optional
        Directory that holds the per-protein weight directories.
    n_bootstrap : int, optional
        Number of ``bootstrap_model_<i>.pt`` files to load (i = 0..n_bootstrap-1).

    Returns
    -------
    pandas.DataFrame
        Columns 'patient_ids' and ``'<poi>_prediction'`` (mean over the models).

    Raises
    ------
    ValueError
        If ``n_bootstrap`` is smaller than 1.
    """
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be at least 1")

    model_dir = os.path.join(model_root, f"{poi}_model_weights")

    feature_columns = [col for col in latentdf_clean.columns if col.startswith("data_")]
    X = latentdf_clean[feature_columns].values
    patient_ids = latentdf_clean['patient_ids']

    # one model instance is reused; only its weights are swapped per bootstrap file
    model = RegressionModel(X.shape[1])
    # the input tensor is identical for every model, so build it once
    inputs = torch.tensor(X, dtype=torch.float32)

    all_predictions = []
    for i in range(n_bootstrap):
        print('bootstrap model number', i+1)
        model_path = os.path.join(model_dir, f"bootstrap_model_{i}.pt")

        # weights_only=True restricts unpickling to tensors/plain containers
        model.load_state_dict(torch.load(model_path, weights_only=True))
        model.eval()

        with torch.no_grad():
            all_predictions.append(model(inputs).numpy().flatten())

    # average across the bootstrap models, one value per input row
    avg_predictions = np.mean(all_predictions, axis=0)

    return pd.DataFrame({'patient_ids': patient_ids, f"{poi}_prediction": avg_predictions})



In [None]:
# Representation table the saved models are applied to.
ehr_rep = pd.read_csv(
    '/path/to/ehr_motor_representations/ehr_representations_random.csv'
)
ehr_rep

In [None]:
# dataframe to store RABIT proteomics
all_predictions_df = pd.DataFrame()

# loop through all proteins
# (the notebook defines the protein names as `poi_list`; `proteinlist` does not exist
# and raised a NameError on a fresh kernel)
for i, poi in enumerate(poi_list):
    print('index:', i)
    print(poi)
    print('data cleaning')
    poi, latentdf_clean = data_org(poi, ehr_rep)

    print('generating values')
    predvalues = generate_predictions(poi, latentdf_clean)

    print('merging data')
    # the first protein seeds the table; later ones are joined on patient_ids
    if all_predictions_df.empty:
        all_predictions_df = predvalues
    else:
        all_predictions_df = all_predictions_df.merge(predvalues, on="patient_ids", how="left")

# NOTE(review): this must name a CSV file, not a directory — confirm the target path
all_predictions_df.to_csv('/path/to/outputdir', index=False)

