# Preprocess Cox Dataset
### Author: Divya Veerapaneni MS4, Ong Lab
### Description: This ipynb preprocesses the TBI dataset for the Cox model.  
### Input: 
#### all_pupils_bmc.csv - multiple pupil observations per patient
#### Consolidated_Study_Cohort_Traits.xlsx - patient-level fixed variables (demographic and pupil data)
#### utilizes helper functions from HelperMethods.py
### Output: cox dataset

In [1]:
#import statements
import os
import pandas as pd
import numpy as np
import csv
from datetime import datetime 
import seaborn as sns
import matplotlib.pyplot as plt
from os import path
from scipy.stats import f_oneway
import datetime
import warnings
import statistics
import HelperMethods
warnings.filterwarnings("ignore")

In [2]:
#read input files as dataframes
#data_dir is the common root of both raw inputs; file_path is only referenced by the
#(commented-out) CSV export at the end of this file
data_dir = '/Users/divs/Box/1-BMC Smartguards/10-Processing and Visualization/8-TBI Pupillometry/Data/'
file_path = data_dir + 'Intermediate Datasets/'
outcomes_df = pd.read_excel(data_dir + 'Consolidated_Study_Cohort_Traits.xlsx')
pupil_df = pd.read_csv(data_dir + 'all_pupils_bmc_04-04-2023.csv', low_memory=False)

#pre-process pupil dataset: keep rows whose mrn appears in the outcomes table, then the listed columns
pupil_df = pupil_df[pupil_df.mrn.isin(outcomes_df.MRN.to_list())]
pupil_df = pupil_df[['mrn', 'date', 'npil', 'sizel','minl', '%l', 'cvl', 'mcvl', 'dvl', 'latl', 'npir', 'sizer', 'minr', '%r', 'cvr', 'mcvr', 'dvr', 'latr']]

#pre-process patient dataset
#each column is listed exactly once: repeating a label here (as 'Mechanism_Injury' previously was)
#yields duplicate column names that carry through every later selection and merge
outcomes_df = outcomes_df[['MRN', 'ID', 'Crani_Surgery',
       'Discharge_Disposition', 'Unfavorable_Outcome',
       'Orientedx3', 'Awake_Alert', 'Mechanism_Injury', 'PRES_DT',
       'ADMIT_DT', 'DISCH_DT', 'tbi_severity', 'AGE',
       'GCS', 'Rotterdam', 'Marshall', 'Deceased', 'RACE_Black']]
outcomes_df = outcomes_df.rename(columns={'MRN':'mrn'}) #match the pupil dataset's key name so merges can use on='mrn'

#preprocess pupil data
cleaned_pupil_df = HelperMethods.clean_tbi_dataframe(pupil_df) #pre_process dataframe
#replace exact zeros with NaN so NaN-skipping aggregations (e.g. mean) ignore them
cleaned_pupil_df['lower_npi_0_removed'] = cleaned_pupil_df.lower_npi.replace(0, np.nan)
cleaned_pupil_df['average_npi_0_removed'] = cleaned_pupil_df.average_npi.replace(0, np.nan)
#NOTE(review): 5000 is presumably an hour window large enough to include every observation -- confirm against HelperMethods
obs_df = HelperMethods.create_first_x_hours_df(cleaned_pupil_df, outcomes_df, 5000) #obtain observations for all patients

#compute incidence of abnormal pupil phenotype stages (helper is applied row by row)
obs_df = obs_df.apply(HelperMethods.compute_incidence, axis=1)

#select desired columns (computed per patient/MRN) and attach them to every observation row
outcomes_for_obs_df = outcomes_df[['mrn', 'ID', 'Unfavorable_Outcome', 'Deceased', 'Orientedx3', 'Awake_Alert', 'tbi_severity', 'AGE', 'RACE_Black', 'Rotterdam', 'Marshall', 'Mechanism_Injury']]
obs_df = obs_df.merge(outcomes_for_obs_df, on ='mrn', how='left')

# group by unique mrn and add the number of observation rows for each mrn as total_obs
#NOTE(review): if obs_df already carries a total_obs column from the helper, this merge produces
#total_obs_x / total_obs_y, which the next cell's drop/rename of those names appears to rely on -- confirm
grouping = obs_df.groupby(['mrn'])
n_obs_df = grouping.size().to_frame('total_obs')
obs_df = obs_df.merge(n_obs_df, on='mrn', how='left')

In [3]:
# compute time from admission to discharge for each patient
outcomes_df['ADMIT_DT'] = pd.to_datetime(outcomes_df['ADMIT_DT'])
outcomes_df['DISCH_DT'] = pd.to_datetime(outcomes_df['DISCH_DT'])
outcomes_df['time_to_discharge'] = outcomes_df['DISCH_DT'] - outcomes_df['ADMIT_DT']

# create df where average pupil metrics is calculated when duplicate observations occur
pupil_metric_cols = ['average_npi', 'lower_npi', 'average_npi_0_removed', 'lower_npi_0_removed', 'npi_diff', 'size_diff']
# columns are selected with a list: selecting several groupby columns with a bare tuple of keys
# was deprecated in pandas 1.0 and raises in pandas 2.x
averaged_duplicate_df = obs_df.groupby(['mrn', 'date'], as_index=False)[pupil_metric_cols].mean() #took mean pupil metrics of rows where MRN and date same
merged_df = obs_df.merge(averaged_duplicate_df, on=['mrn', 'date'], how='left') #merge original df with averaged out pupil metrics
# after the merge, *_x holds the original per-row values and *_y the per-(mrn, date) means; keep the means
# NOTE(review): total_obs_x / total_obs_y only exist if obs_df carried two total_obs columns into this cell
# (i.e. the earlier total_obs merge collided with an existing column); otherwise this drop raises KeyError -- confirm
merged_df = merged_df.drop([col + '_x' for col in pupil_metric_cols] + ['total_obs_x'], axis=1)
merged_df = merged_df.rename(columns={col + '_y': col for col in pupil_metric_cols + ['total_obs']})
# one row per (mrn, date), ordered by patient then time; chained rather than sorting the
# de-duplicated frame in place
averaged_df = merged_df.drop_duplicates(subset=['mrn', 'date']).sort_values(by=['mrn', 'time'], ascending=True)

In [4]:
#helper function to switch format of column data from sequential values to intervals as follows: ie. col - 1, 2, 5, etc.  ---->  col - 0 to 1, 1 to 2, 2 to 5, etc.
def create_start_end_column_version(input_df, col):
    """Add ``<col>_start`` / ``<col>_end`` interval columns derived from ``col``.

    Row ``i`` gets ``<col>_end`` equal to its own ``col`` value and
    ``<col>_start`` equal to the ``col`` value of the row before it; the first
    row starts at 0.  Rows are taken in their current positional order and no
    grouping is applied, so the previous row is used even if it belongs to a
    different subject.

    Args:
        input_df: DataFrame containing ``col``.  It is modified in place.
        col: Name of the column holding the sequential values.

    Returns:
        The same ``input_df`` object with the two new columns added.
    """
    interval_ends = input_df[col].tolist()
    # Prepend the 0 origin and drop the final value so each start is the previous end.
    interval_starts = ([0] + interval_ends)[:-1]
    input_df[col + '_start'] = interval_starts
    input_df[col + '_end'] = interval_ends
    return input_df

#helper function to switch deceased data from 0 to 1 for desired mrns
#sets deceased to 1 on very last observation time for patient
def helper_create_interval_outcome(df, mrn, mrns_to_change):
    """Return one patient's rows with a ``Deceased_cox`` event-indicator column.

    ``Deceased_cox`` is 0 on every row, except that when ``mrn`` is in
    ``mrns_to_change`` the positionally last row is set to 1.  "Last" means
    last in the current row order of ``df``; the rows are not sorted here.

    Args:
        df: DataFrame with an ``mrn`` column.  It is not modified.
        mrn: Identifier of the patient whose rows are extracted.
        mrns_to_change: Container of mrns whose final row should be flagged.

    Returns:
        A new DataFrame holding the rows where ``df.mrn == mrn`` plus the
        ``Deceased_cox`` column.  It is empty (but still has the column) when
        no row matches.
    """
    # Explicit copy: the new column is written to an independent frame rather
    # than to a slice of df, so no SettingWithCopy hazard.
    patient_df = df[df.mrn == mrn].copy()
    patient_df['Deceased_cox'] = 0
    # Guard the empty case: iloc[-1] on a frame with no rows raises IndexError.
    if mrn in mrns_to_change and not patient_df.empty:
        patient_df.iloc[-1, patient_df.columns.get_loc('Deceased_cox')] = 1
    return patient_df

#helper function to iterate through all mrns and change only mrns with bad outcomes
def create_interval_outcome(input_df, bad_outcome_pts):
    """Add a ``Deceased_cox`` column to ``input_df``, processed one mrn at a time.

    Each unique mrn's rows are passed to ``helper_create_interval_outcome``
    and the per-patient frames are concatenated in order of first appearance
    of each mrn.

    Args:
        input_df: DataFrame with an ``mrn`` column.
        bad_outcome_pts: Iterable of mrns to be flagged by the helper.

    Returns:
        A new DataFrame with the rows of ``input_df`` grouped by mrn and an
        added ``Deceased_cox`` column.  An empty ``input_df`` yields an empty
        copy that still carries the column.
    """
    # Build the lookup once so each per-mrn membership test is O(1).
    flagged_mrns = set(bad_outcome_pts)
    per_patient = [
        helper_create_interval_outcome(input_df, mrn, flagged_mrns)
        for mrn in input_df.mrn.unique()
    ]
    if not per_patient:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the same schema the non-empty path would produce.
        empty_df = input_df.copy()
        empty_df['Deceased_cox'] = 0
        return empty_df
    return pd.concat(per_patient)

#switch time from sequential to interval formatting, working on a copy of averaged_df
#NOTE(review): this calls the HelperMethods version, not the create_start_end_column_version defined in this file;
#if it behaves like the local one, the first row of each patient takes time_start from the previous patient's last time -- confirm
preprocessed_df = HelperMethods.create_start_end_column_version(averaged_df.copy(), 'time')

#mrns that have at least one row with Deceased == 1
deceased_rows = preprocessed_df[preprocessed_df.Deceased == 1]
pts_deceased = list(deceased_rows.mrn.unique())

#format deceased column: Deceased_cox is 1 only on the last row of each deceased patient
final_df = create_interval_outcome(preprocessed_df, pts_deceased)

#columns kept in the cox dataset
cox_columns = [
    'ID', 'time', 'time_start', 'time_end', 'date',
    'average_npi', 'lower_npi', 'average_npi_0_removed', 'lower_npi_0_removed',
    'npi_diff', 'size_diff',
    'any_incidence', 'poor_npi_incidence', 'npi_diff_incidence', 'size_diff_incidence',
    'uni_any_incidence', 'stage1u_incidence', 'stage2u_incidence', 'stage3u_incidence', 'stage4u_incidence',
    'bi_incidence', 'stage1b_incidence', 'stage2b_incidence', 'stage3b_incidence',
    'npi_diff_size_diff_incidence', 'size_diff_poor_npi_incidence',
    'Deceased_cox', 'tbi_severity', 'AGE', 'RACE_Black', 'Rotterdam', 'Mechanism_Injury', 'total_obs',
]
final_df = final_df[cox_columns]
#final_df.to_csv(file_path + 'Cox_9-13.csv', index=False)