# Dataset Generation
### Author: Divya Veerapaneni MS4, Ong Lab
### Input: 
#### all_pupils_bmc.csv (read below as the dated export all_pupils_bmc_04-04-2023.csv) - multiple pupil observations per patient
#### Consolidated_Study_Cohort_Traits.xlsx - patient-level fixed variables (demographic and pupil data)
#### utilizes helper functions from HelperMethods.py
### Description: This notebook preprocesses the TBI dataset to create a final dataset for logistic regression, including pupil data, demographic data, and the incidence and frequency (burden) of abnormal pupil phenotypes, using observations from the first 72 hours
### Output: 
#### 1. obs_df - a dataframe of pupil-observation-level data for the full patient cohort (the export to file is currently commented out)
#### 2. final_merged_df - a dataframe of patient-level data for the full patient cohort (the export to file is currently commented out)


# Preprocessing Data

In [1]:
#import statements
import pandas as pd
import numpy as np
import csv
from datetime import datetime 
import datetime
import warnings
warnings.filterwarnings("ignore")
import HelperMethods

In [2]:
#load the patient-level traits workbook and the raw pupillometry export
file_path = '/Users/divs/Box/1-BMC Smartguards/10-Processing and Visualization/8-TBI Pupillometry/Data/Intermediate Datasets/'
outcomes_df = pd.read_excel('/Users/divs/Box/1-BMC Smartguards/10-Processing and Visualization/8-TBI Pupillometry/Data/Consolidated_Study_Cohort_Traits.xlsx')
pupil_df = pd.read_csv('/Users/divs/Box/1-BMC Smartguards/10-Processing and Visualization/8-TBI Pupillometry/Data/all_pupils_bmc_04-04-2023.csv', low_memory=False)

#pupil dataset: keep only rows whose mrn appears in the cohort workbook,
#and only the identifier, timestamp and per-eye measurement columns
pupil_columns = [
    'mrn', 'date',
    'npil', 'sizel', 'minl', '%l', 'cvl', 'mcvl', 'dvl', 'latl',
    'npir', 'sizer', 'minr', '%r', 'cvr', 'mcvr', 'dvr', 'latr',
]
cohort_mrns = outcomes_df['MRN'].to_list()
in_cohort = pupil_df['mrn'].isin(cohort_mrns)
pupil_df = pupil_df.loc[in_cohort, pupil_columns]

#ground-truth dataset: keep the patient-level columns listed below and
#rename MRN -> mrn so both dataframes share the same key name
outcome_columns = [
    'MRN', 'ID', 'Crani_Surgery',
    'Discharge_Disposition', 'Unfavorable_Outcome',
    'Orientedx3', 'Awake_Alert', 'Mechanism_Injury', 'PRES_DT',
    'ADMIT_DT', 'DISCH_DT', 'tbi_severity', 'AGE', 'SEX', 'RACE',
    'GCS', 'Deceased', 'RACE_Black', 'Rotterdam', 'Marshall',
]
outcomes_df = outcomes_df[outcome_columns].rename(columns={'MRN': 'mrn'})

# Creating Observation Level Dataset 

In [3]:
#clean the raw pupil readings, then keep the observations returned by the
#first-x-hours helper for a 72 hour (3 day) window
cleaned_pupil_df = HelperMethods.clean_tbi_dataframe(pupil_df)
obs_df = HelperMethods.create_first_x_hours_df(cleaned_pupil_df, outcomes_df, 72)

#run HelperMethods.compute_incidence on every observation (row-wise apply)
obs_df = obs_df.apply(HelperMethods.compute_incidence, axis=1)

#attach the selected patient-level (per MRN) columns to each observation
obs_trait_columns = [
    'mrn', 'ID', 'Unfavorable_Outcome', 'Deceased', 'Orientedx3',
    'Awake_Alert', 'tbi_severity', 'AGE', 'SEX', 'RACE_Black',
    'Rotterdam', 'Mechanism_Injury',
]
outcomes_for_obs_df = outcomes_df[obs_trait_columns]
obs_df = pd.merge(obs_df, outcomes_for_obs_df, how='left', on='mrn')

#count the observations per patient and add the count to every row as total_obs
grouping = obs_df.groupby(['mrn'])
n_obs_df = grouping.size().rename('total_obs').to_frame()
obs_df = pd.merge(obs_df, n_obs_df, how='left', on='mrn')
# obs_df.to_csv(file_path + 'df_72h_9_7.csv',index=False)

# Creating Patient Level Dataset

In [4]:
#preprocess pupil data and only collect observations from first 72 hrs for each patient
cleaned_pupil_df = HelperMethods.clean_tbi_dataframe(pupil_df)
df_72h = HelperMethods.create_first_x_hours_df(cleaned_pupil_df, outcomes_df, 72)

#group by unique mrn; take the per-patient median of the pupil summary columns
#and add the number of observations per mrn as n_obs
#the median is restricted to the columns that are kept, so the result does not
#depend on how the installed pandas version treats other (e.g. non-numeric) columns
pupil_summary_columns = ['lower_npi', 'average_npi', 'npi_diff', 'size_diff']
grouping = df_72h.groupby(['mrn'])
median_pupil_data = grouping[pupil_summary_columns].median().round(2)
#assign the size Series directly: it is aligned on the mrn index rather than by position
median_pupil_data['n_obs'] = grouping.size()
median_pupil_data = median_pupil_data.reset_index()

#compute burden (frequency) from the imported helper, then derive incidence from it
burdens_72h = HelperMethods.compute_burden(df_72h)
incidence_72h = burdens_72h.copy(deep=True)
incidence_72h.columns = incidence_72h.columns.str.replace('burden', 'incidence')
#binarise every column except the merge key: any non-zero value -> 1, zero -> 0
#'mrn' is excluded so that, if compute_burden returns it as a column, the
#identifier is not collapsed to 1; if 'mrn' is the index this is a no-op
column_names = incidence_72h.columns.drop('mrn', errors='ignore')
incidence_72h[column_names] = incidence_72h[column_names].astype(bool).astype(int)

#merge datasets together (left joins keep every patient in outcomes_df)
outcomes_df = outcomes_df[['mrn', 'ID', 'Unfavorable_Outcome', 'Deceased', 'Orientedx3', 'Awake_Alert', 'tbi_severity', 'AGE', 'SEX',  'RACE_Black']]
final_merged_df = outcomes_df.merge(median_pupil_data, on='mrn', how='left') #pupil data
final_merged_df = final_merged_df.merge(burdens_72h, on='mrn', how='left') #frequencies
final_merged_df = final_merged_df.merge(incidence_72h, on='mrn', how='left') #incidences
#NOTE(review): fillna(0) is applied to every column, so missing outcome/demographic
#values also become 0, not only the pupil columns of patients without observations - confirm this is intended
final_merged_df = final_merged_df.fillna(0)
# final_merged_df = final_merged_df.to_excel(file_path + 'temp_data.xlsx',  index=False)