# Dataset Generation
### Author: Divya Veerapaneni MS4, Ong Lab
### Input: Smartguard pupil dataset,  Ground_truth dataset with patient-level fixed variables (age, sex, other demographic data)
### Description: This notebook (ipynb) preprocesses the TBI dataset to create a final dataset for logistic regression, including pupil data, demographic data, and the burden and incidence of pupil phenotypes
### Output: 
#### 1. obs_df - a csv file that shows pupil observation level data for the full patient cohort
#### 2. final_merged_df - a csv file that shows patient level data for the full patient cohort


# Preprocessing Data

In [10]:
#import statements
import pandas as pd
import numpy as np
import csv
from datetime import datetime 
import datetime
import warnings
warnings.filterwarnings("ignore")
import HelperMethods

In [11]:
# Load the inputs: the cohort traits workbook (one row per patient in the
# displayed output) and the pupillometry CSV export.
file_path = '/Users/divs/Box/1-BMC Smartguards/10-Processing and Visualization/8-TBI Pupillometry/Data/Intermediate Datasets/'
outcomes_df = pd.read_excel(
    '/Users/divs/Box/1-BMC Smartguards/10-Processing and Visualization/8-TBI Pupillometry/Data/Consolidated_Study_Cohort_Traits.xlsx'
)
pupil_df = pd.read_csv(
    '/Users/divs/Box/1-BMC Smartguards/10-Processing and Visualization/8-TBI Pupillometry/Data/all_pupils_bmc_04-04-2023.csv',
    low_memory=False,
)

# Pupil dataset: show the raw columns, keep only rows whose mrn appears in the
# cohort workbook, and narrow to the listed measurement columns.
print('Tbi_df', pupil_df.columns)
cohort_mrns = outcomes_df['MRN'].to_list()
pupil_columns = [
    'mrn', 'date',
    'npil', 'sizel', 'minl', '%l', 'cvl', 'mcvl', 'dvl', 'latl',
    'npir', 'sizer', 'minr', '%r', 'cvr', 'mcvr', 'dvr', 'latr',
]
pupil_df = pupil_df.loc[pupil_df['mrn'].isin(cohort_mrns), pupil_columns]
# Number of distinct patients that still have pupil rows after filtering.
print(len(pupil_df['mrn'].unique()))

# Ground-truth dataset: keep the listed patient-level columns and rename the
# key to 'mrn' so it matches the pupil dataset's key column.
outcome_columns = [
    'MRN', 'ID', 'Crani_Surgery',
    'Discharge_Disposition', 'Unfavorable_Outcome',
    'Orientedx3', 'Awake_Alert', 'Mechanism_Injury', 'PRES_DT',
    'ADMIT_DT', 'DISCH_DT', 'tbi_severity', 'AGE', 'SEX', 'RACE',
    'GCS', 'Deceased', 'RACE_Black', 'Rotterdam', 'Marshall',
]
outcomes_df = outcomes_df[outcome_columns].rename(columns={'MRN': 'mrn'})
outcomes_df

Tbi_df Index(['study_id', 'redcap_id', 'quip_include', 'mrn', 'date', 'npil', 'sizel',
       'minl', '%l', 'cvl', 'mcvl', 'dvl', 'latl', 'npir', 'sizer', 'minr',
       '%r', 'cvr', 'mcvr', 'dvr', 'latr', 'include', 'include_comments'],
      dtype='object')
136


Unnamed: 0,mrn,ID,Crani_Surgery,Discharge_Disposition,Unfavorable_Outcome,Orientedx3,Awake_Alert,Mechanism_Injury,PRES_DT,ADMIT_DT,DISCH_DT,tbi_severity,AGE,SEX,RACE,GCS,Deceased,RACE_Black,Rotterdam,Marshall
0,713638,52,no,Skilled Nursing Facility,0,1,1,blunt,2021-05-04,2021-05-04 16:40:00,2021-05-12 11:57:00,Moderate,54,M,White,9,0,0,2,2
1,810078,71,no,Rehab Facility,0,1,1,blunt,2022-04-15,2022-04-15 20:21:00,2022-04-28 19:10:00,Severe,83,F,Unknown,7,0,0,5,3
2,923712,92,yes,Deceased,1,0,0,blunt,2020-10-15,2020-10-15 13:06:00,2020-10-20 10:16:00,Severe,58,M,Unknown,7,1,0,6,5
3,2033891,129,no,Home,0,1,1,blunt,2022-01-17,2022-01-17 14:14:00,2022-01-31 10:34:00,Mild,56,M,White,13,0,0,2,2
4,2126184,178,yes,Skilled Nursing Facility,0,1,1,blunt,2021-12-29,2021-12-29 17:05:00,2022-01-18 15:47:00,Mild,65,M,Black,15,0,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,5401565,2492,no,Skilled Nursing Facility,0,1,1,blunt,2022-07-11,2022-07-11 05:53:00,2022-07-15 16:44:00,Mild,86,M,Unknown,14,0,0,1,5
132,5408510,2518,no,Hospice,1,0,1,blunt,2022-08-05,2022-08-05 21:44:00,2022-08-09 15:21:00,Severe,79,F,Hispanic,8,0,0,5,6
133,5410418,2527,no,Deceased,1,0,0,blunt,2022-08-14,2022-08-15 00:38:00,2022-08-19 07:32:00,Severe,84,M,White,6,1,0,4,6
134,5411815,2534,no,Deceased,1,0,0,blunt,2022-08-15,2022-08-18 19:07:00,2022-08-19 16:21:00,Severe,23,M,White,3,1,0,2,3


# Creating Df_72h Dataset (Per Observation)

In [12]:
# Clean the raw pupil readings, then keep the observations from the first
# 72 hours (3 days) as selected by create_first_x_hours_df.
cleaned_pupil_df = HelperMethods.clean_tbi_dataframe(pupil_df)
obs_df = HelperMethods.create_first_x_hours_df(cleaned_pupil_df, outcomes_df, 72)

# Run HelperMethods.compute_incidence on every observation row.
obs_df = obs_df.apply(HelperMethods.compute_incidence, axis=1)

# Attach the patient-level (per-MRN) outcome and demographic columns to each
# observation; the left join keeps every observation row.
patient_level_columns = [
    'mrn', 'ID', 'Unfavorable_Outcome', 'Deceased', 'Orientedx3',
    'Awake_Alert', 'tbi_severity', 'AGE', 'SEX', 'RACE_Black',
    'Rotterdam', 'Mechanism_Injury',
]
outcomes_for_obs_df = outcomes_df[patient_level_columns]
obs_df = obs_df.merge(outcomes_for_obs_df, how='left', on='mrn')

# Count the observations per patient and broadcast that count onto every row.
# NOTE(review): the recorded output of this cell shows a `total_obs_y` column,
# which suggests obs_df already carries a `total_obs` column before this merge
# (pandas then suffixes both with _x/_y) -- confirm in HelperMethods.
grouping = obs_df.groupby(by=['mrn'])
n_obs_df = grouping.size().rename('total_obs').to_frame()
obs_df = obs_df.merge(n_obs_df, how='left', on='mrn')
obs_df
# obs_df.to_csv(file_path + 'df_72h_9_7.csv',index=False)

Unnamed: 0,mrn,date,npil,sizel,minl,%l,cvl,mcvl,dvl,latl,...,Deceased,Orientedx3,Awake_Alert,tbi_severity,AGE,SEX,RACE_Black,Rotterdam,Mechanism_Injury,total_obs_y
0,713638,2021-05-04 23:45:00,4.8,1.84,1.46,21.0,0.54,1.01,0.26,0.30,...,0,1,1,Moderate,54,M,0,2,blunt,14
1,713638,2021-05-05 01:23:00,4.8,1.76,1.37,22.0,0.63,1.12,0.43,0.27,...,0,1,1,Moderate,54,M,0,2,blunt,14
2,713638,2021-05-05 02:08:00,4.8,2.45,1.78,27.0,0.97,1.75,0.64,0.27,...,0,1,1,Moderate,54,M,0,2,blunt,14
3,713638,2021-05-05 03:13:00,4.7,1.68,1.34,20.0,0.56,0.85,0.26,0.23,...,0,1,1,Moderate,54,M,0,2,blunt,14
4,713638,2021-05-05 08:00:00,4.7,1.96,1.55,21.0,0.84,1.21,0.36,0.27,...,0,1,1,Moderate,54,M,0,2,blunt,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4129,5412950,2022-08-26 01:54:00,4.7,4.20,2.43,42.0,2.60,4.13,0.99,0.23,...,0,1,1,Mild,71,M,0,4,blunt,24
4130,5412950,2022-08-26 03:58:00,4.6,4.50,2.64,41.0,2.72,4.49,1.22,0.20,...,0,1,1,Mild,71,M,0,4,blunt,24
4131,5412950,2022-08-26 05:48:00,4.6,3.45,2.27,34.0,2.76,3.77,1.22,0.20,...,0,1,1,Mild,71,M,0,4,blunt,24
4132,5412950,2022-08-26 08:30:00,4.4,2.80,2.28,19.0,1.25,1.72,0.72,0.27,...,0,1,1,Mild,71,M,0,4,blunt,24


# Creating MRN Dataset (Per Patient)

In [4]:
# Clean the pupil readings and keep only the observations from the first
# 72 hours (3 days) for each patient.
cleaned_pupil_df = HelperMethods.clean_tbi_dataframe(pupil_df)
df_72h = HelperMethods.create_first_x_hours_df(cleaned_pupil_df, outcomes_df, 72)  # 72 h = 3 days

# Per-patient medians (rounded to 2 decimals) plus the observation count as n_obs.
grouping = df_72h.groupby(['mrn'])
# numeric_only=True is explicit because pandas >= 2.0 no longer drops
# non-numeric columns silently and raises on them instead; only numeric
# columns are selected below, so nothing is lost.
median_pupil_data = grouping.median(numeric_only=True).round(2)
# Assign the Series directly so the counts are aligned on the mrn index rather
# than relying on positional order.
median_pupil_data['n_obs'] = grouping.size()
median_pupil_data = median_pupil_data.reset_index()
median_pupil_data = median_pupil_data[['mrn', 'lower_npi', 'average_npi', 'npi_diff', 'size_diff', 'n_obs']]

# Compute per-patient burdens with the imported helper, then derive incidence
# as a 0/1 flag: every non-zero burden becomes 1.
burdens_72h = HelperMethods.compute_burden(df_72h)
incidence_72h = burdens_72h.copy(deep=True)
incidence_72h.columns = incidence_72h.columns.str.replace('burden', 'incidence', regex=False)
# Binarize every column except the 'mrn' merge key. If compute_burden returns
# mrn as a regular column, converting it too would turn every mrn into 1 and
# the merge below would match nothing. When mrn is the index this is a no-op.
# NOTE(review): NaN.astype(bool) is True, so a missing burden would be counted
# as incidence 1 -- confirm compute_burden never returns NaN.
column_names = incidence_72h.columns[incidence_72h.columns != 'mrn']
incidence_72h[column_names] = incidence_72h[column_names].astype(bool).astype(int)

# Merge everything onto the patient-level outcomes; left joins keep every
# patient in outcomes_df even when there are no rows for them in df_72h.
# NOTE(review): this narrows the shared outcomes_df in place. The
# per-observation cell reads 'Rotterdam' and 'Mechanism_Injury' from it, so
# re-run the input cell before re-running that one.
outcomes_df = outcomes_df[['mrn', 'ID', 'Unfavorable_Outcome', 'Deceased', 'Orientedx3', 'Awake_Alert', 'tbi_severity', 'AGE', 'SEX',  'RACE_Black']]
final_merged_df = outcomes_df.merge(median_pupil_data, on='mrn', how='left') #pupil data
final_merged_df = final_merged_df.merge(burdens_72h, on='mrn', how='left') #burdens
final_merged_df = final_merged_df.merge(incidence_72h, on='mrn', how='left') #incidences
# Every remaining NaN (e.g. patients without any row in df_72h) becomes 0.
final_merged_df = final_merged_df.fillna(0)
# final_merged_df = final_merged_df.to_excel(file_path + 'temp_data.xlsx',  index=False)