In [1]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the encounter/department table and keep pediatric-ICU department stays
# whose hospital admission is on or after 2010-01-01.
dept_path = '/labs/kamaleswaranlab/ECMO/new_data/TAB2_Encounter_Departments.parquet.gzip'
dept = pd.read_parquet(dept_path)

# Parse every timestamp column so the date comparisons below are datetime-based.
dept_date_cols = ['BIRTH_DATE', 'Hosp_Admission', 'Hosp_Discharge', 'Entered_Dept', 'Exited_Dept']
dept[dept_date_cols] = dept[dept_date_cols].apply(pd.to_datetime)

# Keep only the columns used by this notebook and give them short snake_case names.
dept = dept[['Pat ID', 'Encounter CSN', 'Name', 'BIRTH_DATE', 'Department', 'Entered_Dept', 'Exited_Dept', 'Hosp_Admission', 'Hosp_Discharge']]
dept.columns = ['patid', 'csn', 'name', 'dob', 'department', 'entered_dept', 'exited_dept', 'hosp_adm', 'hosp_disch']

# na=False: a row with a missing department name is explicitly a non-match
# instead of propagating NaN into the boolean mask.
is_picu = dept['department'].str.contains('PEDIATRIC ICU', na=False)
admitted_since_2010 = dept['hosp_adm'] >= '2010-01-01'
# .copy() so the column assignment below acts on an independent frame, not on a
# slice of the raw table (the resulting warning was hidden by filterwarnings).
dept = dept[is_picu & admitted_since_2010].copy()

# Drop incomplete rows BEFORE the integer cast: astype(int) raises on a missing
# CSN, so casting first would fail on exactly the rows dropna() is meant to remove.
dept = dept.dropna()
dept['csn'] = dept['csn'].astype(int)
dept = dept.drop_duplicates()

# Load the ETT / tracheostomy device table.
ett_path = '/labs/kamaleswaranlab/ECMO/new_data/TAB6_ETTs_and_Trachs.parquet.gzip'
mv_ett = pd.read_parquet(ett_path)
# Positional rename: assumes the file has exactly these eight columns in this
# order -- TODO confirm against the source schema.
mv_ett.columns = ['patid', 'name', 'dob', 'mrn', 'lda_type', 'placement_date', 'removal_date', 'lda_in_place']
ett_date_cols = ['dob', 'placement_date', 'removal_date']
mv_ett[ett_date_cols] = mv_ett[ett_date_cols].apply(pd.to_datetime)

# `dept` holds one row per department stay, so the same encounter (csn) can
# appear several times with identical admission/discharge times. Collapse to
# one row per encounter first; otherwise the merge on patid copies every
# device row once per department stay.
encounters = dept[['patid', 'csn', 'hosp_adm', 'hosp_disch']].drop_duplicates()
mv_ett = mv_ett.merge(encounters, how='inner', on='patid')

# Keep devices placed during the matched hospitalization (and not before 2010).
placed_during_stay = (mv_ett['placement_date'] >= mv_ett['hosp_adm']) & (mv_ett['placement_date'] <= mv_ett['hosp_disch'])
placed_since_2010 = mv_ett['placement_date'] >= '2010-01-01'
mv_ett = mv_ett[placed_during_stay & placed_since_2010]

# Create list of CSNs with tracheostomy tube (used later to exclude encounters).
trach_list = mv_ett.loc[mv_ett['lda_type'] == 'Tracheostomy Tube', 'csn'].unique().tolist()

# Filter patients with endotracheal tube.
# NOTE(review): dropna() removes a row when ANY column is missing, including
# columns that are not used afterwards (e.g. removal_date, lda_in_place).
# Kept as in the original; confirm that discarding such placements is intended.
mv_ett = (
    mv_ett[mv_ett['lda_type'] == 'Endotracheal Tube']
    .dropna()
    .reset_index(drop=True)
)

# Organize data: one resp_indicator event per ETT placement, timestamped at
# the placement time.
mv_ett = (
    mv_ett[['patid', 'csn', 'dob', 'placement_date']]
    .rename(columns={'placement_date': 'recorded_time'})
    .assign(variable_id=2, variable_name='resp_indicator', value=1)
)
mv_ett = mv_ett[['patid', 'csn', 'dob', 'variable_id', 'variable_name', 'recorded_time', 'value']]

In [4]:
# Load mechanical ventilation indicators
mv_fs = pd.read_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/mv_indicators_raw.parquet.gzip')
mv_fs[['dob', 'recorded_time']] = mv_fs[['dob', 'recorded_time']].apply(pd.to_datetime)

# A row without a CSN can never match an encounter in `dept`, and astype(int)
# raises on missing values, so drop those rows before casting.
mv_fs = mv_fs.dropna(subset=['csn'])
mv_fs['csn'] = mv_fs['csn'].astype(int)
# .copy() so later column assignments act on an independent frame, not a slice.
mv_fs = mv_fs[mv_fs['csn'].isin(dept['csn'].unique())].copy()

# Remove rows with no recorded value before any string matching on `value`.
mv_fs = mv_fs.dropna(subset=['value'])

# Create list of CSNs with tracheostomy tube: any 'Apparatus Type' or
# 'Mechanical Vent Initial' entry whose value mentions 'Trach'.
# na=False treats non-string values as non-matches.
is_airway_variable = mv_fs['variable_name'].isin(['Apparatus Type', 'Mechanical Vent Initial'])
mentions_trach = mv_fs['value'].str.contains('Trach', na=False)
trach_list.extend(mv_fs.loc[is_airway_variable & mentions_trach, 'csn'].unique().tolist())

# Filter Mechanical Vent Initial: drop entries whose value is in `discard`.
discard = ['Off-Discontinued Vent', 'V/Trach']
is_discarded = (mv_fs['variable_name'] == 'Mechanical Vent Initial') & (mv_fs['value'].isin(discard))
mv_fs = mv_fs[~is_discarded]

# Every remaining row becomes a resp_indicator event at its recorded time,
# using the same column layout as the ETT-derived events.
mv_fs = mv_fs[['patid', 'csn', 'dob', 'recorded_time']].assign(
    variable_id=2,
    variable_name='resp_indicator',
    value=1,
)
mv_fs = mv_fs[['patid', 'csn', 'dob', 'variable_id', 'variable_name', 'recorded_time', 'value']]

In [5]:
# Concat dataframes: stack the ETT-derived and flowsheet-derived events.
# ignore_index=True because the two inputs carry unrelated, overlapping labels.
mv = pd.concat([mv_ett, mv_fs], ignore_index=True)

# Discard CSNs with trach, then renumber so the saved file has a clean
# RangeIndex rather than leftover row labels stored as an extra index column.
mv = mv[~mv['csn'].isin(trach_list)].reset_index(drop=True)

# Save file
mv.to_parquet('/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/resp_data.parquet.gzip', compression='gzip')