In [2]:
# Import libraries and packages
import pandas as pd
import numpy as np
import os
import math
from datetime import datetime, timedelta
from scipy import stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the cohort table and cast the encounter identifier ('csn') to int,
# so it can be compared with integer encounter IDs from other sources.
cohort_path = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_screening/complete_cohort.csv'
cohort = pd.read_csv(cohort_path).astype({'csn': int})

### Extract features from Flowsheets and LABS

In [4]:
# Flowsheet variables to extract: {flowsheet 'Row ID' -> readable label}.
# The extraction step filters flowsheet rows on these IDs (the dict keys).
fs_vars = {
    # Weight
    14: 'Weight',
    2061017: 'Code Sheet Weight (kg)',
    # Pulse and mean arterial pressure variants
    8: 'Pulse',
    2059505: 'MAP',
    2059508: 'ABP MAP',
    2059569: 'ART MAP',
    # Blood pressure, respiration, oxygen saturation, temperature
    5: 'BP',
    9: 'Resp',
    10: 'SpO2',
    56200200: 'Perfused Pulse (SpO2)',
    6: 'Temp',
    # Oxygen delivery
    2060342: 'FiO2 (%)',
    2060631: 'PaO2/FiO2 (Calculated)',
    # Pupil assessments and coma scale
    2053015: 'Pupil Left Reaction',
    2053014: 'Pupil Left Size',
    2053013: 'Pupil Right Reaction',
    2053012: 'Pupil Right Size',
    2055126: 'Coma Scale Total',
    2060343: 'Oxygen Flow (lpm)',
    # Point-of-care blood gas
    2065111: 'POC pH',
    2065113: 'POC PO2',
    2065112: 'POC PCO2',
    # Fluids in / out
    # NOTE(review): 20570250 has one more digit than the neighbouring 205xxxx
    # IDs -- confirm against the flowsheet dictionary that this is not a typo.
    20570250: 'Volume Infused (mL)',
    2057578: 'Urine (mL)',
}

# Lab variables to extract: {lab 'Component ID' -> readable label}.
# The extraction step filters lab results on these IDs (the dict keys).
lab_vars = {
    # Chemistry panel
    2255: 'POTASSIUM',
    2711: 'SODIUM',
    541: 'CHLORIDE',
    2226: 'POC GLUCOSE',
    1285: 'GLUCOSE',
    393: 'BUN',
    733: 'CREATININE',
    417: 'CALCIUM',
    2223: 'POC CALCIUM IONIZED',
    663: 'CO2',
    # Hemoglobin, bilirubin, albumin, cell counts, PTT
    1431: 'HEMOGLOBIN',
    343: 'BILIRUBIN TOTAL',
    113: 'ALBUMIN',
    3255: 'WBC',
    2206: 'PLATELETS',
    2323: 'PTT',
    # PCO2 by sample site
    271: 'ARTERIAL POC PCO2',
    430: 'CAPILLARY POC PCO2',
    3186: 'VENOUS POC PCO2',
    # PO2 by sample site
    273: 'ARTERIAL POC PO2',
    432: 'CAPILLARY POC PO2',
    3188: 'VENOUS POC PO2',
    # Base excess / base deficit by sample site
    268: 'ARTERIAL BASE EXCESS',
    3182: 'VENOUS BASE EXCESS',
    426: 'CAP BASE EXCESS',
    267: 'ART BASE DEFICIT',
    3181: 'VENOUS BASE DEFICIT',
    425: 'CAP BASE DEFICIT',
    1333: 'HCO3',
    # Lactic acid
    1700: 'LACTIC ACID',
    2229: 'POC LACTIC ACID',
    16301: 'LACTIC ACID WHOLE BLOOD',
    315: 'BAND NEUTROPHILS % (MANUAL)',
    # pH by sample site
    272: 'ARTERIAL POC PH',
    431: 'CAPILLARY POC PH',
    3187: 'VENOUS POC PH',
    # ALT / AST
    150: 'ALT (SGPT)',
    281: 'AST (SGOT)',
    # INR and prothrombin time
    1621: 'INT NORM RATIO',
    2306: 'PROTIME',
}

# Extract the selected flowsheet and lab variables for every cohort encounter
# and save them as one long-format table.
#
# Inputs (defined in earlier cells): `cohort` (needs an integer 'csn' column),
# `fs_vars` (flowsheet Row IDs) and `lab_vars` (lab Component IDs).
# Output: `variables`, one row per recorded value, also written to parquet.

# Locations of the raw extracts and of the output file
fs_dir = '/labs/kamaleswaranlab/ECMO/new_data/new_flowsheets_feb23'
labs_path = '/labs/kamaleswaranlab/ECMO/new_data/DR15269_LABsAndPFTs.parquet.gzip'
raw_features_path = '/labs/kamaleswaranlab/dchanci/data/pediatric_sepsis/prediction_ml/updated_data/data_models/raw_features.parquet.gzip'

# Columns read from the flowsheet files; lab columns are renamed to this same
# layout so both sources can be stacked.
common_cols = ['Pat ID', 'Encounter CSN', 'Date of Birth', 'Row ID', 'Row Name', 'Recorded Time', 'Value']
lab_cols = ['Pat ID', 'Encounter CSN', 'Date of Birth', 'Component ID', 'Component', 'Result Date', 'Result']

# Loop-invariant lookups, computed once instead of once per file
cohort_csns = cohort['csn'].unique()
fs_ids = list(fs_vars)
lab_ids = list(lab_vars)

# List of flowsheets (sorted, because os.listdir returns entries in arbitrary
# order and the row order of the saved file should be reproducible)
fs_list = sorted(filter(lambda x: '.parquet' in x, os.listdir(fs_dir)))

# Loop through files to extract variables of dataset picu stays from flowsheets.
# Each filtered frame is kept as a DataFrame (dtypes preserved) and all of them
# are concatenated once at the end.
vars_data = []
for filename in fs_list:
    print('Extracting data from {} file...'.format(filename))
    # Read only the needed columns; re-select to guarantee the column order
    df = pd.read_parquet(os.path.join(fs_dir, filename), columns=common_cols)[common_cols]
    df = df.astype({'Encounter CSN': int, 'Row ID': int})
    keep = df['Row ID'].isin(fs_ids) & df['Encounter CSN'].isin(cohort_csns)
    vars_data.append(df[keep])

# Extract variables of dataset patients from labs
print('Extracting data from labs file...')
df = pd.read_parquet(labs_path, columns=lab_cols)[lab_cols]
# Rename lab columns to the flowsheet layout (Component ID -> Row ID, etc.)
df.columns = common_cols
df = df.astype({'Encounter CSN': int, 'Row ID': int})
keep = df['Row ID'].isin(lab_ids) & df['Encounter CSN'].isin(cohort_csns)
vars_data.append(df[keep])

# Create dataframe
print('Creating dataframe...')
# ignore_index=True yields a fresh 0..n-1 index, so no reset_index is needed
variables = pd.concat(vars_data, ignore_index=True)
variables.columns = ['patid', 'csn', 'dob', 'variable_id', 'variable_name', 'recorded_time', 'value']

# Save dataset file
variables.to_parquet(raw_features_path, compression='gzip')

Extracting data from Flowsheet_Rows_2010.parquet.gzip file...
Extracting data from Flowsheet_Rows_2011.parquet.gzip file...
Extracting data from Flowsheet_Rows_2012.parquet.gzip file...
Extracting data from Flowsheet_Rows_2013.parquet.gzip file...
Extracting data from Flowsheet_Rows_2014.parquet.gzip file...
Extracting data from Flowsheet_Rows_2015.parquet.gzip file...
Extracting data from Flowsheet_Rows_2016.parquet.gzip file...
Extracting data from Flowsheet_Rows_2017.parquet.gzip file...
Extracting data from Flowsheet_Rows_2018.parquet.gzip file...
Extracting data from Flowsheet_Rows_2019.parquet.gzip file...
Extracting data from Flowsheet_Rows_2020.parquet.gzip file...
Extracting data from Flowsheet_Rows_2021.parquet.gzip file...
Extracting data from Flowsheet_Rows_2022.parquet.gzip file...
Extracting data from labs file...
Creating dataframe...


In [5]:
# Report how many distinct encounters (CSNs) appear in the extracted features
n_unique_csn = variables['csn'].unique().size
print('Unique CSN total:', n_unique_csn)

Unique CSN total: 63877
