# Base extraction script for all features used in ICP model

#### Features from different files are extracted separately in each part, before being merged into two files: static features and dynamic features

Data source: eICU Collaborative Research Database, accessed from https://physionet.org/content/eicu-crd/2.0/

Pollard TJ, Johnson AEW, Raffa JD, Celi LA, Mark RG and Badawi O. The eICU Collaborative Research Database, a freely available multi-center database for critical care research. Scientific Data (2018). DOI: http://dx.doi.org/10.1038/sdata.2018.178.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Folder containing EICU data
EICU_DIR = "."

# Folder to store extracted patients data
INTERIM_DIR = "./data"

# 1) Extract patient cohort
---

Patients in eICU are selected if:

    a) they have been diagnosed with a brain injury of interest and 

    b) they have $\geq$ 100 ICP readings

### 1a) Extract patients with relevant Brain Injury Diagnoses

---

- **TBI:**

    ICD10CM: S02*, S04*, S06*, S07*, S09*, T02, T04, T06
    
    ICD9CM: 800*, 801*, 802*, 803*, 804*, 850.1 – 850.5, 850.9, 851*, 852*, 853*, 854*, 959.01

- **Severe TBI:**

    G93.5/348.4 Brain herniation, G93.6 Brain edema

- **Intracerebral Hemorrhage:**

    I61, 431, 432
    I60, 430
    430, 331.3, G91.0, I60.9
    I62.01, I62.02, 4321
    I62.1, 4320

- **Acute Ischemic Stroke:**

    I63, 433, 434.x1, I65

- **Cerebral Venous Thrombosis:**

    437.6, I67.6

- **Hepatic encephalopathy and other non neurological encephalopathy:**

    572.2, K72.01, K72.11,K72.91
    348.30, G93.40

- **Chronic Neurosurgical Conditions (code all as 1 variable, there should be very few):**

    331.3, 331.4, 331.5, G91
    348.2, G93.2
    I62.03

In [2]:
diag_df = pd.read_csv(EICU_DIR + "/diagnosis.csv.gz", compression="gzip")
diagnosis_types_regex = [
    r"^(S0[24679]|T0[246]|80[0-4]|850.[1-59]|85[1-4]|959.01$)", 
    r"^G93.[56]",
    r".*(G91.0|I6[01]|I62.(0[12]|1)|43[012]|331.3)",
    r".*(I63|433|434.[0-9]1|I65)",
    r".*(437.6|I67.6)",
    r".*(572.2|K72.01|K72.11|K72.91|348.30|G93.40)",
    r".*(331.[345]|G91|348.2|G93.2|I62.03)"
]
patients_with_diagnoses = []
for regex in diagnosis_types_regex:
    patients_with_diagnoses.extend(list(diag_df.loc[diag_df['icd9code'].str.match(regex) == True]["patientunitstayid"].unique()))
    
patients_with_diagnoses = list(set(patients_with_diagnoses))

### 1b) Keep patients with $\geq$ 100 ICP records

In [3]:
vital_dfs = pd.read_csv(EICU_DIR + "/vitalPeriodic.csv.gz", compression="gzip", chunksize=50000)
icp_records = []
for v in tqdm(vital_dfs):
    icp_records.append(v.loc[(v['patientunitstayid'].isin(patients_with_diagnoses)) & (~v['icp'].isnull())])
icp_records = pd.concat(icp_records)

2934it [02:20, 20.95it/s]


In [13]:
# reused later
filtered_patients_df = icp_records[icp_records['patientunitstayid'].map(icp_records['patientunitstayid'].value_counts()) >= 100]

filtered_patients_counts = filtered_patients_df.value_counts(['patientunitstayid'])
print(filtered_patients_counts)
print(f"Mean number of icp readings per patient: {filtered_patients_counts.mean():.3f} ")
print(f"SD of number of icp readings per patient: {filtered_patients_counts.std():.3f} ")
length_of_stay = filtered_patients_df.groupby('patientunitstayid')['observationoffset'].agg('max')
print(f"Mean length of stay in the ICU: {length_of_stay.mean() / 60 / 24 :.3f} days")
print(f"SD of length of stay in the ICU: {length_of_stay.std() / 60 / 24 :.3f} days")
print(f"Percentage missingness of ICP measurements: {filtered_patients_df['icp'].isna().sum() / len(filtered_patients_df) * 100}%")
# checking extreme icp values
cleaned_icp_df = filtered_patients_df[(filtered_patients_df['icp'] <= 100) & (filtered_patients_df['icp'] >= 0)]
print(f"Percentage of odd ICP measurements (ICP < 0 or ICP > 100): {100 - len(cleaned_icp_df) / len(filtered_patients_df) * 100:.3f}%")
filtered_patients_df = filtered_patients_df.sort_values(by=["patientunitstayid", "observationoffset"])




patientunitstayid
3227651              11662
2673737              10341
454498               10258
490002               10214
649154                8861
                     ...  
562222                 107
2767544                105
2952757                104
2494195                104
1570129                101
Length: 941, dtype: int64
Mean number of icp readings per patient: 2200.109 
SD of number of icp readings per patient: 1765.991 
Mean length of stay in the ICU: 8.647 days
SD of length of stay in the ICU: 6.704 days
Percentage missingness of ICP measurements: 0.0%
Percentage of odd ICP measurements (ICP < 0 or ICP > 100): 7.798%


In [6]:
with open(INTERIM_DIR + "/patients.txt", "w") as f:
    f.write(" ".join(filtered_patients_df['patientunitstayid'].astype(str).unique()))

patients = sorted(list(filtered_patients_df['patientunitstayid'].unique()))

# 2) Extract patient static info
---

- **Basic Info (on admission):**

    age, gender, BMI, ethnicity, GCS

- **Diagnoses (refer to ICD codes and categories in 1a):**

    TBI, Severe-TBI, IH, AIS, CVT, HE, CNC

Cheat: unitdischargestatus

In [7]:
# basic info
patient_info_df = pd.read_csv(EICU_DIR + "/patient.csv.gz", compression="gzip")
patient_info_df = patient_info_df[patient_info_df["patientunitstayid"].isin(patients)]
print(f"Percentage missingness of mortality: {patient_info_df['unitdischargestatus'].isna().sum() / len(patient_info_df) * 100:.3f}%")
# patient_info_df["unitdischargestatus"].fillna("Alive")
patient_info_df.age = patient_info_df.age.replace("> 89", "89").astype(int)
patient_info_df["BMI"] = patient_info_df["admissionweight"] / (patient_info_df["admissionheight"]/100) ** 2

apache = pd.read_csv(EICU_DIR + "/apachePredVar.csv.gz", compression="gzip")
apache["GCS"] = apache["verbal"] + apache["motor"] + apache["eyes"]

basic_info_df = patient_info_df.merge(apache[["patientunitstayid", "GCS"]], how="left", on="patientunitstayid")[[
    "patientunitstayid", 
    "gender",
    "age", 
    "ethnicity", 
    "BMI", 
    "GCS", 
    "unitdischargestatus"
]]
basic_info_df.loc[basic_info_df[basic_info_df["BMI"] > 100].index, "BMI"] = np.nan

# diagnoses
diagnoses = [
    "TBI", "Severe-TBI", "IH", "AIS", "CVT", "HE", "CNC"
]
patient_diagnoses_df = pd.DataFrame(
    np.zeros((len(patients), len(diagnoses)), dtype=int), 
    index=patients, 
    columns=diagnoses
)

for i, regex in enumerate(diagnosis_types_regex):
    patients_with_diag = set(patients) & set(diag_df.loc[diag_df['icd9code'].str.match(regex) == True]["patientunitstayid"])
    patient_diagnoses_df.loc[list(patients_with_diag), diagnoses[i]] = 1
    
# combine
static_info_df = basic_info_df.merge(patient_diagnoses_df, left_on="patientunitstayid", right_index=True)
static_info_df.to_csv(INTERIM_DIR + "/static_info.csv", index=False)
static_info_df

Percentage missingness of mortality: 0.000%


Unnamed: 0,patientunitstayid,gender,age,ethnicity,BMI,GCS,unitdischargestatus,TBI,Severe-TBI,IH,AIS,CVT,HE,CNC
0,143881,Female,73,African American,22.851562,15.0,Alive,0,0,1,0,0,0,0
1,145396,Female,54,Caucasian,,15.0,Alive,1,0,0,0,0,0,0
2,145603,Male,85,Hispanic,23.206991,14.0,Alive,0,0,1,0,0,0,0
3,157475,Female,58,Asian,,5.0,Alive,0,0,1,0,0,0,0
4,162779,Female,59,Caucasian,24.572971,15.0,Alive,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
936,3349062,Female,47,African American,42.623299,,Expired,0,0,1,1,0,1,0
937,3349394,Male,51,Caucasian,21.403540,3.0,Expired,0,0,1,0,0,0,0
938,3349812,Male,21,Other/Unknown,27.359332,3.0,Alive,1,0,0,0,0,0,0
939,3350216,Female,35,Other/Unknown,27.696926,4.0,Alive,1,0,0,0,0,0,0


## 3) Extract patient dynamic readings
---

Features are extracted from the following tables and combined. Missing values are not dealt with here. 

- a) Basic vital signs from vitalPeriodic
- b) Vital signs from nurseCharting
- c) Drug infusions from infusionDrug
- d) Lab results
- e) Treatments and procedures recorded in treatments


- f) Merging

Records from b to e for each patient are merged to the nearest timestamp in a

### 3a) Basic Vital Signs
---
In 5 minute intervals:

- ICP
- temperature, sao2, heart rate, respiration rate, cvp, eto2, systemic/pa systolic/diastolic/mean

In [9]:
with open(INTERIM_DIR + "/patients.txt", "r") as f:
    patients = list(map(int, f.read().strip().split()))

vital_dfs = pd.read_csv(EICU_DIR + "/vitalPeriodic.csv.gz", compression="gzip", chunksize=50000)
icp_records = []
for v in tqdm(vital_dfs):
    icp_records.append(v.loc[(v['patientunitstayid'].isin(patients_with_diagnoses)) & (~v['icp'].isnull())])
icp_records = pd.concat(icp_records)

2934it [02:10, 22.40it/s]


In [10]:
basic_vitals_df = icp_records.drop(columns=["vitalperiodicid", "st1", "st2", "st3"]).sort_values(by=["patientunitstayid", "observationoffset"])

In [11]:
basic_vitals_df.to_csv(INTERIM_DIR + "/raw/basic_vitals.csv", index=False)
basic_vitals_df

Unnamed: 0,patientunitstayid,observationoffset,temperature,sao2,heartrate,respiration,cvp,etco2,systemicsystolic,systemicdiastolic,systemicmean,pasystolic,padiastolic,pamean,icp
235768,143881,450,,100.0,68.0,29.0,,,148.0,44.0,70.0,,,,11.0
235576,143881,455,,100.0,66.0,28.0,,,146.0,44.0,70.0,,,,20.0
235912,143881,460,,100.0,68.0,27.0,,,150.0,46.0,72.0,,,,5.0
235408,143881,465,,99.0,64.0,18.0,,,146.0,44.0,70.0,,,,-7.0
235772,143881,470,,100.0,68.0,19.0,,,148.0,46.0,70.0,,,,-15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146459329,3351831,6912,,98.0,72.0,,3.0,,,,,,,,15.0
146458181,3351831,6917,,98.0,72.0,,2.0,,,,,,,,13.0
146458527,3351831,6922,,98.0,85.0,,1.0,,,,,,,,14.0
146458271,3351831,6927,,96.0,84.0,,2.0,,,,,,,,15.0


### 3b) Nurse Vital Signs
---

In [19]:
nurse_vitals = [
    'Glasgow coma score - Eyes',
    'Glasgow coma score - GCS Total',
    'Glasgow coma score - Motor',
    'Glasgow coma score - Verbal',
    'Invasive BP - Invasive BP Diastolic',
    'Invasive BP - Invasive BP Mean',
    'Invasive BP - Invasive BP Systolic',
    'Non-Invasive BP - Non-Invasive BP Diastolic',
    'Non-Invasive BP - Non-Invasive BP Mean',
    'Non-Invasive BP - Non-Invasive BP Systolic',
    'CVP - CVP',
    'Temperature - Temperature (C)',
    'Heart Rate - Heart Rate',
    'Respiratory Rate - Respiratory Rate',
    'SpO2 - Value',
]

nurseCharting = pd.read_csv(EICU_DIR + "/nurseCharting.csv.gz", compression="gzip", chunksize=50000)
nurse = []
for n in tqdm(nurseCharting):
    nurse.append(n.loc[(n['patientunitstayid'].isin(patients))])
nurse = pd.concat(nurse)
nurse["comb"] = nurse["nursingchartcelltypevallabel"] + " - " + nurse["nursingchartcelltypevalname"]
sorted_nurse = nurse[["patientunitstayid", "nursingchartoffset", "nursingchartvalue", "comb"]] \
    .loc[(nurse['comb'].isin(nurse_vitals))] \
    .sort_values(by=['patientunitstayid', "nursingchartoffset", "comb"])

3033it [02:23, 21.19it/s]


In [22]:
records = []
for patient in tqdm(patients):
    patient_vitals = sorted_nurse[sorted_nurse['patientunitstayid']==patient]
    for time in patient_vitals["nursingchartoffset"].unique():
        record = [patient, time] + [np.nan for _ in range(len(nurse_vitals))]
        pv = patient_vitals[patient_vitals["nursingchartoffset"]==time]
        for index, row in pv.iterrows():
            record[2 + nurse_vitals.index(row["comb"])] = row["nursingchartvalue"]
        records.append(record)
nursing_vitals_df = pd.DataFrame(records, columns=["patientunitstayid", "offset"] + nurse_vitals)

100%|█████████████████████████████████████████████████████████████████████████████████| 941/941 [02:09<00:00,  7.24it/s]


In [23]:
nursing_vitals_df.to_csv(INTERIM_DIR + "/raw/nursing_vitals.csv", index=False)
nursing_vitals_df

Unnamed: 0,patientunitstayid,offset,Glasgow coma score - Eyes,Glasgow coma score - GCS Total,Glasgow coma score - Motor,Glasgow coma score - Verbal,Invasive BP - Invasive BP Diastolic,Invasive BP - Invasive BP Mean,Invasive BP - Invasive BP Systolic,Non-Invasive BP - Non-Invasive BP Diastolic,Non-Invasive BP - Non-Invasive BP Mean,Non-Invasive BP - Non-Invasive BP Systolic,CVP - CVP,Temperature - Temperature (C),Heart Rate - Heart Rate,Respiratory Rate - Respiratory Rate,SpO2 - Value
0,143881,-102,,15,,,,,,,,,,,,,
1,143881,-87,,,,,,,,98,,214,,,,,
2,143881,-81,,,,,,,,100,,221,,,78,,
3,143881,-80,,15,,,,,,,,,,,,,
4,143881,-79,,,,,,,,,,,,36,68,26,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454196,3351831,18426,4,15,6,5,,,,,,,,,,,
454197,3351831,18650,4,15,6,5,,,,,,,,,,,
454198,3351831,18890,4,15,6,5,,,,,,,,,,,
454199,3351831,19130,4,15,6,5,,,,,,,,,,,


### 3c) Drug Infusions
---

In [24]:
hypertonic_saline = [
    '3% NACL (ml/hr)',
    '3 % NaCl (ml/hr)',
    '3 % SALINE (ml/hr)',
    '3 % saline (ml/hr)',
    '3 %NaCl (ml/hr)',
    '3 NACL (ml/hr)',
    '3 nacl (ml/hr)',
    '3%  SALINE (ml/hr)',
    '3% HYPERTONIC SALINE (ml/hr)',
    '3% Hypertonic NaCl (ml/hr)',
    '3% Hypertonic Saline',
    '3% Hypertonic Saline (ml/hr)',
    '3% NACL (ml/hr)',
    '3% NORMAL SALINE (ml/hr)',
    '3% NaCL  (ml/hr)',
    '3% NaCL (ml/hr)',
    '3% NaCl (ml/hr)',
    '3% Nacl (ml/hr)',
    '3% Normal Saline (ml/hr)',
    '3% SALINE (ml/hr)',
    '3% Saline (ml/hr)',
    '3% hypertonic saline  (ml/hr)',
    '3% hypertonic saline (ml/hr)',
    '3% nacl (ml/hr)',
    '3% normal saline (ml/hr)',
    '3% saline',
    '3% saline (ml/hr)',
    '3%NACL (ml/hr)',
    '3%NaCL (ml/hr)',
    '3%NaCl (ml/hr)',
    '3%Nacl (ml/hr)',
    '3%hypertonic saline (ml/hr)',
    'Hypertonic 3% Saline (ml/hr)',
    'Hypertonic 3% saline (ml/hr)',
    'Hypertonic Saline (ml/hr)',
    'Hypertonic Saline 3% (ml/hr)',
    'Hypertonic saline (ml/hr)',
    'NaCl 3% (ml/hr)',
    'NaCl 3% Hypertonic (ml/hr)',
    'hypertonic saline (ml/hr)',
    "NaCl 3% (Hypertonic Saline)",
]

mannitol = [
    'Mannitol (ml/hr)',
    'Mannitol 20%',
    'Mannitol 20% (ml/hr)',
    'Mannitol IVF Infused (ml/hr)',
    'mannitol (ml/hr)',
]

drug_categories = [
    [
        "Propofol",
        "Lorazepam",
        "Midazolam",
        "Dexmedetomidine",
        "Diazepam",
        "Haloperidol",
        "Etomidate",
        #Barbituate (in eICU under treatment: neurologic|therapy for controlling cerebral perfusion pressure|sedative agent|barbiturate),
        "Pentobarbital",
    ], [
        "Cisatracurium",
        "Vecuronium",
        "Rocuronium",
        "Atracurium",
    ], [
        "Nicardipine",
        "Nicardipine 40mg/200",
        "Labetalol",
        "Hydralazine",
        "Nimodipine",
        "Metoprolol",
        "Esmolol",
        "Milrinone",
        "Dilitazem",
        "Clevidipine",
        "Meperidine",
        "Verapamil",
    ], [
        "Norepinephrine",
        "Phenylephrine",
        "Phenylephrine (50/250)",
        "Phenylephrine (200/250)",
        "Dopamine",
        "Dobutamine",
        "Epinephrine",
        "Vasopressin",
    ], [
        "Fentanyl",
        "Fentanyl concentrate",
        "Hydromorphone (Dilaudid)",
        "Morphine Sulfate",
        "Meperidine (Demerol)",
        "Acetaminophen",
    ], [
        "Heparin",
        "Heparin Sodium",
        "Enoxaparin (Lovenox)",
        "Coumadin (Warfarin)",
    ], [
        "Keppra (Levitaceram)",
        "Gabapentin",
        "Diazepam",
        "Fosphenytoin",
    ], [
        "OR Crystalloid Intake",
        "D5 1/2NS",
        "D5NS",
        "OR Colloid Intake",
        "D5LR",
        "Albumin 5%",
        "Dextrose 50%",
        "Albumin 25%",
        "Dextrose 10%"
    ], hypertonic_saline + mannitol
]

drug_category_names = [
    "Sedatives", "Paralytics", "Antihypertensives", "Vasopressors", "Opoids", "Anticoagulants", "Anti-epileptics", "Fluid Resuscitation", "Saline_Mannitol"
]

In [25]:
drugs = pd.read_csv(EICU_DIR + "/infusionDrug.csv.gz", compression="gzip")

  drugs = pd.read_csv(EICU_DIR + "/infusionDrug.csv.gz", compression="gzip")


In [26]:
def label_drug(row):
    for i, category in enumerate(drug_categories):
        if str(row["drugname"]) in category:
            return drug_category_names[i]
    return "others"

drugs = drugs[drugs["patientunitstayid"].isin(patients)].copy()
drugs["drug_all"] = drugs.apply(label_drug, axis=1)

drugs_raw_df = drugs[drugs["drug_all"].isin(drug_category_names)].sort_values(by=['patientunitstayid', "infusionoffset"])
drugs_raw_df.to_csv(INTERIM_DIR + "/raw/drugs_infusions_raw.csv", index=False)

In [27]:
records = []
for patient in tqdm(patients):
    patient_drugs = drugs_raw_df[drugs_raw_df['patientunitstayid']==patient]
    for time in patient_drugs["infusionoffset"].unique():
        record = [patient, time] + [np.nan for _ in range(len(drug_category_names))]
        pv = patient_drugs[patient_drugs["infusionoffset"]==time]
        for index, row in pv.iterrows():
            record[2 + drug_category_names.index(row["drug_all"])] = 1
        records.append(record)
drugs_df = pd.DataFrame(records, columns=["patientunitstayid", "offset"] + drug_category_names)

100%|███████████████████████████████████████████████████████████████████████████████| 941/941 [00:00<00:00, 1618.96it/s]


In [28]:
drugs_df.to_csv(INTERIM_DIR + "/raw/drugs_infusions_boolean.csv", index=False)
drugs_df

Unnamed: 0,patientunitstayid,offset,Sedatives,Paralytics,Antihypertensives,Vasopressors,Opoids,Anticoagulants,Anti-epileptics,Fluid Resuscitation,Saline_Mannitol
0,306989,425,,,,,,,,,1.0
1,306989,1625,,,,,,,,,1.0
2,306989,1865,,,,,,,,,1.0
3,306989,2165,,,,,,,,,1.0
4,306989,2405,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...
2121,3244190,16709,,,,,,,,,1.0
2122,3244190,17557,,,,,,,,,1.0
2123,3244190,17909,,,,,,,,,1.0
2124,3244190,18149,,,,,,,,,1.0


## 3d) Labs
---
Sodium and glucose lab values

In [29]:
with open(INTERIM_DIR + "/patients.txt", "r") as f:
    patients = list(map(int, f.read().strip().split()))

lab_dfs = pd.read_csv(EICU_DIR + "/lab.csv.gz", compression="gzip", chunksize=50000)
lab = []
for l in tqdm(lab_dfs):
    lab.append(l[l["patientunitstayid"].isin(patients)])
lab = pd.concat(lab)


783it [00:40, 19.11it/s]


In [30]:
sodium = lab[(lab["patientunitstayid"].isin(patients)) & (lab["labname"]=="sodium")].rename(columns={"labresult": "sodium"})
glucose = lab[(lab["patientunitstayid"].isin(patients)) & (lab["labname"]=="glucose")].rename(columns={"labresult": "glucose"})

In [31]:
sod_glu = sodium[["patientunitstayid", "labresultoffset", "sodium"]].merge(
    glucose[["patientunitstayid", "labresultoffset", "glucose"]], 
    how="outer", 
    on=["patientunitstayid", "labresultoffset"]
).sort_values(["patientunitstayid", "labresultoffset"]).rename(columns={"labresultoffset": "offset"})

In [32]:
sod_glu.to_csv(INTERIM_DIR + "/raw/labs.csv", index=False)
sod_glu

Unnamed: 0,patientunitstayid,offset,sodium,glucose
12,143881,-90,141.0,105.0
9,143881,497,138.0,148.0
2,143881,718,141.0,
16,143881,939,142.0,159.0
4,143881,1180,145.0,
...,...,...,...,...
26550,3351831,10796,132.0,98.0
26551,3351831,12438,133.0,95.0
26554,3351831,13991,134.0,93.0
26558,3351831,17310,131.0,110.0


## 3e) Procedures and Treatment

In [33]:
treatment_types = [
    [
        "neurologic|therapy for controlling cerebral perfusion pressure|intracranial/cerebral perfusion pressure monitoring|CSF drainage via ventriculostomy",
        "neurologic|procedures / diagnostics|intracranial/cerebral perfusion pressure monitoring|ventriculostomy",
        "neurologic|procedures / diagnostics|neurosurgery|ventricular shunt procedure",
        "neurologic|procedures / diagnostics|neurosurgery|ventricular shunt procedure|ventricular-peritoneal",
        "neurologic|procedures / diagnostics|neurosurgery|ventricular shunt procedure|lumbar-peritoneal",
        "neurologic|procedures / diagnostics|neurosurgery|ventricular shunt procedure|ventricular-atrial",

    ], 
    [
        "neurologic|procedures / diagnostics|neurosurgery|therapeutic craniotomy|for hematoma",
        "neurologic|ICH/ cerebral infarct|surgery|craniotomy for drainage of intracranial hematoma",
        "neurologic|procedures / diagnostics|neurosurgery|drainage of hematoma-craniotomy",
        "neurologic|procedures / diagnostics|neurosurgery|diagnostic craniotomy",
        "neurologic|procedures / diagnostics|neurosurgery|hemicraniectomy",
        "neurologic|ICH/ cerebral infarct|surgery|hemicraniectomy",

    ],
    [
        "neurologic|procedures / diagnostics|lumbar puncture"
    ], 
    [
        "neurologic|procedures / diagnostics|angiogram|with coiling",
        "neurologic|procedures / diagnostics|angiogram|with embolization",
        "neurologic|ICH/ cerebral infarct|angiogram|with coiling",
        "neurologic|procedures / diagnostics|angiogram|with infusion of vasodilator",
        "neurologic|ICH/ cerebral infarct|angiogram|with embolization",
        "neurologic|ICH/ cerebral infarct|angiogram|with infusion of vasodilator",
        "neurologic|ICH/ cerebral infarct|angiogram|with cerebral angioplasty",
        "neurologic|procedures / diagnostics|angiogram|with stenting",

    ], 
    [
        "pulmonary|ventilation and oxygenation|mechanical ventilation"
    ]
]

treatment_categories = [
    "Shunting", "Intracranial operations", "Lumbar Puncture", "Neurovascular Procedures", "Mechanical Ventilation"
]


def label_treatment(row):
    for i, group in enumerate(treatment_types):
        if str(row["treatmentstring"]) in group:
            return treatment_categories[i]
    return "others"
    

In [34]:
treatments = pd.read_csv(EICU_DIR + "/treatment.csv.gz", compression="gzip")
treatments = treatments.loc[treatments["patientunitstayid"].isin(patients)]
treatments["treatmenttype"] = treatments.apply(label_treatment, axis=1)
treatments = treatments.loc[treatments["treatmenttype"]!="others"].sort_values(by=["patientunitstayid", "treatmentoffset"])

In [35]:
records = []
for patient in tqdm(patients):
    patient_treatments = treatments[treatments['patientunitstayid']==patient]
    for time in patient_treatments["treatmentoffset"].unique():
        record = [patient, time] + [np.nan for _ in range(len(treatment_categories))]
        pv = patient_treatments[patient_treatments["treatmentoffset"]==time]
        for index, row in pv.iterrows():
            record[2 + treatment_categories.index(row["treatmenttype"])] = 1
        records.append(record)
treatments_df = pd.DataFrame(records, columns=["patientunitstayid", "offset"] + treatment_categories)

100%|████████████████████████████████████████████████████████████████████████████████| 941/941 [00:01<00:00, 693.30it/s]


In [36]:
treatments_df.to_csv(INTERIM_DIR + "/raw/treatments.csv", index=False)
treatments_df

Unnamed: 0,patientunitstayid,offset,Shunting,Intracranial operations,Lumbar Puncture,Neurovascular Procedures,Mechanical Ventilation
0,263556,107,,,,,1.0
1,263556,783,,,,,1.0
2,263556,1418,,,,,1.0
3,263556,4540,,,,,1.0
4,263556,6521,,,,,1.0
...,...,...,...,...,...,...,...
6262,3351831,28,1.0,,,,1.0
6263,3351831,598,1.0,,,,1.0
6264,3351831,1628,1.0,,,,1.0
6265,3351831,2801,1.0,,,,


## 3f) Merging of dynamic readings
---


In [37]:
basic_vitals_df = pd.read_csv(INTERIM_DIR + "/raw/basic_vitals.csv")
nursing_vitals_df = pd.read_csv(INTERIM_DIR + "/raw/nursing_vitals.csv")
drugs_df = pd.read_csv(INTERIM_DIR + "/raw/drugs_infusions_boolean.csv")
labs_df = pd.read_csv(INTERIM_DIR + "/raw/labs.csv")
treatments_df = pd.read_csv(INTERIM_DIR + "/raw/treatments.csv")
with open(INTERIM_DIR + "/patients.txt", "r") as f:
    patients = list(map(int, f.read().strip().split()))

  nursing_vitals_df = pd.read_csv(INTERIM_DIR + "/raw/nursing_vitals.csv")


In [38]:
# Helper merge function
def merge_dynamic(base_df, dfs):
    base_df = base_df.copy()
    observationoffsets = {}
    for patient in patients:
        patient_base_df = base_df[base_df["patientunitstayid"]==patient].copy()
        observationoffsets[patient] = list(patient_base_df["observationoffset"])
    for df in tqdm(dfs):
        df = df.copy()
        df["matched_offset"] = df.apply(lambda x:interpolate_offset(x, observationoffsets), axis=1)
        df = df.groupby("matched_offset").apply(lambda x: x.ffill().bfill()).drop_duplicates("matched_offset")
        base_df = base_df.merge(
            df, 
            how="left", 
            left_on=["patientunitstayid", "observationoffset"],
            right_on=["patientunitstayid", "matched_offset"]
        ).drop(columns=["matched_offset"])
    return base_df

def interpolate_offset(row, observationoffsets):
#     MAXDIFF = 300
    offset = np.nan
    min_diff = 300
    offsets = observationoffsets[row["patientunitstayid"]]
    for po in offsets:
        if abs(po - row["offset"]) < min_diff:
            min_diff = abs(po - offset)
            offset = po
    return offset

In [39]:
# Helper merge function
def merge_dynamic(base_df, dfs):
    
    for patient in tqdm(patients):
        patient_base_df = base_df[base_df["patientunitstayid"]==patient].copy()
        observationoffsets = list(patient_base_df["observationoffset"])
        for df in dfs:
            patient_df = df[df["patientunitstayid"]==patient].copy()
            if not patient_df.empty:
                patient_df["matched_offset"] = patient_df.apply(lambda x:interpolate_offset(x, observationoffsets), axis=1)
                patient_df = patient_df.groupby("matched_offset").apply(lambda x: x.ffill().bfill()).drop_duplicates("matched_offset")
                if patient_df.empty:
                    continue
                patient_base_df = patient_base_df.merge(
                    patient_df, 
                    how="left", 
                    left_on=["patientunitstayid", "observationoffset"],
                    right_on=["patientunitstayid", "matched_offset"]
                ).drop(columns=["matched_offset"])
        patient_dfs.append(patient_base_df)

def interpolate_offset(row, observationoffsets):
#     MAXDIFF = 300
    offset = np.nan
    min_diff = 300
    for po in observationoffsets:
        if abs(po - row["offset"]) < min_diff:
            min_diff = abs(po - offset)
            offset = po
    return offset

In [41]:
# Basic merging
import warnings
warnings.filterwarnings("ignore")

patient_dfs = []
merge_dynamic(basic_vitals_df, [nursing_vitals_df, drugs_df, labs_df, treatments_df])

dynamic_columns = list(basic_vitals_df.columns)
for df in [nursing_vitals_df, drugs_df, labs_df, treatments_df]:
    dynamic_columns.extend(list(df.columns)[2:])
    
patient_dfs2 = []
for df in patient_dfs:
    patient_dfs2.append(df.loc[:,~df.columns.duplicated()].reindex(columns=dynamic_columns))
merged_dynamic_df = pd.concat(patient_dfs2)
merged_dynamic_df.to_csv(INTERIM_DIR + "/dynamic_info_raw.csv", index=False)

100%|█████████████████████████████████████████████████████████████████████████████████| 941/941 [29:10<00:00,  1.86s/it]


In [42]:
merged_dynamic_df = pd.read_csv(INTERIM_DIR + "/dynamic_info_raw.csv")
patients = list(merged_dynamic_df["patientunitstayid"].unique())

In [43]:
# Removing periods with no ICP values, forward-filling in feature values, culmulative number of times (cumsum) of drugs and procedures
def fix_gcs_total(row):
    gcs_total = row['Glasgow coma score - GCS Total']
    if np.isnan(gcs_total):
        return row['Glasgow coma score - Eyes'] + row['Glasgow coma score - Motor'] + row['Glasgow coma score - Verbal']
    return gcs_total

merged_dynamic_df['Glasgow coma score - GCS Total'] = merged_dynamic_df['Glasgow coma score - GCS Total'].replace('Unable to score due to medication', np.nan).astype(float) 
merged_dynamic_df['Glasgow coma score - GCS Total'] = merged_dynamic_df.apply(fix_gcs_total, axis=1)

patient_dfs3 = []
patient_dfs4 = []
for patient in patients:
    df = merged_dynamic_df[merged_dynamic_df["patientunitstayid"]==patient].copy()
    measurements = [
        'temperature', 'sao2',
        'heartrate', 'respiration', 'cvp', 'etco2', 'systemicsystolic',
        'systemicdiastolic', 'systemicmean', 'pasystolic', 'padiastolic',
        'pamean', 'Glasgow coma score - Eyes',
        'Glasgow coma score - GCS Total', 'Glasgow coma score - Motor',
        'Glasgow coma score - Verbal', 'Invasive BP - Invasive BP Diastolic',
        'Invasive BP - Invasive BP Mean', 'Invasive BP - Invasive BP Systolic',
        'Non-Invasive BP - Non-Invasive BP Diastolic',
        'Non-Invasive BP - Non-Invasive BP Mean',
        'Non-Invasive BP - Non-Invasive BP Systolic', 'CVP - CVP',
        'Temperature - Temperature (C)', 'Heart Rate - Heart Rate',
        'Respiratory Rate - Respiratory Rate', 'SpO2 - Value', "sodium", "glucose"
    ]
    drugs = [
        'Sedatives',
        'Paralytics', 'Antihypertensives', 'Vasopressors', 'Opoids',
        'Anticoagulants', 'Anti-epileptics', 'Fluid Resuscitation',
        'Saline_Mannitol'
    ]
    operations = [
        'Shunting', 'Intracranial operations', 'Lumbar Puncture', 
        'Neurovascular Procedures', 'Mechanical Ventilation'
    ]
    
    df.loc[:, measurements] = df[measurements].ffill()
    
    for x in operations + drugs:
        df[x + "_cumsum"] = df[x].fillna(0).cumsum()
    
    df.loc[:, drugs] = df[drugs].ffill(limit=12)
        
    df.loc[(df["icp"] > 100) | (df["icp"] < 0), "icp"] = np.nan
    
    mask = df["icp"].notna()
    a = mask.ne(mask.shift()).cumsum()
    df1 = df[(a.groupby(a).transform('size') <= 6) | mask].copy() # ICP gaps of up to 30 mins
    df1.loc[:, "icp"] = df1["icp"].ffill()
    
    df1["observationoffset"] -= df1.iloc[0, 1]
    patient_dfs3.append(df1)
    df1["offsetdiff"] = df1["observationoffset"].diff()
    
    mask1  = df1["offsetdiff"] == 0
    b = mask1.ne(mask1.shift()).cumsum()
    df2 = df1[(b.groupby(b).transform('size') >= 24) | mask1].copy()
    patient_dfs4.append(df2.drop(columns="offsetdiff"))
    
merged_dynamic_df3 = pd.concat(patient_dfs3)
merged_dynamic_df4 = pd.concat(patient_dfs4)
merged_dynamic_df4["icp"].interpolate(inplace=True)

In [44]:
merged_dynamic_df4.to_csv(INTERIM_DIR + "/dynamic_info_cleaned.csv", index=False)

In [45]:
# percentage missing
(merged_dynamic_df4.isna().sum()) / len(merged_dynamic_df4)

patientunitstayid                              0.000000
observationoffset                              0.000000
temperature                                    0.673937
sao2                                           0.000658
heartrate                                      0.000004
respiration                                    0.019071
cvp                                            0.694631
etco2                                          0.842990
systemicsystolic                               0.267893
systemicdiastolic                              0.267893
systemicmean                                   0.262383
pasystolic                                     0.996839
padiastolic                                    0.996839
pamean                                         0.994490
icp                                            0.000000
Glasgow coma score - Eyes                      0.312108
Glasgow coma score - GCS Total                 0.234315
Glasgow coma score - Motor                     0