In [None]:

from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive.
drive.mount('/content/drive') 

# Base folder on the mounted drive; every later cell builds its input and output paths from it.
directory = "/content/drive/MyDrive/cs598project/data"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas

In [None]:
from datetime import timedelta
from tables.description import Time32Col
from dateutil.relativedelta import relativedelta


# Get last 48 hours of chartevents from ICU stay
def get_48(df, hours=48):
    """Keep only the final ``hours`` hours of chart events for each ICU stay.

    Rows are grouped by (SUBJECT_ID, HADM_ID, ICUSTAY_ID). For each group the
    latest CHARTTIME is found, and only the rows whose CHARTTIME is no earlier
    than ``hours`` hours before that latest time are kept (the boundary itself
    is included).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain SUBJECT_ID, HADM_ID, ICUSTAY_ID and CHARTTIME columns.
        CHARTTIME may hold strings or datetimes. The input is not modified.
    hours : int or float, default 48
        Length of the window, counted back from each stay's latest event.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with CHARTTIME converted to datetimes. Stays appear
        in order of first appearance and rows keep their relative order and
        original index labels. Rows with a missing key value or a missing
        CHARTTIME are not returned. An empty input yields an empty frame with
        the same columns.
    """
    keys = ['SUBJECT_ID', 'HADM_ID', 'ICUSTAY_ID']

    # assign() returns a new frame, so the caller's DataFrame (often a filtered
    # slice of a larger chunk) is never written to.
    events = df.assign(CHARTTIME=pandas.to_datetime(df['CHARTTIME']))
    window = timedelta(hours=hours)

    # A single groupby pass replaces one full-frame scan per stay.
    # sort=False keeps the stays in order of first appearance.
    kept = []
    for _, stay in events.groupby(keys, sort=False):
        cutoff = stay['CHARTTIME'].max() - window
        kept.append(stay[stay['CHARTTIME'] >= cutoff])

    # pandas.concat raises on an empty list (empty input, or no complete keys).
    if not kept:
        return events.iloc[0:0]
    return pandas.concat(kept)


In [None]:
# These ITEMIDs correspond to the chart events of interest described in the paper
# (e.g. weight, height, pH, respiratory rate, body temperature, systolic and
# diastolic blood pressure, capillary refill rate, and the Glasgow Coma Scale eye, verbal and motor response parameters).
# We painstakingly collected the codes from the full D_ITEMS list to the best of our ability and matched them to the above.

# ITEMID codes used to filter CHARTEVENTS rows via `isin`.
# Each code is listed exactly once, in the order it was originally collected.
items = [
    226707, 3580, 3581, 228243, 220734, 220274, 223830, 4202,
    618, 220210, 223761, 223762, 6, 51, 442, 455,
    3313, 3315, 3317, 3319, 3321, 3323, 3325, 6701,
    224167, 227243, 220050, 220179, 225309, 8364, 8368, 8440,
    8441, 8502, 8503, 8504, 8505, 8506, 8507, 8508,
    8555, 227242, 224643, 220051, 220180, 225310, 115, 3348,
    8377, 223951, 224308, 184, 220739, 723, 454, 223900,
    223901, 226755, 226756, 226757, 226758, 227011, 227012, 227013,
    227014, 1352, 1880, 1495, 7262, 6754, 1524, 220603,
    211, 220045, 220181,
]

In [None]:
# CHARTEVENTS is too large to load at once, so it is streamed in chunks of
# 20,000,000 rows. Each chunk is filtered to the ITEMIDs of interest, pruned to
# the last 48 hours per ICU stay, and saved as a fragment for later concatenation.
# A stay can span two chunks, so the window is re-applied after the fragments are merged.
i = 0  # stays 0 if the reader yields no chunks, so the final display below still works
with pandas.read_csv(f"{directory}/CHARTEVENTS.csv", chunksize=20000000, low_memory=False) as reader:
    for i, chunk in enumerate(reader, start=1):
        print(i)
        chunk = chunk[chunk['ITEMID'].isin(items)]
        chunk = get_48(chunk)
        print(chunk.shape)

        # Fragments go to chartevents_segments/, the folder the pass-2 cells
        # read pass_1_chunk_*.csv from.
        chunk.to_csv(
            f"{directory}/chartevents_segments/pass_1_chunk_{i}.csv",
            index=False
            )

# Display the number of chunks processed.
i

1
(3010956, 15)
2
(2523744, 15)
3
(462849, 15)
4
(452921, 15)
5
(463842, 15)
6
(466435, 15)
7
(478495, 15)
8
(484327, 15)
9
(495882, 15)
10
(613325, 15)
11
(803140, 15)
12
(849028, 15)
13
(687340, 15)
14
(240513, 15)
15
(252531, 15)
16
(258522, 15)
17
(136123, 15)


17

In [None]:
# This process took multiple passes: we read in groups of the file fragments written above, concatenated them, re-applied the 48-hour window, and resaved the result.
# The final pruned chartevents table is saved below and used in our next step.
# The chartevents chunks are read in sections and resaved.

In [None]:


# Pass 2, first half: merge the pass-1 fragments for chunks 1 through 9.
# Every chunk number is read exactly once; a hand-written list is easy to get
# wrong (repeating one chunk, skipping another), which silently duplicates or
# drops events.
df = pandas.concat([
    pandas.read_csv(f"{directory}/chartevents_segments/pass_1_chunk_{chunk_no}.csv", low_memory=False)
    for chunk_no in range(1, 10)
])

# Re-apply the 48-hour window now that stays split across chunks are together.
df = get_48(df)
df.to_csv(f"{directory}/chartevents_segments/pass_2_chunk_1-9.csv", index=False)

In [None]:
# Pass 2, second half: merge the pass-1 fragments for chunks 10 through 17,
# re-apply the 48-hour window across them, and save the combined result.
df = pandas.concat([
    pandas.read_csv(f"{directory}/chartevents_segments/pass_1_chunk_{chunk_no}.csv", low_memory=False)
    for chunk_no in range(10, 18)
])

df = get_48(df)
df.to_csv(f"{directory}/chartevents_segments/pass_2_chunk_10-17.csv", index=False)

In [None]:
# Pass 3: load both pass-2 halves, stack them, and apply the 48-hour window
# one last time so each ICU stay is pruned against its overall latest event.
first_half_path = f"{directory}/chartevents_segments/pass_2_chunk_1-9.csv"
second_half_path = f"{directory}/chartevents_segments/pass_2_chunk_10-17.csv"
df1, df2 = (
    pandas.read_csv(path, low_memory=False)
    for path in (first_half_path, second_half_path)
)
df = get_48(pandas.concat([df1, df2]))

In [None]:
# Save the final pruned chartevents table (the current `df`) as chartevents_pruned.csv
# in the data folder; index=False leaves the DataFrame index out of the file.
df.to_csv(f"{directory}/chartevents_pruned.csv", index=False)