In [33]:
import pandas as pd
import numpy as np
import os

# Root folder of the PhysioNet/CinC Challenge 2012 download.
# The historical default is kept, but it is a machine-specific absolute path,
# so it can be overridden without editing the notebook by setting the
# PHYSIONET2012_DIR environment variable before starting the kernel.
DATA_PATH = os.environ.get(
    "PHYSIONET2012_DIR",
    "D:/Local/PhysionetChallenge2012/predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0",
)
# Fraction of time-series rows removed at random further down the notebook.
drop_ratio = 0 # 0, 0.2, 0.4, 0.6, 0.8
# pick a set
# dataset = "set-a"
dataset = "set-b"
# Folder holding the per-patient record files of the chosen set.
data_path = os.path.join(DATA_PATH, dataset)

In [34]:
# Load every per-patient record file into one long list of
# [time, parameter, value, recordid] rows.
txt_all = list()
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of `df` (and everything derived from it) platform dependent.
for f in sorted(os.listdir(data_path)):
    # skip anything that is not a record file (e.g. an index page left by a downloader)
    if not f.lower().endswith('.txt'):
        continue

    with open(os.path.join(data_path, f), 'r') as fp:
        # drop blank lines so they cannot turn into malformed rows
        txt = [line for line in fp.read().splitlines() if line.strip()]

    # a file needs the header line plus at least one data row to be usable
    if len(txt) < 2:
        continue

    # get recordid to add as a column: it is the last field of the first data row
    recordid = int(txt[1].split(',')[-1])

    # txt[0] is the header line of the file and is not kept
    txt_all.extend(line.split(',') + [recordid] for line in txt[1:])


# convert to pandas dataframe
df = pd.DataFrame(txt_all, columns=['time', 'parameter', 'value', 'recordid'])

# extract static variables into a separate dataframe:
# a row is static when it is stamped 00:00 AND names one of these 6 parameters
static_vars = ['RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight']
is_static = (df['time'] == '00:00') & df['parameter'].isin(static_vars)
df_static = df.loc[is_static, :].copy()

# remove these from original df
# (.copy() so the column assignments below act on an independent frame
# instead of a slice of the original)
idxDrop = df_static.index
df = df.loc[~is_static, :].copy()

# to ensure there are no duplicates, group by recordid/parameter and take the last value
# last will be chosen as last row in the loaded file
# there was 1 row in set-b which had 2 weights (70.4, 70.8) and thus required this step
df_static = df_static.groupby(['recordid', 'parameter'])[['value']].last().reset_index()

# pivot on parameter so there is one column per parameter
df_static = df_static.pivot(index='recordid', columns='parameter', values='value')


def _clock_to_minutes(stamp):
    """Convert an 'HH:MM' string into the number of minutes it represents."""
    hours, minutes = stamp.split(':')
    return int(hours) * 60 + int(minutes)


# some conversions on columns for convenience
df['value'] = pd.to_numeric(df['value'], errors='raise')
df['time'] = df['time'].map(_clock_to_minutes)

df_static.head()

parameter,Age,Gender,Height,ICUType,RecordID,Weight
recordid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
142675,70,1,175.3,2,142675,85.0
142676,57,0,-1.0,3,142676,-1.0
142680,60,1,182.9,3,142680,78.8
142683,64,1,180.3,2,142683,145.2
142688,51,1,172.7,4,142688,90.0


In [35]:
# Lookup table: parameter code used in the record files -> readable label with units.
features = {
    'Albumin': 'Serum Albumin (g/dL)',
    'ALP': 'Alkaline phosphatase (IU/L)',
    'ALT': 'Alanine transaminase (IU/L)',
    'AST': 'Aspartate transaminase (IU/L)',
    'Bilirubin': 'Bilirubin (mg/dL)',
    'BUN': 'Blood urea nitrogen (mg/dL)',
    'Cholesterol': 'Cholesterol (mg/dL)',
    'Creatinine': 'Serum creatinine (mg/dL)',
    'DiasABP': 'Invasive diastolic arterial blood pressure (mmHg)',
    'FiO2': 'Fractional inspired O2 (0-1)',
    'GCS': 'Glasgow Coma Score (3-15)',
    'Glucose': 'Serum glucose (mg/dL)',
    'HCO3': 'Serum bicarbonate (mmol/L)',
    'HCT': 'Hematocrit (%)',
    'HR': 'Heart rate (bpm)',
    'K': 'Serum potassium (mEq/L)',
    'Lactate': 'Lactate (mmol/L)',
    'Mg': 'Serum magnesium (mmol/L)',
    'MAP': 'Invasive mean arterial blood pressure (mmHg)',
    'MechVent': 'Mechanical ventilation respiration (0:false or 1:true)',
    'Na': 'Serum sodium (mEq/L)',
    'NIDiasABP': 'Non-invasive diastolic arterial blood pressure (mmHg)',
    'NIMAP': 'Non-invasive mean arterial blood pressure (mmHg)',
    'NISysABP': 'Non-invasive systolic arterial blood pressure (mmHg)',
    'PaCO2': 'partial pressure of arterial CO2 (mmHg)',
    'PaO2': 'Partial pressure of arterial O2 (mmHg)',
    'pH': 'Arterial pH (0-14)',
    'Platelets': 'Platelets (cells/nL)',
    'RespRate': 'Respiration rate (bpm)',
    'SaO2': 'O2 saturation in hemoglobin (%)',
    'SysABP': 'Invasive systolic arterial blood pressure (mmHg)',
    'Temp': 'Temperature (°C)',
    'TroponinI': 'Troponin-I (μg/L)',
    'TroponinT': 'Troponin-T (μg/L)',
    'Urine': 'Urine output (mL)',
    'WBC': 'White blood cell count (cells/nL)',
    'Weight': 'Weight (kg)',
}

In [36]:
# convert static into numeric
for c in df_static.columns:
    df_static[c] = pd.to_numeric(df_static[c])

# The columns edited below receive NaN / fractional values, so make them float
# up front instead of relying on an implicit int -> float upcast on assignment.
for c in ('Age', 'Gender', 'Height', 'Weight'):
    if c in df_static.columns:
        df_static[c] = df_static[c].astype(float)

# preprocess
# Every mask below is computed from the CURRENT content of the column
# (df_static[...] is re-read each time). Reusing a Series captured before the
# edits only works when that Series is a live view of the frame; with
# copy-on-write it is stale and the height corrections would stack on top of
# each other (1.8 -> 180 -> 1800 -> ...).
if 'Age' in df_static.columns:
    # replace anon ages with 91.4
    df_static.loc[df_static['Age'] > 130, 'Age'] = 91.4

if 'Gender' in df_static.columns:
    # negative gender codes are treated as missing
    df_static.loc[df_static['Gender'] < 0, 'Gender'] = np.nan

if 'Height' in df_static.columns:
    # negative heights are treated as missing
    df_static.loc[df_static['Height'] < 0, 'Height'] = np.nan

    # fix incorrectly recorded heights; the rules run in this order and each
    # one sees the result of the previous ones
    height_fixes = [
        (lambda h: h < 10, 100),       # 1.8 -> 180
        (lambda h: h < 25, 10),        # 18 -> 180
        # NOTE(review): labelled "inch -> cm" in the original notebook, but the
        # inch -> cm factor is 2.54; 2.2 is kept so the output does not change
        # - confirm which one is intended.
        (lambda h: h < 100, 2.2),      # 81.8 -> 180
        (lambda h: h > 1000, 0.1),     # 1800 -> 180
        (lambda h: h > 250, 0.3937),   # 400 -> 157
    ]
    for needs_fix, factor in height_fixes:
        height = df_static['Height']
        idx = needs_fix(height)
        df_static.loc[idx, 'Height'] = height[idx] * factor

if 'Weight' in df_static.columns:
    # weights outside [35, 299] are treated as missing
    weight = df_static['Weight']
    df_static.loc[(weight < 35) | (weight > 299), 'Weight'] = np.nan

In [37]:
def delete_value(df, c, value=0):
    """Blank out (set to NaN) every 'value' equal to `value` for parameter `c`.

    `df` must have 'parameter' and 'value' columns. It is modified in place
    and also returned, so calls can be written as `df = delete_value(df, ...)`.
    """
    matches = (df['parameter'] == c) & (df['value'] == value)
    df.loc[matches, 'value'] = np.nan
    return df

def replace_value(df, c, value=np.nan, below=None, above=None):
    """Replace selected measurements of parameter `c` in the long-format frame.

    Rows are selected when 'parameter' equals `c` and, for each bound that is
    given, 'value' < `below` and 'value' > `above`. Passing both bounds selects
    the open interval (above, below); passing neither selects every row of `c`.

    Parameters
    ----------
    df : DataFrame with 'parameter' and 'value' columns; modified in place.
    c : parameter name to match.
    value : replacement. A callable (function, lambda, numpy ufunc,
        functools.partial, ...) is applied to the selected values; anything
        else is written as a constant.
    below, above : optional exclusive bounds on the current value.

    Returns
    -------
    The same `df` object, to allow `df = replace_value(df, ...)`.
    """
    idx = df['parameter'] == c

    if below is not None:
        idx = idx & (df['value'] < below)

    if above is not None:
        idx = idx & (df['value'] > above)

    if callable(value):
        # callable() recognises every kind of callable; matching the word
        # "function" in the type's name missed ufuncs, partials and callable
        # objects, which were then written into the column as if they were data.
        if idx.any():
            # nothing to transform when no row matches; skipping also avoids
            # assigning the result of .apply on an empty selection
            df.loc[idx, 'value'] = df.loc[idx, 'value'].apply(value)
    else:
        df.loc[idx, 'value'] = value

    return df

Apply dynamic data rules.

In [38]:
# -1 in DiasABP is removed outright before the range rules run.
df = delete_value(df, 'DiasABP', -1)

# Cleaning rules as (parameter, replacement, below, above).
# A rule touches rows of `parameter` whose value is < below and/or > above;
# the replacement is either a constant or a function of the old value.
# The rules are applied strictly top to bottom - several of them (PaCO2, PaO2,
# pH, Temp) depend on the result of the rule just before.
cleaning_rules = [
    # blood pressures, invasive
    ('DiasABP', np.nan, 1, None),
    ('DiasABP', np.nan, None, 200),
    ('SysABP', np.nan, 1, None),
    ('MAP', np.nan, 1, None),

    # blood pressures, non-invasive
    ('NIDiasABP', np.nan, 1, None),
    ('NISysABP', np.nan, 1, None),
    ('NIMAP', np.nan, 1, None),

    # heart rate
    ('HR', np.nan, 1, None),
    ('HR', np.nan, None, 299),

    # blood gases: drop values below 1, then scale the remaining small ones by 10
    ('PaCO2', np.nan, 1, None),
    ('PaCO2', lambda x: x*10, 10, None),
    ('PaO2', np.nan, 1, None),
    ('PaO2', lambda x: x*10, 20, None),

    # pH: rescale values in (0.65, 0.8), (65, 80) and (650, 800),
    # then discard whatever is still outside [6.5, 8.0]
    # the order of these steps matters
    ('pH', lambda x: x*10, 0.8, 0.65),
    ('pH', lambda x: x*0.1, 80, 65),
    ('pH', lambda x: x*0.01, 800, 650),
    ('pH', np.nan, 6.5, None),
    ('pH', np.nan, None, 8.0),

    # temperature: values in (1, 10) go through x*9/5+32, values in (95, 113)
    # go through (x-32)*5/9, then anything outside [25, 45] is discarded
    ('Temp', lambda x: x*9/5+32, 10, 1),
    ('Temp', lambda x: (x-32)*5/9, 113, 95),
    ('Temp', np.nan, 25, None),
    ('Temp', np.nan, None, 45),

    ('RespRate', np.nan, 1, None),
    ('WBC', np.nan, 1, None),

    # weight recorded in the time series: keep [35, 299] only
    ('Weight', np.nan, 35, None),
    ('Weight', np.nan, None, 299),
]

for rule_param, rule_value, rule_below, rule_above in cleaning_rules:
    df = replace_value(df, rule_param, value=rule_value, below=rule_below, above=rule_above)

# Randomly remove a fraction `drop_ratio` of the measurement rows.
# The generator is seeded so that the same rows are removed on every run;
# without a seed the exported files differ between executions whenever
# drop_ratio > 0.
DROP_SEED = 0
N = df.shape[0]
print(df.shape)
drop_rng = np.random.default_rng(DROP_SEED)
drop_indices = drop_rng.choice(df.index.to_numpy(), size=round(drop_ratio*N), replace=False)
df = df.drop(drop_indices)
# from here on `df` is indexed by patient id
df = df.set_index("recordid")
print(df.shape)

(1738534, 4)
(1738534, 3)


Create a design matrix X.

In [39]:
# Initialize a dataframe with df_static
X_static = df_static.copy()

X_static.drop('RecordID', axis=1, inplace=True)

# MICU is ICUType==3, and is used as the reference category
X_static['CCU'] = (X_static['ICUType'] == 1).astype(int)
X_static['CSRU'] = (X_static['ICUType'] == 2).astype(int)
X_static['SICU'] = (X_static['ICUType'] == 4).astype(int)
X_static.drop('ICUType', axis=1, inplace=True)

print(X_static.shape)
X_static.head()

(4000, 7)


parameter,Age,Gender,Height,Weight,CCU,CSRU,SICU
recordid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
142675,70.0,1.0,175.3,85.0,0,1,0
142676,57.0,0.0,,,0,0,0
142680,60.0,1.0,182.9,78.8,0,0,0
142683,64.0,1.0,180.3,145.2,0,1,0
142688,51.0,1.0,172.7,90.0,0,0,1


In [40]:
# Build the wide time-series matrix: one row per (recordid, time),
# one column per measured parameter.
feats = ['Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol', 'Creatinine',
    'DiasABP', 'FiO2', 'GCS', 'Glucose', 'HCO3', 'HCT', 'HR', 'K', 'Lactate', 'Mg', 'MAP', 'MechVent',
    'Na', 'NIDiasABP', 'NIMAP', 'NISysABP', 'PaCO2', 'PaO2', 'pH', 'Platelets', 'RespRate',
    'SaO2', 'SysABP', 'Temp', 'TroponinI', 'TroponinT', 'Urine', 'WBC', 'Weight'
]

# long format with recordid back as a regular column
# (relies on `df` being indexed by "recordid", as set when the rows were dropped)
long_rows = df.reset_index()

# rows of the result, in order of first appearance in `df`
row_order = pd.MultiIndex.from_frame(
    long_rows[['recordid', 'time']].drop_duplicates()
)

# parameters present in the data but missing from `feats` become extra columns
# at the end, exactly as cell-by-cell assignment would have created them
extra_feats = [p for p in long_rows['parameter'].unique() if p not in feats]

# Vectorized replacement for filling the matrix one cell at a time with
# iterrows(): when the same (recordid, time, parameter) occurs more than once
# the LAST row wins - including when that last value is NaN - which is what
# sequential assignment produced.
X_time = (
    long_rows
    .drop_duplicates(subset=['recordid', 'time', 'parameter'], keep='last')
    .set_index(['recordid', 'time', 'parameter'])['value']
    .unstack('parameter')
    .reindex(index=row_order, columns=feats + extra_feats)
)
# unstack names the column axis after the pivoted level; the matrix has plain columns
X_time.columns.name = None

print(X_time.shape)
X_time.head()

(295167, 37)


Unnamed: 0_level_0,Unnamed: 1_level_0,Albumin,ALP,ALT,AST,Bilirubin,BUN,Cholesterol,Creatinine,DiasABP,FiO2,...,Platelets,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight
recordid,time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
142675,44,,,,,,,,,,,...,,,,,,,,,,
142675,78,,,,,,,,,,,...,,,,,,,,400.0,,
142675,93,,,,,,,,,60.0,,...,,,,104.0,,,,,,
142675,99,,,,,,,,,,1.0,...,,,,,,,,,,
142675,108,,,,,,,,,71.0,,...,,,,116.0,35.7,,,200.0,,


In [41]:
# Flatten the (recordid, time) index into columns so the rows can be grouped
# per patient; the group labels are the record ids that have time-series rows.
X_time = X_time.reset_index(drop=False)
gp = X_time.groupby("recordid")
X_time_keys = gp.groups.keys()
# leave X_time indexed by recordid only; `time` stays as a regular column
X_time = X_time.set_index(keys="recordid")

In [42]:
# load in outcomes
if dataset == 'set-a':
    output_path = os.path.join(DATA_PATH, 'Outcomes-a.txt')
elif dataset == 'set-b':
    output_path = os.path.join(DATA_PATH, 'Outcomes-b.txt')
y = pd.read_csv(output_path)
    
y.set_index('RecordID', inplace=True)
y.index.name = 'recordid'
y = y.loc[X_time_keys]
print(y.shape)
y.head()

(3993, 5)


Unnamed: 0_level_0,SAPS-I,SOFA,Length_of_stay,Survival,In-hospital_death
recordid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
142675,27,14,9,7,1
142676,12,1,31,468,0
142680,12,7,17,16,1
142683,19,15,17,-1,0
142688,3,0,9,-1,0


In [43]:
# output to file
processed_path = "data/"
# exist_ok avoids the separate existence check and uses the same path variable
# for creating the folder and for writing into it
os.makedirs(processed_path, exist_ok=True)

# one CSV per table, named <dataset>_<table>_<drop_ratio>.csv, index included
for table, table_name in ((X_static, "static"), (X_time, "time"), (y, "y")):
    table.to_csv(
        os.path.join(processed_path, f"{dataset}_{table_name}_{drop_ratio}.csv"),
        sep=',',
        index=True,
    )