In [1]:
# Helper notebook that generates dummy (one-hot) variables from the categorical columns of the precomputed dataframe

In [2]:
import pandas as pd
from pathlib import Path

In [3]:
# Notebook-wide settings

SEED = 42

# Dataset descriptors. They are interpolated into the file name of the
# precomputed data that gets loaded from disk.
samp_size = 5_000
balanced_data = True

In [4]:
## Project root path
# Hacky way of finding the project's root path. Do not rely on this, set your own pjpath!
# Walks up from the current working directory looking for a folder named
# 'llms4mortality'. Parents are checked first (nearest first); the working
# directory itself is checked last so running from the project root also works.
cwd = Path.cwd()
pjpath = next((p for p in (*cwd.parents, cwd) if p.stem == 'llms4mortality'), None)

if pjpath is None:
    # Keep pjpath a Path (never '') so later `pjpath / ...` joins do not fail
    # with a TypeError; paths then resolve relative to the working directory.
    pjpath = cwd
    print("> WARNING: no 'llms4mortality' folder found; falling back to the current working directory")

print(f'> Project path is {pjpath}')

> Project path is /home/daucco/ownCloud/unsync/_entregabledata/llms4mortality


In [5]:
# Set this to your MIMIC-IV path where discharge, patients and admissions tables are located
# Path(...) accepts pjpath as either a Path or a plain string (e.g. when it was set by hand),
# so the join below does not raise a TypeError for string values.
mimicpath = Path(pjpath) / 'data/mimiciv'

In [6]:
# Load precomputed dataframe.
# The file name is built from samp_size and balanced_data. The suffix is computed
# on its own line because reusing the same quote character inside an f-string
# is a SyntaxError on Python versions older than 3.12.
balanced_suffix = '_balanced' if balanced_data else ''
df_id = f'mimiciv_4_mortality_S{samp_size}{balanced_suffix}.csv.gz'

# hadm_id becomes the index of the loaded frame
df = pd.read_csv(mimicpath / df_id, index_col='hadm_id')

In [7]:
from ast import literal_eval

# One-hot encode the plain categorical columns in a single call; every
# indicator column is named '<source column>_<value>'.
categorical_columns = ['gender', 'admission_type', 'insurance', 'marital_status', 'race']
df = pd.get_dummies(df, prefix=categorical_columns, prefix_sep='_', columns=categorical_columns)

# drg_code needs special handling: the values were reimported from disk as
# strings instead of lists, so parse each one back into a Python object first.
drg_lists = df['drg_code'].apply(literal_eval).to_frame()

# Spread every list over one row per element (the hadm_id index is repeated)
drg_long = drg_lists.explode('drg_code')

# Build the indicator columns, then sum them per hadm_id so that each
# admission collapses back into a single row.
drg_dummies = pd.get_dummies(drg_long, prefix='drg_code', prefix_sep='_', columns=['drg_code'])
df_drg = drg_dummies.groupby('hadm_id').sum()

# Replace the raw drg_code column with its indicator columns
df = pd.merge(df.drop(columns=['drg_code']), df_drg, on='hadm_id', how='inner')

df.shape

(5000, 726)

In [8]:
# Save the encoded frame to disk under the source file name prefixed with 'd_'.
# The index is turned back into a regular column before writing.
out_path = mimicpath / f'd_{df_id}'
df.reset_index().to_csv(out_path, index=False)