In [1]:
import pandas as pd
from pathlib import Path
import json
from sklearn.model_selection import train_test_split

In [2]:
# Globals

SEED = 42

# Sample data
samp_size = 5000

# Create 4-splits (train, test, validation and case base for cbr)
split_ratios = {
    'cb': .4,
    'train': .3,
    'test': .2,
    'val': .1       
}

# Set to true if forcing case balance per mortality
balanced_data = True

In [3]:
## Project root path
pjpath = ''

# Hacky way of finding the project's root path. Do not rely on this, set your own pjpath!
for p in Path.cwd().parents:
    if p.stem == 'llms4mortality':
        pjpath = p
        break

print(f'> Project path is {pjpath}')

> Project path is /home/daucco/ownCloud-UPM/CBR/llms4mortality


In [4]:
# Set this to your MIMIC-IV path where discharge, patients and admissions tables are located
mimicpath = pjpath / 'data/mimiciv'

In [5]:
# Load data from discharge, patients and admissions tables. Keep only relevant columns
print('> Loading tables...')
df_discharge = pd.read_csv(mimicpath / 'discharge.csv.gz')[['note_id', 'subject_id', 'hadm_id', 'charttime', 'text']]
df_patients = pd.read_csv(mimicpath / 'patients.csv.gz')[['subject_id', 'gender', 'dod', 'anchor_age', 'anchor_year']]
df_admissions = pd.read_csv(mimicpath / 'admissions.csv.gz')[['hadm_id', 'admittime', 'admission_type', 'insurance', 'marital_status', 'race']]
df_drgs = pd.read_csv(mimicpath / 'drgcodes.csv.gz')[['hadm_id', 'description', 'drg_mortality', 'drg_code']]

# Fixes date columns across tables
print('> Fixing date types...')
df_discharge['charttime'] = pd.to_datetime(df_discharge['charttime'], format='%Y-%m-%d %H:%M:%S').dt.date
df_patients['dod'] = pd.to_datetime(df_patients['dod'], format='%Y-%m-%d').dt.date
df_admissions['admittime'] = pd.to_datetime(df_admissions['admittime'].apply(lambda x: x.split(' ')[0]), format='%Y-%m-%d').dt.date # We are only interested in y-m-d

# Fix drg codes

## Descriptions
# Concatenates multiple drg descriptions for the same hadm_id
df_drgs_descs = df_drgs[['hadm_id', 'description']].groupby('hadm_id')['description'].apply(lambda x: ', '.join(x)).to_frame(name='diagnose_group_description')

## Moratility risk rank
df_drgs_mortality = df_drgs[['hadm_id', 'drg_mortality']]
# Helper dic to transform drg_mortality rank into literals
mortality_rank2literal = {
    1: 'LOW',
    2: 'MODERATE',
    3: 'HIGH',
    4: 'VERY HIGH'
}

# Finds invalid ranks
df_drgs_mortality['drg_mortality'] = df_drgs_mortality['drg_mortality'].apply(lambda x: x if x in list(mortality_rank2literal.keys()) else -1)
mortality_rank2literal[-1] = 'UNKOWN'   # Literal for unkown ranks

# Keeps only highest mortality rank per hadm_id
df_drgs_mortality = df_drgs_mortality.groupby('hadm_id')['drg_mortality'].max().to_frame(name='drg_mortality')

# Gets literals
df_drgs_mortality['diagnose_group_mortality'] = df_drgs_mortality['drg_mortality'].apply(lambda x: mortality_rank2literal[x])

# Combines descriptions and processed rank into single frame
df_drgs_m = pd.merge(df_drgs_descs, df_drgs_mortality, 'inner', on='hadm_id')

# Combines all drg codes into a single column (as a list of codes).
df_drgs_code = df_drgs[['hadm_id', 'drg_code']].groupby('hadm_id').agg(list)

# Merges into single frame
df_drgs = pd.merge(df_drgs_m, df_drgs_code, 'inner', on='hadm_id')

> Loading tables...
> Fixing date types...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_drgs_mortality['drg_mortality'] = df_drgs_mortality['drg_mortality'].apply(lambda x: x if x in list(mortality_rank2literal.keys()) else -1)


In [6]:
# Main df generation

# Merge tables and fix index to hadm_id (unique in generated merge)
print('> Merging...')
df = pd.merge(df_discharge, df_patients, on='subject_id', how='inner')
df = pd.merge(df, df_admissions, on='hadm_id', how='inner')
df = pd.merge(df, df_drgs, on='hadm_id', how='inner')
df.set_index('hadm_id', inplace=True)

print('> Resolving additional columns...')
# Resolves age of patient at (each) hospital admission
df['age'] = df['charttime'].apply(lambda x: x.year) - df['anchor_year'] + df['anchor_age']

# 1. Finds hadm_id of cases for which there's a dod registered
#   Assumes that if there's no dod, then dod==NaN!=NaN
hadm_ids_dead = df[df['dod'] == df['dod']].index

# 2. Resolves delta_days_dod: days passed from any given report to patient death where applicable. -1 if no registry of patient death
df['delta_days_dod'] = -1
df['delta_days_dod'].loc[hadm_ids_dead] = df.loc[hadm_ids_dead].apply(lambda e: int((e['dod'] - e['charttime']).days), axis=1)
#df['delta_days_dod'].loc[predeath_note_idx] = df.loc[predeath_note_idx].apply(lambda e: int((e['dod'] - e['charttime']).days), axis=1)

## 3. Drop instances where delta_days_dod == 0. (patient died on arrival (DOA) or shortly after (~same day))
#   In most of these, there's direct mention to the patient's death in the report, hence these are invalid for the mortality prediction task
df = df[~(df['delta_days_dod'] == 0)]

## 5. Balance data if specified
if balanced_data:
    """
        To create a balanced set we consider 3 types of patient profiles depending on when they died according to the mimic registry
            1. Never or later than 365 days after discharge (delta_days_dod == -1 or delta_days_dod > 365)
            2. Within 30 days after discharge (0 < delta_days_dod <= 30)
            3. Within 30 and 365 days after discharge (30 < delta_days_dod <= 365)
        This ensures a uniform proportion of discharge notes in terms of the type of mortality they encompass
    """
    print(f'> Balancing data...')

    # Find the total number of instances of each considered type:
    df_died_after_365 = df[df['delta_days_dod'].apply(lambda x: x == -1 or x > 365)]
    df_died_within_30 = df[df['delta_days_dod'].apply(lambda x: x > 0 and x <= 30)]
    df_died_within_30365 = df[df['delta_days_dod'].apply(lambda x: x > 30 and x <= 365)]

    n_total = len(df)

    n_died_after_365 = len(df_died_after_365)
    n_died_within_30 = len(df_died_within_30)
    n_died_within_30365 = len(df_died_within_30365)

    # Resolves the number of samples of the smaller subset, then samples the rest to this number
    n_smaller = min(n_died_after_365, n_died_within_30, n_died_within_30365)
    df_died_after_365 = df_died_after_365.sample(n=n_smaller, random_state=SEED)
    df_died_within_30 = df_died_within_30.sample(n=n_smaller, random_state=SEED)
    df_died_within_30365 = df_died_within_30365.sample(n=n_smaller, random_state=SEED)

    # Updates df with balanced samples
    df = pd.concat((df_died_after_365, df_died_within_30, df_died_within_30365))

    # Updates samp_size if not enough samples in new balanced set
    samp_size = min(samp_size, 3*n_smaller)

# Fix unkown drg_mortality
df['drg_mortality'] = df['drg_mortality'].fillna(-1)

# Fix missing values on non-essential columns that can have nans (except dod, it might be useful to have an actual nan here)
df.loc[:, df.columns != 'dod'] = df.loc[:, df.columns != 'dod'].fillna('UNKOWN')

# Resets index
df.reset_index(inplace=True)

# Fix numerical types
numeric_columns = ['age', 'drg_mortality']
df = df.astype({cname: int for cname in numeric_columns})

# Take a sample
df_samp = df.sample(samp_size, random_state=SEED)

print('done')

> Merging...
> Resolving additional columns...


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['delta_days_dod'].loc[hadm_ids_dead] = df.loc[hadm_ids_dead].apply(lambda e: int((e['dod'] - e['charttime']).days), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/s

> Balancing data...
done


In [7]:
# We also get dummies from categorical data. This will later be useful to assess similarity out of these values
categorical_columns = ['gender', 'admission_type', 'insurance', 'marital_status', 'race']

df_dummies = df_samp.copy()
for category in categorical_columns:
    df_dummies = pd.get_dummies(df_dummies, prefix=category, prefix_sep='_', columns=[category])

# Special treatment for drg_codes
# First we evaluate data, as it was reimported as strings instead of lists
df_drg = df_dummies.set_index('hadm_id')['drg_code'].to_frame()

# Then we explode values
df_drg = df_drg.explode('drg_code')

# Aggregate categoricals into a single row per hadm_id (index)
df_drg = pd.get_dummies(df_drg, prefix='drg_code', prefix_sep='_', columns=['drg_code']).groupby('hadm_id').sum()

# Drop original drg_code from df and merge with drg_code
df_dummies = df_dummies.drop(columns=['drg_code'])
df_dummies = pd.merge(df_dummies, df_drg, on='hadm_id', how='inner')

In [8]:
print(df.shape)
print(df.columns)
print(df_dummies.shape)
print(df_dummies.columns)
df.head()

(25959, 20)
Index(['hadm_id', 'note_id', 'subject_id', 'charttime', 'text', 'gender',
       'dod', 'anchor_age', 'anchor_year', 'admittime', 'admission_type',
       'insurance', 'marital_status', 'race', 'diagnose_group_description',
       'drg_mortality', 'diagnose_group_mortality', 'drg_code', 'age',
       'delta_days_dod'],
      dtype='object')
(5000, 727)
Index(['hadm_id', 'note_id', 'subject_id', 'charttime', 'text', 'dod',
       'anchor_age', 'anchor_year', 'admittime', 'diagnose_group_description',
       ...
       'drg_code_974', 'drg_code_975', 'drg_code_976', 'drg_code_977',
       'drg_code_981', 'drg_code_982', 'drg_code_983', 'drg_code_987',
       'drg_code_988', 'drg_code_989'],
      dtype='object', length=727)


Unnamed: 0,hadm_id,note_id,subject_id,charttime,text,gender,dod,anchor_age,anchor_year,admittime,admission_type,insurance,marital_status,race,diagnose_group_description,drg_mortality,diagnose_group_mortality,drg_code,age,delta_days_dod
0,23149593,15076868-DS-14,15076868,2125-06-15,\nName: ___ Unit No: ___\n \...,F,NaT,77,2125,2125-06-08,OBSERVATION ADMIT,Medicare,WIDOWED,WHITE,DORSAL & LUMBAR FUSION PROC EXCEPT FOR CURVATU...,1,LOW,"[304, 454]",77,-1
1,25964724,18335259-DS-3,18335259,2176-03-22,\nName: ___ Unit No: ___\...,M,NaT,56,2172,2176-03-20,EW EMER.,Other,MARRIED,WHITE,"DISORDERS OF PANCREAS EXCEPT MALIGNANCY, DISOR...",1,LOW,"[282, 440]",60,-1
2,25340693,18410637-DS-21,18410637,2122-10-26,\nName: ___ Unit No: ___\n ...,M,NaT,69,2122,2122-10-07,OBSERVATION ADMIT,Other,MARRIED,WHITE - RUSSIAN,"RADIOTHERAPY, PERIPH/CRANIAL NERVE & OTHER NER...",4,VERY HIGH,"[692, 40]",69,-1
3,20291958,16328062-DS-16,16328062,2176-04-14,\nName: ___ Unit No: ___\n...,F,NaT,79,2176,2176-04-11,EW EMER.,Medicare,SINGLE,WHITE,"OTHER COMPLICATIONS OF TREATMENT, COMPLICATION...",1,LOW,"[813, 921]",79,-1
4,21828253,14387168-DS-9,14387168,2159-03-16,\nName: ___ Unit No: ___\n \...,F,NaT,37,2159,2159-03-07,URGENT,Medicare,UNKOWN,UNKNOWN,"BIPOLAR DISORDERS, PSYCHOSES",1,LOW,"[753, 885]",37,-1


In [9]:
# Split and export

df_id = f'mimiciv_4_mortality_S{samp_size}{'_balanced' if balanced_data else ''}'

# Split by subject to avoid leakage
subj_cb_train, subj_test_val = train_test_split(df_samp['subject_id'].unique(), train_size=(split_ratios['cb'] + split_ratios['train']), random_state=SEED)

# Second level split (accounts for it in train_size)
subj_cb, subj_train, = train_test_split(subj_cb_train, train_size=(split_ratios['cb'] / (split_ratios['cb']+split_ratios['train'])), random_state=SEED)
subj_test, subj_val, = train_test_split(subj_test_val, train_size=(split_ratios['test'] / (split_ratios['test']+split_ratios['val'])), random_state=SEED)

# Resolves hadm_id from subject_id for each split. These will be the unique indices used later to resolve the entries of each split in the main df
splits_hadmids = {}
for sname, ssubj_idx in zip(('cb', 'train', 'test', 'val'), (subj_cb, subj_train, subj_test, subj_val)):
    splits_hadmids[sname] = df_samp[df_samp['subject_id'].isin(ssubj_idx)]['hadm_id'].to_list()

# Export full sampled dataframes and split info to disk
df_samp.to_csv(mimicpath / f'{df_id}.csv.gz', index=False)
df_dummies.to_csv(mimicpath / f'd_{df_id}.csv.gz', index=False)

with open(mimicpath / f'hadmid_splits_S{samp_size}{'_balanced' if balanced_data else ''}.json', 'w') as ofile:
    json.dump(splits_hadmids, ofile)

# Also exports sorted list of hadm_ids for the sample as a separate json file
with open(mimicpath / f'hadmid_sorted_S{samp_size}{'_balanced' if balanced_data else ''}.json', 'w') as ofile:
    json.dump({'HADM_ID': df_samp['hadm_id'].to_list()}, ofile)