In [1]:
# Baseline model with LR

In [2]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import re

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support as prf
from sklearn.metrics import matthews_corrcoef as mcc

In [3]:
# Globals

SEED = 42
samp_size = 5000   # Use the same as the precomputed df to test on

# List of embedding to load from disk (name of embedding model used)
modnames = [
    #'all-distilroberta-v1',
    #'medicalai/ClinicalBERT',
    #'emilyalsentzer/Bio_Discharge_Summary_BERT',
    'nazyrova/clinicalBERT'
    ]

# all-distilroberta-v1                          ## Non-specific
# medicalai/ClinicalBERT                        ## Healthcare-specific
# emilyalsentzer/Bio_Discharge_Summary_BERT     ## MIMIC-III discharge notes
# nazyrova/clinicalBERT                         ## MIIMC-IV discharge notes

truncation_side = 'right' # right middle 
balanced_data = True
summaries = False
withprepended = True

In [4]:
# LR fixed hyperparams
lr_base_kwargs = {
    'penalty': 'elasticnet',
    'max_iter': 1000,
    'l1_ratio': .5,
    'n_jobs': -1,
    'random_state': SEED
}

# LR adjustable and testable params
lr_tol = [.0001]
lr_C = [20.0, 50.0]
lr_solver = [
    #'liblinear',
    #'lbfgs',
    'saga'
    ]

In [5]:
## Project root path
pjpath = ''

# Hacky way of finding the project's root path. Do not rely on this, set your own pjpath!
for p in Path.cwd().parents:
    if p.stem == 'llms4mortality':
        pjpath = p
        break

print(f'> Project path is {pjpath}')

> Project path is /home/daucco/ownCloud-UPM/CBR/llms4mortality


In [6]:
# Set this to your MIMIC-IV path where discharge, patients and admissions tables are located
mimicpath = pjpath / 'data/mimiciv'

In [7]:
# Load precomputed dataframe. Keeps only hadm_id and delta_days_dod (to find patients that died after n days discharge)
# Transform to boolean (patient died within 30 days after discharge)
df = pd.read_csv(mimicpath / f'mimiciv_4_mortality_S{samp_size}{'_balanced' if balanced_data else ''}.csv.gz')[['hadm_id', 'delta_days_dod']]
df['delta_days_dod'] = df['delta_days_dod'].apply(lambda x: x > 0 and x <= 30)  # Only keeps this columns, so there's no need to rename it for eval

# Load precomputed splits
with open(mimicpath / f'hadmid_splits_S{samp_size}{'_balanced' if balanced_data else ''}.json', 'r') as ifile:
    splits_hadmids = json.load(ifile)

# Load sorted hadm_ids from disk
with open(mimicpath / f'hadmid_sorted_S{samp_size}{'_balanced' if balanced_data else ''}.json', 'r') as ifile:
    emb_hadmids = json.load(ifile)['HADM_ID']

In [9]:
# Fits and evaluates models for each type of embeddings

res = pd.DataFrame(columns=['features', 'tol', 'C', 'solver', 'f1_micro', 'f1_macro', 'mcc'])

e_n = len(modnames) * len(lr_tol) * len(lr_C) * len(lr_solver)
e_count = 1
for  i, modname in enumerate(modnames):
    # Load embedding model (need to be pregenerated in disk)
    modname = re.sub('[^a-zA-Z0-9]+', '', modname)
    mod_fname = f'embeddings_{modname}_{'summary_' if summaries else ''}S{samp_size}_T{truncation_side}{'_balanced' if balanced_data else ''}{'_PR' if withprepended else ''}.npy'
    
    print(f'> Loading embeddings from {mod_fname}...')
    embeddings = np.load(mimicpath / mod_fname)

    # Locates train and test data
    hadm2idx = {hadm: i for i, hadm in enumerate(emb_hadmids)}
    X_train = embeddings[[hadm2idx[hadm] for hadm in splits_hadmids['train']]]
    y_train = df.set_index('hadm_id').loc[splits_hadmids['train']]
    X_test = embeddings[[hadm2idx[hadm] for hadm in splits_hadmids['test']]]
    y_test = df.set_index('hadm_id').loc[splits_hadmids['test']]

    for tol in lr_tol:
        for C in lr_C:
            for solver in lr_solver:
                # Fits LR
                print(f'>> EX IDX: {e_count}/{e_n}')
                print(f'> [EX. CONFIG]: model:{modname}\ntol:{tol}, C:{C}, solver:{solver}')
                print(f'> Fitting LR model on samples of shape: {X_train.shape}')
                lr_cla = LogisticRegression(tol=tol, C=C, solver=solver, **lr_base_kwargs).fit(X_train, y_train)

                # Eval LR (prf micro macro, mcc)
                y_pred = lr_cla.predict(X_test)
                
                # Save results to results df
                res.loc[len(res)] = [modname,
                    tol,
                    C,
                    solver,
                    prf(y_test, y_pred, average='micro')[2],
                    prf(y_test, y_pred, average='macro')[2],
                    mcc(y_test, y_pred)]

                e_count += 1

# Export results df to disk
res.to_csv(f'{pjpath}/exps/results/lr_embeddings_S{samp_size}{'_balanced' if balanced_data else ''}.csv', index=False)

> Loading embeddings from embeddings_nazyrovaclinicalBERT_S5000_Tright_balanced_PR.npy...
>> EX IDX: 1/2
> [EX. CONFIG]: model:nazyrovaclinicalBERT
tol:0.0001, C:20.0, solver:saga
> Fitting LR model on samples of shape: (1502, 768)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


>> EX IDX: 2/2
> [EX. CONFIG]: model:nazyrovaclinicalBERT
tol:0.0001, C:50.0, solver:saga
> Fitting LR model on samples of shape: (1502, 768)


