# Replication of Èric's notebook

The main difference between his notebook and this one is that I removed the categorical features, which improved the metrics. This notebook also actually sets a random seed, so its results are reproducible :P

In [15]:
import pandas as pd
import numpy as np
import torch
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import CountVectorizer
from category_encoders import TargetEncoder

In [16]:
"""
IMPORTANT!!! You should be setting a random seed!
"""

# Reproducibility: every random number generator used in this notebook is seeded
# with the same value.
# np.random.seed() only accepts seeds in [0, 2**32 - 1]. To be a statistical purist,
# uncomment the next line on the very first run and paste the printed value below:
# print(np.random.randint(0, 2**32 - 1))
SEED = 3508706438  # your own draw will obviously be a different number

# Seed PyTorch first, then NumPy's legacy global generator.
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(SEED)

## 1. Data Loading & Relational Merging
We use Pandas for the initial relational operations (merges, groupbys) as these are inefficient to replicate from scratch in PyTorch. If we wanted to go full on with neural networks we would simply use a different approach altogether.

In [17]:
# Load datasets
# All three CSVs live under the same MIMIC-III folder, so the folder is spelled out once.
MIMIC_DIR = '../data/MIMIC/MIMIC_III_dataset_death'

test_hef = pd.read_csv(f'{MIMIC_DIR}/mimic_test_death.csv')
train_hef = pd.read_csv(f'{MIMIC_DIR}/mimic_train.csv')

# ICD-9 codes are identifiers, not numbers. Reading them explicitly as strings stops
# pandas' type inference from parsing an all-digit code as an integer (which would
# drop leading zeros, e.g. "0389" -> 389) or from producing a mixed int/str column.
# Missing codes are still read as NaN.
extra_diag = pd.read_csv(
    f'{MIMIC_DIR}/extra_data/MIMIC_diagnoses.csv',
    dtype={'ICD9_CODE': str},
)

In [None]:
"""
We can NOT discern which icu stay comes before which other icu stay within the same hospital stay, so we will try to:
    1. Remove all the observations in the training set that share hadm_id with the test set.
    2. Remove all observations in the training set that come from a subject at a later time than that subject's earliest admission in the test set (this is in addition to the previous step)
"""

hadm_ids_test = set(test_hef['hadm_id'])
subject_ids_test = set(test_hef['subject_id'])

# Snapshot of the training set as it was when this cell started, used only to report
# how many rows each filter removes.
train_hef_og = train_hef.copy()

# First option: drop every training row whose hospital admission also appears in the test set.
train_hef = train_hef[~train_hef['hadm_id'].isin(hadm_ids_test)]
print(len(train_hef_og), len(train_hef), '\n')

# Second option: for subjects that also appear in the test set, keep only the training
# rows admitted strictly before that subject's earliest test admission.
# Timestamps are parsed to datetimes before taking the minimum / comparing, so the
# result does not depend on the raw strings happening to sort chronologically.
min_test_admittime = (
    test_hef.assign(min_test_admittime=pd.to_datetime(test_hef['ADMITTIME']))
    .groupby('subject_id')['min_test_admittime']
    .min()
    .reset_index()
)
train_hef = train_hef.merge(min_test_admittime, on='subject_id', how='left')

# A comparison against NaT is False, so subjects absent from the test set are kept
# through the explicit isna() branch.
admitted_before_test = pd.to_datetime(train_hef['ADMITTIME']) < train_hef['min_test_admittime']
subject_not_in_test = train_hef['min_test_admittime'].isna()

# Chained .drop() (no inplace=True): dropping in place on a boolean-filtered frame
# triggers SettingWithCopyWarning on pandas < 3.
train_hef = train_hef[admitted_before_test | subject_not_in_test].drop(columns=['min_test_admittime'])
print(len(train_hef_og), len(train_hef), '\n')

In [None]:
# --- Per-diagnosis helper columns ---------------------------------------------
# Normalise codes to strings but keep missing codes missing. A plain astype(str) can
# turn NaN into the literal token 'nan', which would then be counted as a diagnosis,
# counted as a chapter, and written into DIAG_STRING (the dropna() below would be dead).
icd9_raw = extra_diag['ICD9_CODE']
extra_diag['ICD9_CODE'] = icd9_raw.astype(str).where(icd9_raw.notna())
extra_diag['ICD9_CHAPTER'] = extra_diag['ICD9_CODE'].str[:3]

# Condition flags keyed by ICD-9 code prefix. na=False: a missing code never matches.
icd9_prefix_flags = {
    'IS_SEPSIS': ('9959', '7855'),
    'IS_HEART_FAIL': ('428',),
    'IS_CANCER': ('196', '197', '198', '199'),
    'IS_RENAL': ('584', '585'),
}
for flag_col, prefixes in icd9_prefix_flags.items():
    extra_diag[flag_col] = extra_diag['ICD9_CODE'].str.startswith(prefixes, na=False).astype(int)

# --- One row per hospital admission -------------------------------------------
# Named aggregation yields flat, explicitly named output columns directly.
diag_grouped = (
    extra_diag.groupby('HADM_ID')
    .agg(
        NUM_DIAGNOSES=('ICD9_CODE', 'count'),
        DIAG_STRING=('ICD9_CODE', lambda codes: ' '.join(codes.dropna().astype(str))),
        UNIQUE_CHAPTERS=('ICD9_CHAPTER', 'nunique'),
        HAS_SEPSIS=('IS_SEPSIS', 'max'),
        HAS_HEART_FAIL=('IS_HEART_FAIL', 'max'),
        HAS_CANCER=('IS_CANCER', 'max'),
        HAS_RENAL=('IS_RENAL', 'max'),
    )
    .reset_index()
)

# --- Sanity checks ------------------------------------------------------------
print(extra_diag['HADM_ID'].nunique())
# check whether all hadm_ids in train_hef are present in the diagnoses dataset
print(set(train_hef['hadm_id']).issubset(set(diag_grouped['HADM_ID'])))

# --- Merge features -----------------------------------------------------------
# Any diagnosis columns left over from a previous run of this cell are dropped first,
# so re-running the cell does not create duplicated *_x / *_y columns.
diag_feature_cols = [col for col in diag_grouped.columns if col != 'HADM_ID']
train_hef = (
    train_hef.drop(columns=diag_feature_cols, errors='ignore')
    .merge(diag_grouped, left_on='hadm_id', right_on='HADM_ID', how='left')
    .drop(columns='HADM_ID')
)
test_hef = (
    test_hef.drop(columns=diag_feature_cols, errors='ignore')
    .merge(diag_grouped, left_on='hadm_id', right_on='HADM_ID', how='left')
    .drop(columns='HADM_ID')
)

# --- Fill NaNs created by merge -----------------------------------------------
fill_zeros = ['NUM_DIAGNOSES', 'UNIQUE_CHAPTERS', 'HAS_SEPSIS', 'HAS_HEART_FAIL', 'HAS_CANCER', 'HAS_RENAL', 'DIAG_STRING']
# check whether there actually are NaNs in the merged test and train
print(train_hef[fill_zeros].isna().sum())
print(test_hef[fill_zeros].isna().sum())

# The fill is applied unconditionally: it is a no-op when every admission has
# diagnoses, and it keeps the downstream tensor conversion / CountVectorizer from
# receiving NaN if a different data extract ever has admissions without diagnoses.
diag_fill_values = {col: 0 for col in fill_zeros if col != 'DIAG_STRING'}
diag_fill_values['DIAG_STRING'] = ''
train_hef = train_hef.fillna(diag_fill_values)
test_hef = test_hef.fillna(diag_fill_values)

58976
True
NUM_DIAGNOSES      0
UNIQUE_CHAPTERS    0
HAS_SEPSIS         0
HAS_HEART_FAIL     0
HAS_CANCER         0
HAS_RENAL          0
DIAG_STRING        0
dtype: int64
NUM_DIAGNOSES      0
UNIQUE_CHAPTERS    0
HAS_SEPSIS         0
HAS_HEART_FAIL     0
HAS_CANCER         0
HAS_RENAL          0
DIAG_STRING        0
dtype: int64


## 2. Feature Engineering
We continue using pandas a bit longer for the feature engineering steps, as these are also more straightforward to implement with pandas' rich API.

In [None]:
def engineer_features(df_input):
    """Add age and admission-history features, then drop identifier/leakage columns.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain 'ADMITTIME', 'DOB' and 'subject_id'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_input`` with the same rows, row order and index, plus:

        * ``AGE`` - admission year minus birth year, capped at 90 for values above 89;
          negative values are replaced by the median of the non-negative ages.
        * ``PREV_ICU_STAYS`` - number of earlier rows (by ADMITTIME) of the same
          subject *within this frame*.
        * ``DAYS_SINCE_LAST`` - days since that subject's previous row in this frame,
          or -1 when there is none.

        Every column listed in ``cols_to_drop`` that is present is removed.
    """
    df = df_input.copy()

    # Work on a positional index so the history features can be aligned back to the
    # rows by label even when the caller's index contains duplicates (reindexing on a
    # duplicated index raises). The caller's index is restored before returning.
    original_index = df.index
    df.index = pd.RangeIndex(len(df))

    df['ADMITTIME'] = pd.to_datetime(df['ADMITTIME'])
    df['DOB'] = pd.to_datetime(df['DOB'])

    # Age as a difference of calendar years.
    # NOTE(review): presumably year arithmetic is used because very old patients have
    # heavily shifted birth dates in this dataset - confirm against the data docs.
    age = df['ADMITTIME'].dt.year - df['DOB'].dt.year
    age = age.mask(age > 89, 90)
    # Impute impossible (negative) ages with the median of the valid ages only, so the
    # invalid values do not drag the imputation value down.
    valid_age_median = age[age >= 0].median()
    df['AGE'] = age.mask(age < 0, valid_age_median)

    # Admission history: computed on a chronologically sorted view and assigned back
    # by index alignment, so the row order of df never changes.
    ordered = df.sort_values(by=['subject_id', 'ADMITTIME'])
    by_subject = ordered.groupby('subject_id')
    df['PREV_ICU_STAYS'] = by_subject.cumcount()
    previous_admit = by_subject['ADMITTIME'].shift(1)
    gap_seconds = (ordered['ADMITTIME'] - previous_admit).dt.total_seconds()
    # First row of each subject has no previous admission -> sentinel -1.
    df['DAYS_SINCE_LAST'] = (gap_seconds / (24 * 3600)).fillna(-1)

    cols_to_drop = ['ADMITTIME', 'DOB', 'LAST_ADMIT', 'DISCHTIME', 'DEATHTIME',
                    'DOD', 'LOS', 'Diff', 'MeanBP_Min', 'MeanBP_Max',
                    'MeanBP_Mean', 'hadm_id', 'subject_id']
    df = df.drop(columns=[c for c in cols_to_drop if c in df.columns])

    df.index = original_index
    return df

# Apply the same feature engineering to both splits. engineer_features works on a copy,
# so train_hef and test_hef themselves are left unchanged. Because each split is
# processed on its own, PREV_ICU_STAYS / DAYS_SINCE_LAST only see rows of that split.
train_processed = engineer_features(train_hef)
test_processed = engineer_features(test_hef)

In [None]:
# Separate the label from the features.
label_col = 'HOSPITAL_EXPIRE_FLAG'

# Features: everything except the label (train), and an independent copy of the test frame.
X_train_df = train_processed.drop(columns=label_col)
X_test_df = test_processed.copy()

# Label as a float32 tensor.
y_train = torch.tensor(train_processed[label_col].values, dtype=torch.float32)

## 3. Tensor-Based Preprocessing Pipeline

Here we mostly leave `sklearn` behind: apart from `CountVectorizer` and the `category_encoders` target encoder, we perform transformations explicitly and convert all data to PyTorch tensors. This ensures that in future steps we can easily swap components (like the text processing) for neural network layers.

In [None]:
# Define Column Groups

# Numerical features. Each vital sign is available as a (Min, Max, Mean) triple, so
# those column names are generated; the engineered features are appended afterwards.
_vital_signs = ['HeartRate', 'SysBP', 'DiasBP', 'RespRate', 'TempC', 'SpO2', 'Glucose']
_vital_stats = ['Min', 'Max', 'Mean']

num_cols = [f'{sign}_{stat}' for sign in _vital_signs for stat in _vital_stats]
num_cols += ['PREV_ICU_STAYS', 'AGE', 'DAYS_SINCE_LAST']                  # admission history / age
num_cols += ['NUM_DIAGNOSES', 'UNIQUE_CHAPTERS']                          # diagnosis counts
num_cols += ['HAS_SEPSIS', 'HAS_HEART_FAIL', 'HAS_CANCER', 'HAS_RENAL']   # condition flags

# Categorical features (target-encoded further down).
categorical_cols = [
    'ICD9_diagnosis',
    'DIAGNOSIS',
    'FIRST_CAREUNIT',
    'GENDER',
    'ADMISSION_TYPE',
    'INSURANCE',
    'RELIGION',
    'MARITAL_STATUS',
    'ETHNICITY',
]

# Free-text feature: space-separated ICD-9 codes of the admission.
text_col = 'DIAG_STRING'

In [None]:
"""
Numerical columns: Standard Scaling in PyTorch
"""
# Convert to tensors
X_train_num = torch.tensor(X_train_df[num_cols].values, dtype=torch.float32)
X_test_num = torch.tensor(X_test_df[num_cols].values, dtype=torch.float32)

# Calculate stats on Train only, ignoring missing values.
# With plain .mean()/.std(), a single NaN in a column makes that column's statistics
# NaN and therefore turns the *entire* scaled column into NaN. The NaN-aware version
# below gives the same statistics when nothing is missing (sample std, n - 1 in the
# denominator) and otherwise leaves NaN only in the cells that were actually missing.
observed = ~torch.isnan(X_train_num)
n_observed = observed.sum(dim=0)
mean = torch.nanmean(X_train_num, dim=0)
squared_dev = torch.where(observed, (X_train_num - mean) ** 2, torch.zeros_like(X_train_num))
std = torch.sqrt(squared_dev.sum(dim=0) / (n_observed - 1).clamp(min=1))

# Normalize (epsilon added for stability, e.g. constant columns with std == 0)
epsilon = 1e-7
X_train_num_scaled = (X_train_num - mean) / (std + epsilon)
X_test_num_scaled = (X_test_num - mean) / (std + epsilon)

print(f"Numerical Tensor Shape: {X_train_num_scaled.shape}")

# Categorical columns: Target Encoding -> Tensor
# NOTE(review): the encoder is fitted on the full training labels and applied to the
# same rows, so the encoded training features are optimistic. These tensors are not
# part of the final feature matrix in this notebook.
encoder = TargetEncoder(cols=categorical_cols)
X_train_cat_pandas = encoder.fit_transform(X_train_df[categorical_cols], train_processed['HOSPITAL_EXPIRE_FLAG'])
X_test_cat_pandas = encoder.transform(X_test_df[categorical_cols])

X_train_cat = torch.tensor(X_train_cat_pandas.values, dtype=torch.float32)
X_test_cat = torch.tensor(X_test_cat_pandas.values, dtype=torch.float32)

print(f"Categorical Tensor Shape: {X_train_cat.shape}")

# Text feature: CountVectorizer -> Tensor
# binary=True records presence/absence of each code; the token pattern keeps
# single-character tokens; the vocabulary is capped at vect_max_features terms.
vect_max_features = 800
vectorizer = CountVectorizer(binary=True, token_pattern=r'(?u)\b\w+\b', max_features=vect_max_features)

X_train_text_sparse = vectorizer.fit_transform(X_train_df[text_col])
X_test_text_sparse = vectorizer.transform(X_test_df[text_col])

# .toarray() returns a plain ndarray (todense() returns the legacy np.matrix type).
X_train_text = torch.tensor(X_train_text_sparse.toarray(), dtype=torch.float32)
X_test_text = torch.tensor(X_test_text_sparse.toarray(), dtype=torch.float32)

print(f"Text Tensor Shape: {X_train_text.shape}")

Numerical Tensor Shape: torch.Size([20885, 30])
Categorical Tensor Shape: torch.Size([20885, 9])
Text Tensor Shape: torch.Size([20885, 800])


In [None]:
"""
Count Vectorizer Encoding Example
"""

# Row shown in the example below.
example_row = 12

# The sparse matrix and the tensor are indexed by position, so the text is looked up by
# position too (.iloc). A label lookup ([12]) only shows the same row when the frame's
# index happens to be 0..n-1.
print(X_train_df[text_col].iloc[example_row])
print(X_train_text_sparse[example_row])
display(X_train_text[example_row])

4241 99812 41401 4139 4019 2724 25000 E8790 3051
<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 9 stored elements and shape (1, 800)>
  Coords	Values
  (0, 240)	1
  (0, 73)	1
  (0, 105)	1
  (0, 186)	1
  (0, 267)	1
  (0, 653)	1
  (0, 255)	1
  (0, 253)	1
  (0, 679)	1


tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 

## 4. Feature Union & Model Training
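We concatenate the processed numerical and text tensors (the target-encoded categorical tensor is left out, as explained in the introduction). Since we are replicating Èric's LightGBM model, we convert the final combined tensor back to NumPy for the `.fit()` call.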

In [None]:
# Feature union: numerical block first, then the bag-of-codes block, joined column-wise.
X_train_final = torch.cat((X_train_num_scaled, X_train_text), dim=1)
X_test_final = torch.cat((X_test_num_scaled, X_test_text), dim=1)

print(f"Final Training Data Shape: {X_train_final.shape}")

# Hyper-parameters (the values the author reports as optimal).
# NOTE(review): SEED is larger than 2**31 - 1; confirm LightGBM handles a seed
# outside the signed 32-bit range the way you expect.
params = dict(
    n_estimators=1200,
    learning_rate=0.04,
    num_leaves=60,
    max_depth=-1,
    subsample=1.0,
    scale_pos_weight=10,
    n_jobs=-1,
    random_state=SEED,
    verbose=-1,
)

# LightGBM's sklearn API takes NumPy arrays, so the tensors are converted back here.
train_matrix = X_train_final.numpy()
train_labels = y_train.numpy()

lgbm = LGBMClassifier(**params)
lgbm.fit(train_matrix, train_labels)

Final Training Data Shape: torch.Size([20885, 830])


0,1,2
,boosting_type,'gbdt'
,num_leaves,60
,max_depth,-1
,learning_rate,0.04
,n_estimators,1200
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Prediction: predict_proba returns one column per class; column 1 is kept as the score.
positive_class_column = 1
class_probabilities = lgbm.predict_proba(X_test_final.numpy())
y_proba = class_probabilities[:, positive_class_column]



In [None]:
# Save Results
# One row per test ICU stay: its id plus the predicted probability, in test_hef row order.
submission_df = test_hef[['icustay_id']].assign(prediction=y_proba)

submission_path = '../data/MIMIC/lgbm_replication_submission.csv'
submission_df.to_csv(submission_path, index=False)