# Q2.1 Classic Machine Learning Methods

## Pre Task

In [4]:
import pandas as pd
import numpy as np
from project_1.config import PROCESSED_DATA_DIR, PROJ_ROOT

SEED = 42

In [5]:
# Read every processed split (a, b, c) from its Parquet file into one dict,
# keyed as "set_a", "set_b", "set_c".
sets = ["a", "b", "c"]
sets_dict = {
    f"set_{suffix}": pd.read_parquet(PROCESSED_DATA_DIR / f"set_{suffix}_final.parquet")
    for suffix in sets
}

# Sanity check: report the dimensions of set A and preview its first rows.
print(sets_dict["set_a"].shape)
sets_dict["set_a"].head()


(196000, 43)


Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,Urine,WBC,pH,MechVent,TroponinT,ALP,ALT,AST,Bilirubin,TroponinI
0,132539.0,2025-03-10 00:00:00,0.0,-0.950365,-0.231946,-0.583612,1.452191,-0.133234,-0.810049,-0.037519,...,0.923077,0.318841,-0.25,0.0,1.314286,0.189655,-0.307692,0.037037,1.166667,0.857143
1,132539.0,2025-03-10 01:00:00,0.0,-0.950365,-0.231946,-0.583612,0.171208,-0.693421,0.634509,-2.669841,...,3.076923,0.608696,-0.625,0.0,14.571429,-0.5,1.557692,-0.240741,0.416667,-0.214286
2,132539.0,2025-03-10 02:00:00,0.0,-0.950365,-0.231946,-0.583612,1.025197,0.856429,0.937401,-0.256879,...,-0.384615,0.434783,0.0,0.0,0.542857,2.12069,-0.461538,1.777778,0.083333,-0.178571
3,132539.0,2025-03-10 03:00:00,0.0,-0.950365,-0.231946,-0.583612,-0.398117,2.761064,0.587911,-0.037519,...,0.692308,-0.594203,0.625,0.0,2.942857,0.534483,51.769231,-0.259259,22.75,0.357143
4,132539.0,2025-03-10 04:00:00,0.0,-0.950365,-0.231946,-0.583612,-0.540449,-0.002524,0.937401,0.181842,...,-0.153846,-0.028986,0.125,0.0,0.857143,-0.224138,-0.096154,-0.018519,1.416667,0.785714


Note: the `ICUType` column still has to be removed (this is done in Task 1 below).

In [6]:

# Outcome files, one per split (a, b, c)
file_names = ["Outcomes-a.txt", "Outcomes-b.txt", "Outcomes-c.txt"]

# Directory that contains the raw PhysioNet challenge files
base_path = PROJ_ROOT / "data" / "data_1" / "predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0"

# Read files into DataFrames containing all outcome variables
outcomes_a, outcomes_b, outcomes_c = [pd.read_csv(base_path / name) for name in file_names]

# Keep only the patient identifier and the mortality label
death_a, death_b, death_c = [df[["RecordID", "In-hospital_death"]] for df in [outcomes_a, outcomes_b, outcomes_c]]

# Check for missing values in the outcome data (one summary per split).
# The former mid-cell `death_a.head()` was removed: a bare expression that is
# not the last statement of a cell is never displayed, so it did nothing.
for deaths in (death_a, death_b, death_c):
    print(deaths.isnull().sum())



RecordID             0
In-hospital_death    0
dtype: int64
RecordID             0
In-hospital_death    0
dtype: int64
RecordID             0
In-hospital_death    0
dtype: int64


In [7]:
# Verify that the outcome label is strictly binary (0/1) in every split and
# show the class balance.
label_col = "In-hospital_death"
for name, df in {"a": death_a, "b": death_b, "c": death_c}.items():
    labels = df[label_col]
    print(f"\nValue counts for death_{name}:\n")
    print(labels.value_counts())
    print("\nContains only 0 and 1:", labels.isin([0, 1]).all())



Value counts for death_a:

In-hospital_death
0    3446
1     554
Name: count, dtype: int64

Contains only 0 and 1: True

Value counts for death_b:

In-hospital_death
0    3432
1     568
Name: count, dtype: int64

Contains only 0 and 1: True

Value counts for death_c:

In-hospital_death
0    3415
1     585
Name: count, dtype: int64

Contains only 0 and 1: True


So a value of 1 means the patient died in hospital. We are going to fit a classifier to predict in-hospital death.

## Task 1 - Prepare Data Sets (train_set, val_set, test_set)

In [8]:
# Define the sets you want to modify
sets = ["set_a", "set_b", "set_c"]

# Remove the ICUType column from each set.
# The dict entry is rebound to a new frame (no `inplace=True`) and
# errors="ignore" is used so the cell is idempotent: running it a second time,
# when the column is already gone, no longer raises a KeyError.
for set_name in sets:
    sets_dict[set_name] = sets_dict[set_name].drop(columns=["ICUType"], errors="ignore")

# Check that the column was deleted by showing all the columns of set_a
sets_dict["set_a"].columns

Index(['RecordID', 'Time', 'Gender', 'Height', 'Weight', 'Age', 'Albumin',
       'Cholesterol', 'DiasABP', 'HCO3', 'HCT', 'HR', 'Mg', 'MAP', 'Na',
       'NIDiasABP', 'NIMAP', 'NISysABP', 'SysABP', 'PaCO2', 'PaO2',
       'Platelets', 'RespRate', 'Temp', 'BUN', 'Creatinine', 'FiO2', 'GCS',
       'Glucose', 'K', 'Lactate', 'SaO2', 'Urine', 'WBC', 'pH', 'MechVent',
       'TroponinT', 'ALP', 'ALT', 'AST', 'Bilirubin', 'TroponinI'],
      dtype='object')

In [9]:
# Set A is used for training, set B for validation and set C for testing.
train_set, val_set, test_set = (
    sets_dict[key] for key in ("set_a", "set_b", "set_c")
)


In [10]:

# Cast RecordID to int32 in all three splits (it is loaded as a float).
for split_frame in (train_set, val_set, test_set):
    split_frame["RecordID"] = split_frame["RecordID"].astype("int32")


In [11]:
# Attach the in-hospital mortality label to every row of each split.
#
# Two safeguards compared with a plain `merge`:
# * any label column left over from a previous run of this cell is dropped
#   first, so re-running the cell does not create `In-hospital_death_x` /
#   `In-hospital_death_y` columns (the cell is idempotent);
# * validate="many_to_one" makes pandas raise a MergeError if an outcome table
#   contains a RecordID more than once, instead of silently duplicating rows.
train_set = train_set.drop(columns=["In-hospital_death"], errors="ignore").merge(
    death_a, on="RecordID", validate="many_to_one"
)
val_set = val_set.drop(columns=["In-hospital_death"], errors="ignore").merge(
    death_b, on="RecordID", validate="many_to_one"
)
test_set = test_set.drop(columns=["In-hospital_death"], errors="ignore").merge(
    death_c, on="RecordID", validate="many_to_one"
)

# Check if the merge was successful by showing the first 5 rows of the training set
train_set.head()

Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,WBC,pH,MechVent,TroponinT,ALP,ALT,AST,Bilirubin,TroponinI,In-hospital_death
0,132539,2025-03-10 00:00:00,0.0,-0.950365,-0.231946,-0.583612,1.452191,-0.133234,-0.810049,-0.037519,...,0.318841,-0.25,0.0,1.314286,0.189655,-0.307692,0.037037,1.166667,0.857143,0
1,132539,2025-03-10 01:00:00,0.0,-0.950365,-0.231946,-0.583612,0.171208,-0.693421,0.634509,-2.669841,...,0.608696,-0.625,0.0,14.571429,-0.5,1.557692,-0.240741,0.416667,-0.214286,0
2,132539,2025-03-10 02:00:00,0.0,-0.950365,-0.231946,-0.583612,1.025197,0.856429,0.937401,-0.256879,...,0.434783,0.0,0.0,0.542857,2.12069,-0.461538,1.777778,0.083333,-0.178571,0
3,132539,2025-03-10 03:00:00,0.0,-0.950365,-0.231946,-0.583612,-0.398117,2.761064,0.587911,-0.037519,...,-0.594203,0.625,0.0,2.942857,0.534483,51.769231,-0.259259,22.75,0.357143,0
4,132539,2025-03-10 04:00:00,0.0,-0.950365,-0.231946,-0.583612,-0.540449,-0.002524,0.937401,0.181842,...,-0.028986,0.125,0.0,0.857143,-0.224138,-0.096154,-0.018519,1.416667,0.785714,0


## Task 2 - Perform Aggregation on the Sets

In [12]:
# Define aggregation rules
# Maps each column to the pandas reduction used to collapse all rows of one
# patient (one RecordID) into a single row via groupby(...).agg(...).
# Notes:
# * "last" in a pandas groupby returns the last non-null entry of the group;
#   it relies on the rows of each patient already being in chronological order.
# * The key order of this dict determines the column order of the aggregated
#   frames, and therefore the feature order seen by the models.
aggregation_rules = {
    "Age": "last",
    "Gender": "last",
    "Height": "last",
    "Albumin": "last",
    "ALP": "last",
    "ALT": "last",
    "AST": "last",
    "Bilirubin": "last",
    "BUN": "last",
    "Cholesterol": "last",
    "Creatinine": "last",
    "DiasABP": "mean",
    "FiO2": "mean",
    "GCS": "min",  # lowest value observed for the patient
    "Glucose": "mean",
    "HCO3": "last",
    "HCT": "last",
    "HR": "mean",
    "K": "last",
    "Lactate": "max",  # highest value observed for the patient
    "Mg": "last",
    "MAP": "mean",
    "MechVent": "last",
    "Na": "last",
    "NIDiasABP": "mean",
    "NIMAP": "mean",
    "NISysABP": "mean",
    "PaCO2": "last",
    "PaO2": "mean",
    "pH": "last",
    "Platelets": "last",
    "RespRate": "mean",
    "SaO2": "mean",
    "SysABP": "mean",
    "Temp": "max",
    "TroponinI": "max",
    "TroponinT": "max",
    "Urine": "sum",  # total over all rows; NOTE(review): values look scaled upstream, so the sum can be negative
    "WBC": "last",
    "Weight": "last",
    "In-hospital_death": "max"  # If any 1 exists for a patient, return 1
}

# Collapse every split to one row per patient using the rules defined above.
train_aggregated, val_aggregated, test_aggregated = (
    frame.groupby("RecordID").agg(aggregation_rules).reset_index()
    for frame in (train_set, val_set, test_set)
)

# Preview the aggregated training set
train_aggregated.head()

Unnamed: 0,RecordID,Age,Gender,Height,Albumin,ALP,ALT,AST,Bilirubin,BUN,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,In-hospital_death
0,132539,-0.583612,0.0,-0.950365,-1.252106,-0.586207,-0.461538,1.259259,-0.25,-0.588235,...,-0.322617,-0.979592,-0.152341,2.504504,9.714286,16.8,35.615385,-0.304348,-0.231946,0
1,132540,0.669324,1.0,0.618599,-1.252106,-0.586207,-0.461538,1.259259,-0.25,0.176471,...,-0.003777,-1.285714,-0.008614,2.504504,9.714286,16.8,26.985897,0.26087,-0.234578,0
2,132541,-1.153129,0.0,-0.429179,-1.109774,0.5,0.807692,2.462963,1.666667,-0.882353,...,-0.003777,-2.142857,-0.033771,2.504504,9.714286,16.8,21.969231,-0.768116,-1.081104,0
3,132543,0.213711,1.0,1.159248,1.879185,0.5,-0.403846,-0.296296,-0.5,-0.470588,...,-0.712056,-0.979592,-0.152341,2.504504,9.714286,16.8,135.25,-0.521739,0.142631,0
4,132545,1.352744,0.0,-1.252047,0.31354,-0.586207,-0.461538,1.259259,-0.25,0.411765,...,-0.04894,-0.979592,-0.152341,2.504504,9.714286,16.8,-4.3,-0.971014,-0.812672,0


# Model 1 - Logistic Regression

In [13]:
# Build the feature matrix (X) and the label vector (y) for each split.
# The patient identifier and the label itself are not features.
id_and_target = ["RecordID", "In-hospital_death"]

X_train, y_train = train_aggregated.drop(columns=id_and_target), train_aggregated["In-hospital_death"]
X_val, y_val = val_aggregated.drop(columns=id_and_target), val_aggregated["In-hospital_death"]
X_test, y_test = test_aggregated.drop(columns=id_and_target), test_aggregated["In-hospital_death"]

# Report the dimensions of the training data
print(X_train.shape, y_train.shape)


(4000, 40) (4000,)


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

# Fit a logistic-regression baseline on the aggregated training features.
clf = LogisticRegression(max_iter=1000, random_state=SEED)
clf.fit(X_train, y_train)

# Predicted probability of the positive class (death) on both held-out splits.
y_valid_proba = clf.predict_proba(X_val)[:, 1]
y_test_proba = clf.predict_proba(X_test)[:, 1]

# Validation metrics.
roc_auc_valid, auprc_valid = (
    roc_auc_score(y_val, y_valid_proba),
    average_precision_score(y_val, y_valid_proba),
)
print(f"Validation ROC AUC: {roc_auc_valid:.3f}, AUPRC: {auprc_valid:.3f}")

# Test metrics.
roc_auc_test, auprc_test = (
    roc_auc_score(y_test, y_test_proba),
    average_precision_score(y_test, y_test_proba),
)
print(f"Test ROC AUC: {roc_auc_test:.3f}, AUPRC: {auprc_test:.3f}")

Validation ROC AUC: 0.786, AUPRC: 0.408
Test ROC AUC: 0.758, AUPRC: 0.368


# Model 2 - Random Forest

In [15]:
# Use Random Forest to predict the target
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyper-parameters (seeded for reproducibility).
clf = RandomForestClassifier(random_state=SEED)
clf.fit(X_train, y_train)

# Predicted probability of the positive class (death) on both held-out splits.
y_valid_proba = clf.predict_proba(X_val)[:, 1]
y_test_proba = clf.predict_proba(X_test)[:, 1]

# Validation metrics.
roc_auc_valid, auprc_valid = (
    roc_auc_score(y_val, y_valid_proba),
    average_precision_score(y_val, y_valid_proba),
)
print(f"Validation ROC AUC: {roc_auc_valid:.3f}, AUPRC: {auprc_valid:.3f}")

# Test metrics.
roc_auc_test, auprc_test = (
    roc_auc_score(y_test, y_test_proba),
    average_precision_score(y_test, y_test_proba),
)
print(f"Test ROC AUC: {roc_auc_test:.3f}, AUPRC: {auprc_test:.3f}")

Validation ROC AUC: 0.781, AUPRC: 0.421
Test ROC AUC: 0.739, AUPRC: 0.392


## A little comment on the results:

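The ROC AUC is fairly good for both models (about 0.74–0.79), but the AUPRC is much lower (about 0.37–0.42). The most likely reason is class imbalance: class 0 (survived) is roughly 6 times larger than class 1 (died), so only about 14% of the patients are positive. AUPRC is sensitive to this, because a random classifier would score an AUPRC close to the positive rate (about 0.14), so the models are still clearly better than chance. All three sets share almost the same imbalance, so the scores are at least comparable across train, validation and test. Possible improvements are a dimensionality reduction technique or some feature engineering.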

# Q2.1 - Feature Engineering

We could use solutions such as *_Feature Lagging_*, *_Temporal Differences_* or *_Rolling Statistics_* to better capture the time-series trends.

## Attempt 1 - Feature Lagging

Start by selecting the most prominent features: the ones whose average per-patient lag-1 autocorrelation is highest in absolute value.

In [16]:
def compute_patientwise_avg_acf(df, feature, lag=1):
    """
    Compute the average autocorrelation of a feature across patients.

    For every patient (rows sharing the same "RecordID") the Pearson
    correlation between the feature and its own values shifted by `lag` rows is
    computed; the per-patient values are then averaged.

    Patients are skipped when the autocorrelation is undefined:
    fewer than two (value, lagged value) pairs, or a constant series on either
    side. Skipping them up front avoids the NumPy "invalid value encountered in
    divide" RuntimeWarnings raised by a zero standard deviation, and avoids
    spurious correlations produced by floating-point rounding on constant data.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data with a "RecordID" column and the `feature` column.
        Rows of each patient are assumed to be in chronological order.
    feature : str
        Name of the column to analyse.
    lag : int, default 1
        Number of rows to shift by.

    Returns
    -------
    float or None
        Mean of the per-patient autocorrelations, or None when no patient
        yields a defined value.
    """
    acf_values = []
    for _, group in df.groupby("RecordID"):
        series = group[feature].dropna()
        lagged = series.shift(lag)

        # Keep only the positions where both the value and its lag exist.
        has_pair = lagged.notna()
        current, previous = series[has_pair], lagged[has_pair]

        # Correlation needs at least two pairs and non-zero variance on both sides.
        if len(current) < 2:
            continue
        if current.nunique() < 2 or previous.nunique() < 2:
            continue

        acf_val = current.corr(previous)
        if pd.notna(acf_val):
            acf_values.append(acf_val)

    if len(acf_values) > 0:
        return np.mean(acf_values)
    return None

#! train_set is used on purpose: it still holds the per-time-step rows (before aggregation).
# Candidate features are all columns except the identifier, the timestamp,
# the outcome label and the static patient descriptors.
non_dynamic_columns = ["RecordID", "Time", "In-hospital_death", "Age", "Weight", "Height", "Gender"]
candidate_features = [col for col in train_set.columns if col not in non_dynamic_columns]

# Minimum absolute lag-1 autocorrelation a feature needs to be selected
threshold = 0.5

# Score every candidate, then keep the ones that reach the threshold.
acf_scores = [
    (name, compute_patientwise_avg_acf(train_set, name, lag=1))
    for name in candidate_features
]
selected_features = [
    name for name, score in acf_scores
    if score is not None and abs(score) >= threshold
]

print("Features selected for lag augmentation:", selected_features)

  c /= stddev[:, None]
  c /= stddev[None, :]


Features selected for lag augmentation: ['HCT', 'HR', 'PaO2', 'Temp', 'BUN', 'Creatinine', 'FiO2', 'GCS', 'WBC']


Check the number of NaN values in the entire train_set (as example). The number is 0 because we have a filled data set.

In [17]:
# Total count of missing cells over the whole training frame
print(train_set.isna().sum().sum())

0


Then add the lag columns for those selected features.
WARNING! This operation creates NaN values inside the DataFrame

In [18]:
def add_lag_features_for_selected(df, selected_features, lags=(1, 2)):
    """
    Add patient-wise lag columns for the given features.

    For each feature in `selected_features` and each value in `lags`, a column
    named "<feature>_lag<lag>" is added. It holds the feature value found `lag`
    rows earlier for the same patient (same "RecordID"). The first `lag` rows of
    every patient have no predecessor and are therefore NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing a "RecordID" column. It is not modified.
        Rows of each patient are assumed to be in chronological order.
    selected_features : iterable of str
        Columns to lag. Names that are not columns of the frame are skipped.
    lags : iterable of int, default (1, 2)
        Row offsets to generate. The default is an immutable tuple, so it
        cannot be altered by accident between calls.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the lag columns appended.
    """
    df_augmented = df.copy()
    for feature in selected_features:
        if feature not in df_augmented.columns:
            continue
        # Group once per feature; the grouping does not depend on the lag.
        feature_by_patient = df_augmented.groupby("RecordID")[feature]
        for lag in lags:
            df_augmented[f"{feature}_lag{lag}"] = feature_by_patient.shift(lag)
    return df_augmented

# Augment train, validation, and test sets with lag features using the selected_features list.
train_set_aug = add_lag_features_for_selected(train_set, selected_features, lags=[1,2])
valid_set_aug = add_lag_features_for_selected(val_set, selected_features, lags=[1,2])
test_set_aug  = add_lag_features_for_selected(test_set,  selected_features, lags=[1,2])

# Print shapes
print(train_set_aug.shape, valid_set_aug.shape, test_set_aug.shape)

# Rows with NaN lag values are deliberately kept at this stage; the abandoned
# row-level `dropna()` experiment that used to sit here as a string literal
# (dead code with no effect) has been removed.
train_set_aug.head()

(196000, 61) (196000, 61) (196000, 61)


Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,BUN_lag1,BUN_lag2,Creatinine_lag1,Creatinine_lag2,FiO2_lag1,FiO2_lag2,GCS_lag1,GCS_lag2,WBC_lag1,WBC_lag2
0,132539,2025-03-10 00:00:00,0.0,-0.950365,-0.231946,-0.583612,1.452191,-0.133234,-0.810049,-0.037519,...,,,,,,,,,,
1,132539,2025-03-10 01:00:00,0.0,-0.950365,-0.231946,-0.583612,0.171208,-0.693421,0.634509,-2.669841,...,0.0,,0.75,,2.5,,0.166667,,0.318841,
2,132539,2025-03-10 02:00:00,0.0,-0.950365,-0.231946,-0.583612,1.025197,0.856429,0.937401,-0.256879,...,-0.235294,0.0,0.375,0.75,2.5,2.5,0.166667,0.166667,0.608696,0.318841
3,132539,2025-03-10 03:00:00,0.0,-0.950365,-0.231946,-0.583612,-0.398117,2.761064,0.587911,-0.037519,...,-0.647059,-0.235294,-0.25,0.375,0.5,2.5,0.166667,0.166667,0.434783,0.608696
4,132539,2025-03-10 04:00:00,0.0,-0.950365,-0.231946,-0.583612,-0.540449,-0.002524,0.937401,0.181842,...,0.352941,-0.647059,1.875,-0.25,1.5,0.5,0.166667,0.166667,-0.594203,0.434783


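+18 features in total (9 selected features × 2 lags; the frame goes from 43 to 61 columns). Now print the number of NaN values again: it has increased because the shift leaves NaN in the first row of each patient for every lag-1 column and in the first two rows for every lag-2 column (4000 patients × 9 features × 3 NaN = 108000).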

In [19]:
# Total count of missing cells in each augmented split (train, validation, test)
print(*(frame.isna().sum().sum() for frame in (train_set_aug, valid_set_aug, test_set_aug)))

108000 108000 108000


## Perform Aggregation on the Augmented DataFrames

By using "last" for the newly inserted columns, the NaN values should disappear.

Check if there are patients which have their last row with a NaN value

In [20]:
# Show the patients (RecordID) whose final time point still contains NaN values.
# Step 1: take the last row of every patient.
last_rows = train_set_aug.groupby("RecordID").tail(1)

# Step 2: keep only the rows where at least one column is missing.
has_nan = last_rows.isna().any(axis="columns")
missing_last = last_rows.loc[has_nan]

# Step 3: display the complete rows of those patients.
print("Patients with NaN values in their last row:")
missing_last


Patients with NaN values in their last row:


Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,BUN_lag1,BUN_lag2,Creatinine_lag1,Creatinine_lag2,FiO2_lag1,FiO2_lag2,GCS_lag1,GCS_lag2,WBC_lag1,WBC_lag2


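In this run the table above is empty: no patient has NaN values in their last row. As an example, let's explore the data of patient 133628, where the NaN values appear only in the first two rows (they are introduced by the lag shift).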

In [21]:
# Display every time step recorded for the patient with RecordID 133628
train_set_aug.loc[train_set_aug["RecordID"].eq(133628)]

Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,BUN_lag1,BUN_lag2,Creatinine_lag1,Creatinine_lag2,FiO2_lag1,FiO2_lag2,GCS_lag1,GCS_lag2,WBC_lag1,WBC_lag2
20972,133628,2025-03-10 00:00:00,1.0,0.448835,-0.517923,1.352744,1.452191,-0.133234,-0.810049,-0.037519,...,,,,,,,,,,
20973,133628,2025-03-10 01:00:00,1.0,0.448835,-0.517923,1.352744,0.171208,-0.693421,0.634509,-2.669841,...,0.0,,0.75,,2.5,,0.166667,,0.318841,
20974,133628,2025-03-10 02:00:00,1.0,0.448835,-0.517923,1.352744,1.025197,0.856429,0.937401,-0.256879,...,-0.235294,0.0,0.375,0.75,2.5,2.5,-0.166667,0.166667,0.608696,0.318841
20975,133628,2025-03-10 03:00:00,1.0,0.448835,-0.517923,1.352744,-0.398117,2.761064,0.587911,-0.037519,...,-0.647059,-0.235294,-0.25,0.375,0.5,2.5,0.166667,-0.166667,0.434783,0.608696
20976,133628,2025-03-10 04:00:00,1.0,0.448835,-0.517923,1.352744,-0.540449,-0.002524,0.937401,0.181842,...,0.352941,-0.647059,1.875,-0.25,1.5,0.5,0.166667,0.166667,-0.594203,0.434783
20977,133628,2025-03-10 05:00:00,1.0,0.448835,-0.517923,1.352744,-0.113454,-1.365645,-2.697296,-0.914959,...,-0.294118,0.352941,1.375,1.875,2.5,1.5,-1.0,0.166667,-0.028986,-0.594203
20978,133628,2025-03-10 06:00:00,1.0,0.448835,-0.517923,1.352744,1.167528,-0.133234,-2.697296,-1.7924,...,-0.411765,-0.294118,0.75,1.375,-0.5,2.5,-1.833333,-1.0,-0.637681,-0.028986
20979,133628,2025-03-10 07:00:00,1.0,0.448835,-0.517923,1.352744,-0.68278,1.453962,-2.697296,-0.914959,...,0.411765,-0.411765,-0.375,0.75,0.0,-0.5,0.166667,-1.833333,0.884058,-0.637681
20980,133628,2025-03-10 08:00:00,1.0,0.448835,-0.517923,1.352744,1.167528,-0.898823,-2.697296,0.401202,...,-0.117647,0.411765,0.0,-0.375,-0.5,0.0,-1.333333,0.166667,-0.623188,0.884058
20981,133628,2025-03-10 09:00:00,1.0,0.448835,-0.517923,1.352744,0.171208,-0.394655,-2.697296,1.278643,...,-0.647059,-0.117647,1.75,0.0,0.0,-0.5,0.0,-1.333333,-0.173913,-0.623188


The problem occurs when a patient does not have enough data (fewer than 3 measurements). Since only 6 patients out of 4000 share this problem, we start by dropping them.

In [22]:
# Start from the base rules and aggregate every lag column with "last".
extended_aggregation_rules = {
    **aggregation_rules,
    **{col: "last" for col in train_set_aug.columns if "lag" in col},
}

print("Extended aggregation rules:")
print(extended_aggregation_rules)

# Collapse the augmented train, validation and test sets to one row per patient.
train_aggregated, val_aggregated, test_aggregated = (
    frame.groupby("RecordID").agg(extended_aggregation_rules).reset_index()
    for frame in (train_set_aug, valid_set_aug, test_set_aug)
)

# Report the size of the aggregated training set and preview it
print(train_aggregated.shape)
train_aggregated.head()

Extended aggregation rules:
{'Age': 'last', 'Gender': 'last', 'Height': 'last', 'Albumin': 'last', 'ALP': 'last', 'ALT': 'last', 'AST': 'last', 'Bilirubin': 'last', 'BUN': 'last', 'Cholesterol': 'last', 'Creatinine': 'last', 'DiasABP': 'mean', 'FiO2': 'mean', 'GCS': 'min', 'Glucose': 'mean', 'HCO3': 'last', 'HCT': 'last', 'HR': 'mean', 'K': 'last', 'Lactate': 'max', 'Mg': 'last', 'MAP': 'mean', 'MechVent': 'last', 'Na': 'last', 'NIDiasABP': 'mean', 'NIMAP': 'mean', 'NISysABP': 'mean', 'PaCO2': 'last', 'PaO2': 'mean', 'pH': 'last', 'Platelets': 'last', 'RespRate': 'mean', 'SaO2': 'mean', 'SysABP': 'mean', 'Temp': 'max', 'TroponinI': 'max', 'TroponinT': 'max', 'Urine': 'sum', 'WBC': 'last', 'Weight': 'last', 'In-hospital_death': 'max', 'HCT_lag1': 'last', 'HCT_lag2': 'last', 'HR_lag1': 'last', 'HR_lag2': 'last', 'PaO2_lag1': 'last', 'PaO2_lag2': 'last', 'Temp_lag1': 'last', 'Temp_lag2': 'last', 'BUN_lag1': 'last', 'BUN_lag2': 'last', 'Creatinine_lag1': 'last', 'Creatinine_lag2': 'last', 

Unnamed: 0,RecordID,Age,Gender,Height,Albumin,ALP,ALT,AST,Bilirubin,BUN,...,BUN_lag1,BUN_lag2,Creatinine_lag1,Creatinine_lag2,FiO2_lag1,FiO2_lag2,GCS_lag1,GCS_lag2,WBC_lag1,WBC_lag2
0,132539,-0.583612,0.0,-0.950365,-1.252106,-0.586207,-0.461538,1.259259,-0.25,-0.588235,...,-0.588235,-0.588235,-0.375,-0.375,-0.75,-0.5,0.166667,0.166667,-0.304348,-0.304348
1,132540,0.669324,1.0,0.618599,-1.252106,-0.586207,-0.461538,1.259259,-0.25,0.176471,...,0.176471,0.176471,0.375,0.375,-0.5,-0.5,0.166667,0.166667,0.26087,0.26087
2,132541,-1.153129,0.0,-0.429179,-1.109774,0.5,0.807692,2.462963,1.666667,-0.882353,...,-0.882353,-0.882353,-0.875,-0.875,-0.5,-0.5,-1.5,-1.5,-0.768116,-0.768116
3,132543,0.213711,1.0,1.159248,1.879185,0.5,-0.403846,-0.296296,-0.5,-0.470588,...,-0.470588,-0.470588,-0.375,-0.375,-0.75,-0.5,0.166667,0.166667,-0.521739,-0.521739
4,132545,1.352744,0.0,-1.252047,0.31354,-0.586207,-0.461538,1.259259,-0.25,0.411765,...,0.411765,0.411765,0.0,0.0,-0.75,-0.5,0.166667,0.166667,-0.971014,-0.971014


In [23]:
# Discard every aggregated patient row that still has a missing value
train_aggregated_clean, val_aggregated_clean, test_aggregated_clean = (
    frame.dropna() for frame in (train_aggregated, val_aggregated, test_aggregated)
)
train_aggregated_clean.shape, val_aggregated_clean.shape, test_aggregated_clean.shape


((4000, 60), (4000, 60), (4000, 60))

As the shapes show, all 4000 patients are retained in each of the three sets in this run, so `dropna()` did not remove any rows.

### Evaluation with Logistic Regression

In [24]:
def prepare_xy(df):
    """
    Split an aggregated frame into model inputs and labels.

    Returns a tuple (X, y): X is `df` without the "In-hospital_death" and
    "RecordID" columns, y is the "In-hospital_death" column.
    """
    target = "In-hospital_death"
    features = df.drop(columns=[target, "RecordID"])
    return features, df[target]

# Features and labels for the lag-augmented, NaN-free splits
X_train, y_train = prepare_xy(train_aggregated_clean)
X_valid, y_valid = prepare_xy(val_aggregated_clean)
X_test,  y_test  = prepare_xy(test_aggregated_clean)

# Fit logistic regression on the augmented training features.
clf = LogisticRegression(max_iter=1000, random_state=SEED)
clf.fit(X_train, y_train)

# Predicted probability of the positive class (death) on both held-out splits.
y_valid_proba = clf.predict_proba(X_valid)[:, 1]
y_test_proba = clf.predict_proba(X_test)[:, 1]

# Validation metrics.
roc_auc_valid, auprc_valid = (
    roc_auc_score(y_valid, y_valid_proba),
    average_precision_score(y_valid, y_valid_proba),
)
print(f"Validation ROC AUC: {roc_auc_valid:.3f}, AUPRC: {auprc_valid:.3f}")

# Test metrics.
roc_auc_test, auprc_test = (
    roc_auc_score(y_test, y_test_proba),
    average_precision_score(y_test, y_test_proba),
)
print(f"Test ROC AUC: {roc_auc_test:.3f}, AUPRC: {auprc_test:.3f}")

Validation ROC AUC: 0.838, AUPRC: 0.488
Test ROC AUC: 0.821, AUPRC: 0.458


### Evaluation with Random Forest

In [25]:
# Fit a random forest (default hyper-parameters, seeded) on the augmented features.
clf = RandomForestClassifier(random_state=SEED)
clf.fit(X_train, y_train)

# Predicted probability of the positive class (death) on both held-out splits.
y_valid_proba = clf.predict_proba(X_valid)[:, 1]
y_test_proba = clf.predict_proba(X_test)[:, 1]

# Validation metrics.
roc_auc_valid, auprc_valid = (
    roc_auc_score(y_valid, y_valid_proba),
    average_precision_score(y_valid, y_valid_proba),
)
print(f"Validation ROC AUC: {roc_auc_valid:.3f}, AUPRC: {auprc_valid:.3f}")

# Test metrics.
roc_auc_test, auprc_test = (
    roc_auc_score(y_test, y_test_proba),
    average_precision_score(y_test, y_test_proba),
)
print(f"Test ROC AUC: {roc_auc_test:.3f}, AUPRC: {auprc_test:.3f}")

Validation ROC AUC: 0.823, AUPRC: 0.475
Test ROC AUC: 0.821, AUPRC: 0.495
