# Q2.1 Classic Machine Learning Methods

In [178]:
import pandas as pd
import numpy as np
from project_1.config import PROCESSED_DATA_DIR, PROJ_ROOT

In [179]:
# Read the processed Parquet file of every set into a dict keyed "set_<name>"
sets = ["a", "b", "c"]
sets_dict = {
    f"set_{set_name}": pd.read_parquet(PROCESSED_DATA_DIR / f"set_{set_name}_final.parquet")
    for set_name in sets
}

# Sanity check on set A: print its dimensions, then show the first ten rows
print(sets_dict["set_a"].shape)
sets_dict["set_a"].head(10)


(183416, 43)


Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,ICUType,K,Lactate,MechVent,Urine,WBC,pH,SaO2,TroponinT,TroponinI
0,132539.0,2025-03-10 00:00:00,0.0,-0.953317,0.283445,-0.596332,1.004449,-0.069047,-0.739159,-1.269243,...,0.5,0.625,-0.583333,0.0,2.96,0.271429,-0.75,1.0,0.933333,-0.22449
1,132539.0,2025-03-10 01:00:00,0.0,-0.953317,0.283445,-0.596332,-0.232975,1.997003,0.2327,-2.92406,...,0.5,0.375,-0.166667,0.0,3.2,0.0,0.75,-15.5,-0.177778,0.408163
2,132539.0,2025-03-10 02:00:00,0.0,-0.953317,0.283445,-0.596332,-1.132919,2.628972,2.44021,0.385573,...,0.5,0.625,2.0,0.0,-0.4,2.185714,-0.25,1.0,0.333333,-0.285714
3,132539.0,2025-03-10 03:00:00,0.0,-0.953317,0.283445,-0.596332,-0.120482,-1.721888,1.169851,-0.441835,...,0.5,-1.0,-0.416667,0.0,0.72,2.185714,-0.25,0.5,0.333333,2.040816
4,132539.0,2025-03-10 04:00:00,0.0,-0.953317,0.283445,-0.596332,-0.120482,1.462261,-0.392067,-2.510356,...,0.5,0.125,0.0,0.0,-0.16,-0.957143,0.375,1.0,0.2,0.795918
5,132539.0,2025-03-10 05:00:00,0.0,-0.953317,0.283445,-0.596332,1.341928,1.146276,-0.235875,0.178721,...,0.5,-2.125,1.666667,0.0,-0.16,1.557143,-0.375,0.0,-0.244444,0.183673
6,132539.0,2025-03-10 06:00:00,0.0,-0.953317,0.283445,-0.596332,1.341928,-0.506564,-0.322648,-0.234983,...,0.5,0.625,0.833333,0.0,0.72,1.557143,-0.125,0.0,0.8,-0.081633
7,132539.0,2025-03-10 08:00:00,0.0,-0.953317,0.283445,-0.596332,-1.357905,-0.11766,-0.010265,1.419833,...,0.5,-0.25,0.25,0.0,0.32,-0.657143,-0.625,0.0,20.911111,8.408163
8,132539.0,2025-03-10 09:00:00,0.0,-0.953317,0.283445,-0.596332,-1.132919,0.344163,0.2327,-0.028131,...,0.5,-0.25,-0.416667,0.0,0.0,-0.657143,-0.25,0.0,-0.088889,-0.020408
9,132539.0,2025-03-10 10:00:00,0.0,-0.953317,0.283445,-0.596332,-1.470399,0.076791,0.302119,0.385573,...,0.5,-0.375,4.0,0.0,0.16,-0.014286,-1.0,0.0,1.555556,0.040816


The `ICUType` column is to be removed.

In [180]:

# Outcome files, one per set (A, B, C)
file_names = ["Outcomes-a.txt", "Outcomes-b.txt", "Outcomes-c.txt"]

# Directory that holds the outcome files
base_path = PROJ_ROOT / "data" / "data_1" / "predicting-mortality-of-icu-patients-the-physionet-computing-in-cardiology-challenge-2012-1.0.0"

# Read each file into a DataFrame containing all outcome variables
outcomes_a, outcomes_b, outcomes_c = [pd.read_csv(base_path / name) for name in file_names]

# Keep only the patient identifier and the label in separate DataFrames
label_columns = ["RecordID", "In-hospital_death"]
death_a, death_b, death_c = [df[label_columns] for df in [outcomes_a, outcomes_b, outcomes_c]]

# Check for missing values in the outcome data.
# (A bare `death_a.head()` used to sit here; a mid-cell expression is never
# displayed, so it was dead code and has been removed.)
for outcome_labels in (death_a, death_b, death_c):
    print(outcome_labels.isnull().sum())



RecordID             0
In-hospital_death    0
dtype: int64
RecordID             0
In-hospital_death    0
dtype: int64
RecordID             0
In-hospital_death    0
dtype: int64


In [181]:
# For every set, show the label distribution and confirm the label is strictly binary
outcome_frames = {"a": death_a, "b": death_b, "c": death_c}

for name, df in outcome_frames.items():
    labels = df["In-hospital_death"]
    print(f"\nValue counts for death_{name}:\n")
    print(labels.value_counts())
    print("\nContains only 0 and 1:", labels.isin([0, 1]).all())



Value counts for death_a:

In-hospital_death
0    3446
1     554
Name: count, dtype: int64

Contains only 0 and 1: True

Value counts for death_b:

In-hospital_death
0    3432
1     568
Name: count, dtype: int64

Contains only 0 and 1: True

Value counts for death_c:

In-hospital_death
0    3415
1     585
Name: count, dtype: int64

Contains only 0 and 1: True


A value of 1 in `In-hospital_death` means the patient died in hospital. We are going to fit a classifier to predict this outcome.

### Initial setup for the classifier to work

In [182]:
# Keys (in sets_dict) of the sets to modify
sets = ["set_a", "set_b", "set_c"]

# Remove the ICUType column from each set.
# The result is rebound instead of using inplace=True, and errors="ignore" is
# passed, so the cell is idempotent: running it again after the column is
# already gone no longer raises a KeyError.
for set_name in sets:
    sets_dict[set_name] = sets_dict[set_name].drop(columns=["ICUType"], errors="ignore")

# Invariant after this cell: no set carries ICUType any more
assert all("ICUType" not in sets_dict[set_name].columns for set_name in sets)

# Show the remaining columns of set A as a visual check
sets_dict["set_a"].columns

Index(['RecordID', 'Time', 'Gender', 'Height', 'Weight', 'Age', 'Albumin',
       'Cholesterol', 'DiasABP', 'HCO3', 'HCT', 'HR', 'Mg', 'MAP', 'Na',
       'NIDiasABP', 'NIMAP', 'NISysABP', 'SysABP', 'PaCO2', 'PaO2',
       'Platelets', 'RespRate', 'Temp', 'ALP', 'ALT', 'AST', 'BUN',
       'Bilirubin', 'Creatinine', 'FiO2', 'GCS', 'Glucose', 'K', 'Lactate',
       'MechVent', 'Urine', 'WBC', 'pH', 'SaO2', 'TroponinT', 'TroponinI'],
      dtype='object')

In [183]:
# Set A is used for training, set B for validation and set C for testing
train_set, val_set, test_set = (
    sets_dict[key] for key in ("set_a", "set_b", "set_c")
)


In [184]:

# Cast RecordID to int32 in every split (column assignment on the existing frames)
for split_frame in (train_set, val_set, test_set):
    split_frame["RecordID"] = split_frame["RecordID"].astype("int32")


In [185]:
def _attach_outcome(features, outcomes):
    """Return `features` with the `In-hospital_death` label joined on `RecordID`.

    Parameters
    ----------
    features : pd.DataFrame
        Rows to label; must contain a `RecordID` column.
    outcomes : pd.DataFrame
        Frame with `RecordID` and `In-hospital_death` columns.

    Returns
    -------
    pd.DataFrame
        Inner join of the two frames on `RecordID`.

    Raises
    ------
    pandas.errors.MergeError
        If `RecordID` is not unique in `outcomes` (checked by `validate`),
        which would otherwise silently duplicate feature rows.
    """
    return (
        features
        # A label column left over from a previous run of this cell is dropped
        # first; otherwise a re-run would produce In-hospital_death_x/_y columns.
        .drop(columns=["In-hospital_death"], errors="ignore")
        .merge(outcomes, on="RecordID", validate="many_to_one")
    )


# Merge the training, validation and testing sets with the corresponding death DataFrames
train_set = _attach_outcome(train_set, death_a)
val_set = _attach_outcome(val_set, death_b)
test_set = _attach_outcome(test_set, death_c)

# Check if the merge was successful by showing the first 5 rows of the training set
train_set.head()

Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,K,Lactate,MechVent,Urine,WBC,pH,SaO2,TroponinT,TroponinI,In-hospital_death
0,132539,2025-03-10 00:00:00,0.0,-0.953317,0.283445,-0.596332,1.004449,-0.069047,-0.739159,-1.269243,...,0.625,-0.583333,0.0,2.96,0.271429,-0.75,1.0,0.933333,-0.22449,0
1,132539,2025-03-10 01:00:00,0.0,-0.953317,0.283445,-0.596332,-0.232975,1.997003,0.2327,-2.92406,...,0.375,-0.166667,0.0,3.2,0.0,0.75,-15.5,-0.177778,0.408163,0
2,132539,2025-03-10 02:00:00,0.0,-0.953317,0.283445,-0.596332,-1.132919,2.628972,2.44021,0.385573,...,0.625,2.0,0.0,-0.4,2.185714,-0.25,1.0,0.333333,-0.285714,0
3,132539,2025-03-10 03:00:00,0.0,-0.953317,0.283445,-0.596332,-0.120482,-1.721888,1.169851,-0.441835,...,-1.0,-0.416667,0.0,0.72,2.185714,-0.25,0.5,0.333333,2.040816,0
4,132539,2025-03-10 04:00:00,0.0,-0.953317,0.283445,-0.596332,-0.120482,1.462261,-0.392067,-2.510356,...,0.125,0.0,0.0,-0.16,-0.957143,0.375,1.0,0.2,0.795918,0


In [186]:
# Aggregation rule per column, used to collapse all rows of a patient into one.
# The key order is kept as-is because it determines the column order of the
# aggregated frames.
aggregation_rules = {
    "Age": "last",
    "Gender": "last",
    "Height": "last",
    "Albumin": "last",
    "ALP": "last",
    "ALT": "last",
    "AST": "last",
    "Bilirubin": "last",
    "BUN": "last",
    "Cholesterol": "last",
    "Creatinine": "last",
    "DiasABP": "mean",
    "FiO2": "mean",
    "GCS": "min",
    "Glucose": "mean",
    "HCO3": "last",
    "HCT": "last",
    "HR": "mean",
    "K": "last",
    "Lactate": "max",
    "Mg": "last",
    "MAP": "mean",
    "MechVent": "last",
    "Na": "last",
    "NIDiasABP": "mean",
    "NIMAP": "mean",
    "NISysABP": "mean",
    "PaCO2": "last",
    "PaO2": "mean",
    "pH": "last",
    "Platelets": "last",
    "RespRate": "mean",
    "SaO2": "mean",
    "SysABP": "mean",
    "Temp": "max",
    "TroponinI": "max",
    "TroponinT": "max",
    # NOTE(review): the inputs look already scaled (aggregated Urine can be
    # negative), so this sum depends on the number of rows per patient -
    # confirm this is intended.
    "Urine": "sum",
    "WBC": "last",
    "Weight": "last",
    "In-hospital_death": "max"  # If any 1 exists for a patient, return 1
}


def aggregate_per_patient(frame, rules):
    """Collapse `frame` to one row per `RecordID` using `rules`.

    Parameters
    ----------
    frame : pd.DataFrame
        Rows to aggregate; must contain `RecordID`, `Time` and every column
        named in `rules`.
    rules : dict
        Mapping of column name to aggregation function name.

    Returns
    -------
    pd.DataFrame
        One row per `RecordID`, with `RecordID` as a regular column followed
        by the aggregated columns in the order given by `rules`.
    """
    return (
        frame
        # "last" picks the final row of each group, so the rows are ordered
        # by time explicitly instead of relying on the order in the file.
        .sort_values(["RecordID", "Time"])
        .groupby("RecordID")
        .agg(rules)
        .reset_index()
    )


# Perform aggregation
train_aggregated = aggregate_per_patient(train_set, aggregation_rules)
val_aggregated = aggregate_per_patient(val_set, aggregation_rules)
test_aggregated = aggregate_per_patient(test_set, aggregation_rules)

# Display the processed dataset
train_aggregated.head()

Unnamed: 0,RecordID,Age,Gender,Height,Albumin,ALP,ALT,AST,Bilirubin,BUN,...,RespRate,SaO2,SysABP,Temp,TroponinI,TroponinT,Urine,WBC,Weight,In-hospital_death
0,132539,-0.596332,0.0,-0.953317,-0.120482,0.372549,-0.263158,0.289474,0.222222,-0.55,...,-0.328319,-0.536458,0.15865,1.448365,8.408163,20.911111,38.32,-0.3,0.283445,0
1,132540,0.667051,1.0,0.615786,-0.120482,0.372549,-0.263158,0.289474,0.222222,0.1,...,-0.011361,-0.428571,-0.151136,1.096722,8.408163,20.911111,30.265333,0.257143,-0.232912,0
2,132541,-1.170597,0.0,-0.432085,-1.020426,0.431373,0.77193,0.77193,2.333333,-0.8,...,0.004531,-0.885417,0.17595,2.386078,8.408163,20.911111,24.688,-0.757143,-1.078178,0
3,132543,0.207639,1.0,1.156482,1.341928,0.431373,-0.333333,-0.535088,-0.555556,-0.45,...,-0.746235,-0.52551,0.173347,0.041795,8.408163,20.911111,141.272,-0.514286,0.143735,0
4,132545,1.356169,0.0,-1.117689,0.104504,0.372549,-0.263158,0.289474,0.222222,0.3,...,-0.051654,-0.239362,0.154832,0.979508,8.408163,20.911111,-2.032,-0.957143,-0.835986,0


## LOGISTIC REGRESSION

In [187]:
# Split each aggregated set into predictors (X) and target (y).
# The patient identifier and the label itself are excluded from the predictors.
target_col = "In-hospital_death"
non_feature_cols = ["RecordID", target_col]

X_train, y_train = train_aggregated.drop(columns=non_feature_cols), train_aggregated[target_col]
X_val, y_val = val_aggregated.drop(columns=non_feature_cols), val_aggregated[target_col]
X_test, y_test = test_aggregated.drop(columns=non_feature_cols), test_aggregated[target_col]


In [None]:
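# TODO (Bondiano): fit the logistic regression here and report the ROC AUC on the train/validation/test sets; the code that produced the scores below is not in this cell.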

Train ROC AUC score: 0.7765
Validation ROC AUC score: 0.7796
Test ROC AUC score: 0.7692
