# Q2 - Supervised Learning

# !!!! BE CAREFUL WITH ICUTYPE COLUMN !!!!!!

## Step 0 - Select method for each features

In [4]:
import pandas as pd
import numpy as np
from project_1.config import PROCESSED_DATA_DIR, PROJ_ROOT

In [5]:
# Load the data from Parquet files
sets_dict = {}
sets = ["a", "b", "c"]

for set_name in sets:
    directory = PROCESSED_DATA_DIR / f"set_{set_name}_final.parquet"
    temp_set = pd.read_parquet(directory)
    sets_dict[f"set_{set_name}"] = temp_set

# Assure the loading was correct
print(sets_dict["set_a"].shape)
sets_dict["set_a"].head(10)


(183416, 43)


Unnamed: 0,RecordID,Time,Gender,Height,Weight,Age,Albumin,Cholesterol,DiasABP,HCO3,...,Urine,WBC,pH,MechVent,TroponinT,ALP,ALT,AST,Bilirubin,TroponinI
0,132539.0,2025-03-10 00:00:00,0.0,-0.949073,0.397026,-0.596332,1.671639,-0.013487,-0.832594,-0.109176,...,11.571429,0.753623,1.125,0.0,1.923077,0.132075,-0.176471,0.450704,1.545455,0.285714
1,132539.0,2025-03-10 01:00:00,0.0,-0.949073,0.397026,-0.596332,1.967793,0.172112,-0.608431,-0.109176,...,2.857143,-0.42029,0.125,0.0,-0.246154,0.0,-0.294118,0.43662,0.0,-0.126984
2,132539.0,2025-03-10 02:00:00,0.0,-0.949073,0.397026,-0.596332,-1.734132,0.125712,0.848629,0.830987,...,-0.357143,-0.014493,-0.875,0.0,0.0,0.773585,-0.205882,-0.380282,0.181818,-0.095238
3,132539.0,2025-03-10 03:00:00,0.0,-0.949073,0.397026,-0.596332,1.523562,0.38091,-0.832594,-0.579257,...,0.642857,0.188406,-0.375,0.0,0.215385,-0.698113,-0.588235,1.126761,-0.181818,4.650794
4,132539.0,2025-03-10 04:00:00,0.0,-0.949073,0.397026,-0.596332,0.487023,-0.96468,1.483758,-0.814297,...,-0.142857,-1.144928,1.0,0.0,2.738462,-0.490566,-0.558824,-0.225352,0.363636,0.904762
5,132539.0,2025-03-10 05:00:00,0.0,-0.949073,0.397026,-0.596332,0.042792,-0.013487,0.736548,-0.579257,...,-0.142857,0.42029,0.375,0.0,-0.123077,0.509434,5.323529,35.577465,24.909091,1.587302
6,132539.0,2025-03-10 06:00:00,0.0,-0.949073,0.397026,-0.596332,0.6351,-1.405476,0.2135,-0.344216,...,0.642857,-0.521739,-0.875,0.0,-0.246154,-0.45283,-0.088235,25.661972,-0.454545,-0.206349
7,132539.0,2025-03-10 08:00:00,0.0,-0.949073,0.397026,-0.596332,0.6351,0.984105,-0.290867,0.360906,...,0.285714,0.637681,-0.625,0.0,0.138462,1.584906,0.058824,1.450704,-0.090909,-0.095238
8,132539.0,2025-03-10 09:00:00,0.0,-0.949073,0.397026,-0.596332,-0.401439,-1.591075,1.483758,1.301068,...,0.0,0.492754,-0.125,0.0,1.046154,-0.735849,-0.176471,-0.309859,3.363636,-0.047619
9,132539.0,2025-03-10 10:00:00,0.0,-0.949073,0.397026,-0.596332,-0.105285,-1.196678,0.064058,-0.814297,...,0.142857,-0.521739,-1.125,0.0,0.661538,1.188679,0.058824,-0.352113,-0.363636,-0.095238


# Check to check

In [6]:
df = sets_dict["set_a"] # To try
static_cols = ["Age", "Height", "Weight", "Gender"]
# Check that the values of Age, Height, Weight and Gender are the same for all distinct patient (RecordID)
# Group by RecordID and compute the number of unique values in each static column
consistency_check = df.groupby("RecordID")[static_cols].nunique()

# Identify patients (RecordIDs) where any static column has more than one unique value
inconsistencies = consistency_check[consistency_check > 1].dropna(how="all")

if inconsistencies.empty:
    print("All static columns are constant for each patient.")
else:
    print("Inconsistencies found in static variables:")
    for record_id, row in inconsistencies.iterrows():
        # Filter to only those static variables that are not constant (i.e., > 1 unique value)
        inconsistent_cols = row[row > 1]
        print(f"RecordID {record_id} has multiple values in columns: {list(inconsistent_cols.index)}")

All static columns are constant for each patient.


In [1]:
# Variabili statiche (usate direttamente senza aggregazione)
static_variables = ["Age", "Gender", "Height"] # Also Weight should be a static, however we take the last measured

# Variabili dinamiche divise per funzione di aggregazione
mean_variables = [
    "DiasABP", "FiO2", "Glucose", "HR", "MAP", "NIDiasABP", "NIMAP", 
    "NISysABP", "PaO2", "RespRate", "SaO2", "SysABP"
]

last_measured_variables = [
    "Albumin", "ALP", "ALT", "AST", "Bilirubin", "BUN", "Cholesterol", "Creatinine", 
    "HCO3", "HCT", "K", "Mg", "MechVent", "Na", "PaCO2", "pH", "Platelets", "WBC", "Weight"
]

max_variables = ["Lactate", "Temp", "TropI", "TropT"]

min_variables = ["GCS"]

sum_variables = ["Urine"]

# Stampa le variabili per verifica
print("Static Variables:", static_variables)
print("Mean Variables:", mean_variables)
print("Last Measured Variables:", last_measured_variables)
print("Max Variables:", max_variables)
print("Min Variables:", min_variables)
print("Sum Variables:", sum_variables)

Static Variables: ['Age', 'Gender', 'Height', 'ICUType']
Mean Variables: ['DiasABP', 'FiO2', 'Glucose', 'HR', 'MAP', 'NIDiasABP', 'NIMAP', 'NISysABP', 'PaO2', 'RespRate', 'SaO2', 'SysABP']
Last Measured Variables: ['Albumin', 'ALP', 'ALT', 'AST', 'Bilirubin', 'BUN', 'Cholesterol', 'Creatinine', 'HCO3', 'HCT', 'K', 'Mg', 'MechVent', 'Na', 'PaCO2', 'pH', 'Platelets', 'WBC', 'Weight']
Max Variables: ['Lactate', 'Temp', 'TropI', 'TropT']
Min Variables: ['GCS']
Sum Variables: ['Urine']


In [None]:
def collapse_patient_rows(df):
    """
    Groups the DataFrame by 'RecordID', collapsing all rows per patient into a single row.
    The aggregation for each column is determined by the following rules:
    
      - static_variables: take the first value.
      - mean_variables: compute the mean.
      - last_measured_variables: take the last value.
      - max_variables: take the maximum value.
      - min_variables: take the minimum value.
      - sum_variables: compute the sum.
    
    Parameters:
      df (pd.DataFrame): The input DataFrame that includes 'RecordID' and all relevant variables.
      
    Returns:
      pd.DataFrame: A DataFrame with one row per patient (RecordID).
    """
    # Define variable lists (modify as needed)
    static_variables = ["Age", "Gender", "Height"]  
    mean_variables = [
        "DiasABP", "FiO2", "Glucose", "HR", "MAP", "NIDiasABP", "NIMAP", 
        "NISysABP", "PaO2", "RespRate", "SaO2", "SysABP"
    ]
    last_measured_variables = [
        "Albumin", "ALP", "ALT", "AST", "Bilirubin", "BUN", "Cholesterol", "Creatinine", 
        "HCO3", "HCT", "K", "Mg", "MechVent", "Na", "PaCO2", "pH", "Platelets", "WBC", "Weight"
    ]
    max_variables = ["Lactate", "Temp", "TropI", "TropT"]
    min_variables = ["GCS"]
    sum_variables = ["Urine"]

    # Build an aggregation dictionary
    agg_dict = {}
    
    for col in static_variables:
        agg_dict[col] = "first"
    for col in mean_variables:
        agg_dict[col] = "mean"
    for col in last_measured_variables:
        agg_dict[col] = "last"
    for col in max_variables:
        agg_dict[col] = "max"
    for col in min_variables:
        agg_dict[col] = "min"
    for col in sum_variables:
        agg_dict[col] = "sum"

    # Optionally, if the DataFrame has a "Time" column and you don't need it in the collapsed data,
    # you can drop it before grouping. Alternatively, you can choose a suitable aggregation (e.g., first, last).
    # Here, we drop it.
    if "Time" in df.columns:
        df = df.drop(columns=["Time"])
    
    # Group by 'RecordID' and aggregate using the defined dictionary.
    df_grouped = df.groupby("RecordID").agg(agg_dict).reset_index()
    
    return df_grouped

## Step 1 - Create the models (Logistic Regression && Random Forest)

## Step 2 - Training Loop

## Step 3 - Compute Score