# This Jupyter Notebook serves the following purposes:
- Load preprocessed data from the specified folder
- Do some initial feature engineering (Dimensionality Reduction using correlation between features, with a 0.65 threshold)
- Save the adjusted set to a new file, ready to be loaded for the model selection part

!WARNING! The three sets MIGHT have DIFFERENT FEATURES. If that happens, we need to decide how to handle it.

# Load processed data

In [12]:
import pandas as pd
import numpy as np
import plotly.express as px
import os
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent
DATA_DIR = PROJECT_ROOT.joinpath("data")

print(f"Project root: {PROJECT_ROOT}")

Project root: /Users/francescobondi/Desktop/stuff/ETH/FS25/ML for Healthcare/project-1-ml4hc


In [None]:
# Read the processed training set (HDF5) into a DataFrame and preview its first rows
train_set_path = PROJECT_ROOT / "data/processed/train_set.h5"
patients_df = pd.read_hdf(train_set_path)
patients_df.head(10)

Parameter,RecordID,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,...,PaCO2,PaO2,pH,DiasABP,MAP,SaO2,SysABP,Lactate,Cholesterol,TroponinI
0,132539.0,2025-03-10 00:00:00,54.0,,,,0.0,,,,...,,,,,,,,,,
1,132539.0,2025-03-10 00:07:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
2,132539.0,2025-03-10 00:37:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
3,132539.0,2025-03-10 01:37:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
4,132539.0,2025-03-10 02:37:00,54.0,,,15.0,0.0,,,,...,,,,,,,,,,
5,132539.0,2025-03-10 03:08:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
6,132539.0,2025-03-10 03:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
7,132539.0,2025-03-10 04:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
8,132539.0,2025-03-10 05:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,
9,132539.0,2025-03-10 07:37:00,54.0,,,15.0,0.0,,,33.7,...,,,,,,,,,,


In [20]:
# Boolean row selector for one example patient (RecordID 132539), built once and reused
record_mask = patients_df["RecordID"] == 132539.0

# GCS measurements of that patient, as a Series and as a plain Python list
gcs_values = patients_df.loc[record_mask, "GCS"]
gcs_values_list = gcs_values.tolist()
print(len(gcs_values_list), gcs_values_list)

# Number of rows recorded for that patient
print(len(patients_df[record_mask]))

51 [nan, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 14.0, 14.0, 14.0, 14.0, 14.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0]
51


# Step 1. Dimensionality Reduction via Correlation-Based Feature Removal

In [26]:
import matplotlib.pyplot as plt

def remove_highly_correlated_features(df, threshold=0.9):
    """
    Drop every feature that is strongly correlated with a feature to its left.

    Correlations come from ``df.corr()``, which uses pairwise-complete
    observations (rows where either value of a pair is missing are ignored
    for that pair).

    Note: a column is dropped when its absolute correlation with ANY earlier
    column exceeds ``threshold``, whether or not that earlier column is itself
    dropped. The first column is therefore never dropped.

    Parameters:
        df (pd.DataFrame): DataFrame holding only the (numeric) candidate features.
        threshold (float): Absolute correlation strictly above which the later
            column of a pair is dropped.

    Returns:
        df_reduced (pd.DataFrame): Copy of ``df`` without the dropped columns.
        dropped_features (list): Names of the dropped columns, in column order.
    """
    # Absolute pairwise correlations between all columns
    abs_corr = df.corr().abs()

    # Keep only entries strictly above the diagonal, so each pair is examined
    # once and is attributed to the later of its two columns.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    exceeds = abs_corr.where(above_diagonal).gt(threshold)

    # Masked-out / NaN entries compare as False, so they never trigger a drop.
    flagged_per_column = exceeds.any(axis=0)
    to_drop = [name for name, flagged in flagged_per_column.items() if flagged]

    return df.drop(columns=to_drop), to_drop

# Identifier columns are carried through untouched; every other column is a candidate feature.
ID_COLUMNS = ["RecordID", "Time"]
# Absolute correlation above which the later feature of a correlated pair is dropped.
CORRELATION_THRESHOLD = 0.65

feature_columns = [col for col in patients_df.columns if col not in ID_COLUMNS]

# Extract the features (assumed to be numeric)
patients_df_features = patients_df[feature_columns]

# Remove highly correlated features using CORRELATION_THRESHOLD
patients_df_reduced, dropped_features = remove_highly_correlated_features(
    patients_df_features, threshold=CORRELATION_THRESHOLD
)

print("Dropped features due to high correlation:")
print(dropped_features)

# Recombine the identifier columns with the reduced feature set.
patients_df_final = pd.concat([patients_df[ID_COLUMNS], patients_df_reduced], axis=1)

# Sanity checks: no rows lost, and no dropped feature survives in the final frame.
assert len(patients_df_final) == len(patients_df)
assert not set(dropped_features) & set(patients_df_final.columns)

print("Final DataFrame shape:", patients_df_final.shape)



Dropped features due to high correlation:
['Creatinine', 'NIMAP', 'NISysABP', 'AST', 'SysABP']
Final DataFrame shape: (299264, 38)
