# In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Location of the input parquet file, relative to the working directory.
file_path = "dataset/imputed_combined_data.parquet" 
df = pd.read_parquet(file_path)

# Preview of the first rows; the result is neither assigned nor printed here.
df.head()

# Largest number of rows recorded for any single patient_id
# (i.e. the longest per-patient sequence in the loaded data).
max_length = df.groupby("patient_id").size().max()

# SOFA Score Calculation
def calculate_sofa(row):
    """Compute a simplified SOFA-style organ-dysfunction score for one record.

    Five components are scored and summed: respiration, coagulation, liver,
    cardiovascular and renal. A measurement that is missing or NaN
    contributes 0 to the total.

    Args:
        row: Mapping-like record (a DataFrame row or a plain dict) that
            supports ``.get(key, default)`` and ``[key]``. Keys read:
            ``FiO2``, ``SaO2``, ``Platelets``, ``Bilirubin_total``, ``MAP``
            and ``Creatinine``.

    Returns:
        int: Sum of the component scores (0 to 17 with the bands used here).
    """
    sofa = 0

    def assign_score(value, thresholds, lower_is_worse=False):
        """Return the score of the first band ``value`` falls into, else 0.

        ``thresholds`` is a sequence of ``(threshold, score)`` pairs ordered
        from most to least severe. By default a band matches when
        ``value >= threshold``; with ``lower_is_worse=True`` it matches when
        ``value < threshold``. NaN never matches and therefore scores 0.
        """
        for threshold, score in thresholds:
            matched = value < threshold if lower_is_worse else value >= threshold
            if matched:
                return score
        return 0

    # Respiration: a LOWER oxygenation ratio is worse, so bands are matched
    # with "<" from the most severe band upwards.
    # NOTE(review): the ratio uses SaO2 where SOFA specifies PaO2, and the
    # unit of FiO2 (fraction vs. percent) is not visible here - confirm.
    fio2 = row.get('FiO2', 0)
    if fio2 > 0:
        # A missing SaO2 becomes NaN so that it scores 0 instead of looking
        # like a ratio of 0 (which would be the most severe band).
        oxygenation_ratio = row.get('SaO2', float('nan')) / fio2
        sofa += assign_score(
            oxygenation_ratio,
            [(100, 4), (200, 3), (300, 2), (400, 1)],
            lower_is_worse=True,
        )

    # Coagulation: a LOWER platelet count is worse. The infinite default
    # makes a missing column score 0.
    sofa += assign_score(
        row.get('Platelets', float('inf')),
        [(20, 4), (50, 3), (100, 2), (150, 1)],
        lower_is_worse=True,
    )

    # Liver Function: higher bilirubin is worse.
    sofa += assign_score(row.get('Bilirubin_total', 0), [(12, 4), (6, 3), (2, 2), (1.2, 1)])

    # Cardiovascular: only the MAP < 70 criterion is scored (1 point).
    if row.get('MAP', 100) < 70:
        sofa += 1

    # Renal Function: higher creatinine is worse.
    sofa += assign_score(row.get('Creatinine', 0), [(5, 4), (3.5, 3), (2, 2), (1.2, 1)])

    return sofa

def add_temporal_features(df):
    """Add per-patient rolling statistics and first differences to ``df``.

    For every column of a fixed list that is present in ``df``, the
    following columns are added (grouped by ``patient_id``, in ``ICULOS``
    order):

    * ``{col}_MA_{w}h`` - rolling mean over the last ``w`` rows,
    * ``{col}_SD_{w}h`` - rolling standard deviation over the last ``w`` rows,
    * ``{col}_Delta``   - difference to the patient's previous row.

    Values that are undefined because there is no earlier observation (the
    standard deviation of a single sample, the difference on a patient's
    first row) are set to 0.0 so that no NaN is introduced by this step.

    Args:
        df: DataFrame with ``patient_id`` and ``ICULOS`` columns. It is
            sorted and extended IN PLACE.

    Returns:
        The same DataFrame object, sorted by ``patient_id`` then ``ICULOS``.
    """
    time_window_sizes = [3, 6, 12]  # Rolling window sizes (in time steps)
    feature_cols = ['HeartRate', 'RespiratoryRate', 'MAP', 'SpO2', 'Creatinine', 'Platelets']

    # Rolling windows and diffs only make sense in time order within a patient.
    df.sort_values(['patient_id', 'ICULOS'], inplace=True)

    for col in feature_cols:
        if col not in df.columns:
            continue

        # Build the grouping once per column instead of once per statistic.
        per_patient = df.groupby('patient_id')[col]

        for position, window in enumerate(time_window_sizes):
            df[f'{col}_MA_{window}h'] = per_patient.transform(
                lambda s, w=window: s.rolling(w, min_periods=1).mean()
            )
            # The sample std of a single observation is NaN; report "no spread".
            df[f'{col}_SD_{window}h'] = per_patient.transform(
                lambda s, w=window: s.rolling(w, min_periods=1).std()
            ).fillna(0.0)

            # The delta does not depend on the window size, so compute it
            # once. It is inserted after the first window's columns to keep
            # the column order the function has always produced.
            if position == 0:
                df[f'{col}_Delta'] = per_patient.diff().fillna(0.0)
    return df

def preprocess_data(output_file):
    """Engineer features, scale them, pad every patient's sequence and save.

    Works on the module-level ``df`` (which is modified and rebound):

    1. adds a ``SOFA`` column via ``calculate_sofa``,
    2. adds temporal features via ``add_temporal_features``,
    3. label-encodes ``Gender`` when that column exists,
    4. min-max scales every numeric column,
    5. extends each patient's rows to the length of the longest patient
       sequence by repeating that patient's last row,
    6. writes the stacked result (numeric columns only) to ``output_file``.

    Args:
        output_file: Destination path of the parquet file to write.

    Returns:
        None. Prints a confirmation message after the file is written.
    """
    global df

    df['SOFA'] = df.apply(calculate_sofa, axis=1)
    df = add_temporal_features(df)

    if 'Gender' in df.columns:
        df['Gender'] = LabelEncoder().fit_transform(df['Gender'].astype(str))

    # NOTE(review): every numeric column is selected, so a numeric
    # ``patient_id`` (and any numeric label column) is scaled and written out
    # as a feature too - confirm this is intended.
    feature_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    scaler = MinMaxScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])

    # Group once instead of masking the whole frame per patient.
    # sort=False keeps patients in order of first appearance.
    per_patient = df.groupby('patient_id', sort=False)

    # Derive the padding length from the frame being padded rather than from
    # a module-level value computed elsewhere.
    longest = int(per_patient.size().max())
    n_features = len(feature_cols)

    # One pre-allocated (patient, time step, feature) array; every cell is
    # written below, so no zero-initialisation is needed.
    padded = np.empty((per_patient.ngroups, longest, n_features), dtype=float)

    for slot, (_, patient_rows) in enumerate(per_patient):
        values = patient_rows[feature_cols].to_numpy(dtype=float)
        sequence_length = len(values)
        padded[slot, :sequence_length] = values
        # Pad the tail by repeating the last valid row (no-op for the
        # longest sequence, where the tail slice is empty).
        padded[slot, sequence_length:] = values[-1]

    # Flatten back to one row per (patient, time step) and save.
    padded_df = pd.DataFrame(padded.reshape(-1, n_features), columns=feature_cols)
    padded_df.to_parquet(output_file, index=False)

    print(f"Preprocessed data saved to {output_file}")

# Run the preprocessing pipeline and write its result to this parquet path.
output_file = "preprocessed_data.parquet"
preprocess_data(output_file)
