# In [None]:  (Jupyter cell prompt left over from the notebook export)
import os
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Load the patient records from a Parquet file into the module-level frame
# `df`, which the functions below read and modify.
file_path = "dataset/imputed_combined_data.parquet"  # Adjust path if needed
df = pd.read_parquet(file_path)

# Preview of the first rows. NOTE(review): in a notebook this is only rendered
# when it is the last expression of a cell, so here it likely shows nothing.
df.head()

# Row count of the longest per-patient sequence (rows grouped by `patient_id`);
# used later as the padded sequence length.
max_length = df.groupby("patient_id").size().max()

# SOFA Score Calculation
def calculate_sofa(row):
    """Return a simplified SOFA-style organ-dysfunction score for one record.

    Five components are scored and summed: respiration, coagulation, liver,
    cardiovascular and renal. Respiration, coagulation, liver and renal each
    contribute 0-4 points; the cardiovascular component contributes 0 or 1.

    Args:
        row: A mapping-like record supporting ``.get(key, default)`` (for
            example a ``pandas.Series`` or a ``dict``). Keys read: ``FiO2``,
            ``SaO2``, ``Platelets``, ``Bilirubin_total``, ``MAP``,
            ``Creatinine``. Missing keys contribute 0 points.

    Returns:
        int: The summed score (0-17 with the components implemented here).
    """

    def assign_score(value, thresholds, lower_is_worse=False):
        """Return the score of the first threshold that ``value`` breaches.

        ``thresholds`` must be ordered from most to least severe. With
        ``lower_is_worse`` the value breaches a threshold when it is below
        it; otherwise when it is at or above it. NaN never breaches (all
        comparisons with NaN are False), so it scores 0.
        """
        for threshold, score in thresholds:
            breached = value < threshold if lower_is_worse else value >= threshold
            if breached:
                return score
        return 0

    sofa = 0

    # Respiration: a LOWER oxygenation ratio is worse.
    # NOTE(review): SaO2 is used as a stand-in for PaO2, and the cut-offs
    # assume FiO2 is a fraction (0.21-1.0), not a percentage - confirm.
    fio2 = row.get('FiO2', 0)
    sao2 = row.get('SaO2')
    if sao2 is not None and fio2 > 0:
        pao2_fio2 = sao2 / fio2
        sofa += assign_score(
            pao2_fio2,
            [(100, 4), (200, 3), (300, 2), (400, 1)],
            lower_is_worse=True,
        )

    # Coagulation: a LOWER platelet count is worse. The +inf default makes a
    # missing value score 0.
    sofa += assign_score(
        row.get('Platelets', float('inf')),
        [(20, 4), (50, 3), (100, 2), (150, 1)],
        lower_is_worse=True,
    )

    # Liver Function: higher bilirubin is worse.
    sofa += assign_score(row.get('Bilirubin_total', 0), [(12, 4), (6, 3), (2, 2), (1.2, 1)])

    # Cardiovascular: only the MAP < 70 criterion is implemented (1 point).
    if row.get('MAP', 100) < 70:
        sofa += 1

    # Renal Function: higher creatinine is worse.
    sofa += assign_score(row.get('Creatinine', 0), [(5, 4), (3.5, 3), (2, 2), (1.2, 1)])

    return sofa

def add_temporal_features(df):
    """Add per-patient rolling statistics and first differences to ``df``.

    For each of a fixed set of vital-sign / lab columns that is present in
    ``df``, and for each rolling window size (3, 6 and 12 time steps), adds:

    * ``{col}_MA_{window}h`` - rolling mean within the patient,
    * ``{col}_SD_{window}h`` - rolling standard deviation within the patient
      (NaN where the window holds a single observation),

    plus one ``{col}_Delta`` column holding the change from the patient's
    previous row (NaN on each patient's first row).

    Args:
        df: DataFrame with ``patient_id`` and ``ICULOS`` columns. It is
            sorted and extended IN PLACE.

    Returns:
        The same DataFrame object, sorted by ``patient_id`` then ``ICULOS``.
    """
    time_window_sizes = [3, 6, 12]  # Rolling window sizes (in time steps)
    feature_cols = ['HeartRate', 'RespiratoryRate', 'MAP', 'SpO2', 'Creatinine', 'Platelets']

    # Rolling windows and differences must follow time order within a patient.
    df.sort_values(['patient_id', 'ICULOS'], inplace=True)

    for col in feature_cols:
        if col not in df.columns:
            continue

        # Build the per-patient grouping once per column rather than once per
        # statistic.
        per_patient = df.groupby('patient_id')[col]

        for position, window in enumerate(time_window_sizes):
            # `w=window` binds the current size so the lambda does not depend
            # on the loop variable.
            df[f'{col}_MA_{window}h'] = per_patient.transform(
                lambda x, w=window: x.rolling(w, min_periods=1).mean()
            )
            df[f'{col}_SD_{window}h'] = per_patient.transform(
                lambda x, w=window: x.rolling(w, min_periods=1).std()
            )
            # The difference does not depend on the window, so compute it a
            # single time. It is added right after the first window's columns
            # to keep the established column order.
            if position == 0:
                df[f'{col}_Delta'] = per_patient.diff()
    return df

# Data Preprocessing Function
def preprocess_data(output_file):
    """Engineer features, scale them, pad every patient sequence and save as CSV.

    Steps, applied to the module-level DataFrame ``df``:

    1. Add a ``SOFA`` column via ``calculate_sofa``.
    2. Add rolling/delta features via ``add_temporal_features`` (which also
       sorts the rows by patient and time).
    3. Label-encode ``Gender`` if the column exists.
    4. Min-max scale every numeric column.
    5. For EACH patient, zero-pad (or truncate) the sequence to the
       module-level ``max_length`` rows and append it to ``output_file``.

    The output therefore holds ``n_patients * max_length`` rows; consecutive
    blocks of ``max_length`` rows belong to one patient, in the order patients
    appear after sorting. Blocks are streamed to disk one patient at a time so
    the full padded tensor is never held in memory.

    Args:
        output_file: Path of the CSV file to write (overwritten if present).

    Returns:
        None. Side effects: rebinds/modifies the global ``df`` and writes
        ``output_file``.
    """
    # `df` is rebound below; without this declaration Python would treat it as
    # a local name and raise UnboundLocalError on first use.
    global df

    df['SOFA'] = df.apply(calculate_sofa, axis=1)

    df = add_temporal_features(df)

    # Encode categorical features (Gender: 0 = Female, 1 = Male)
    if 'Gender' in df.columns:
        df['Gender'] = LabelEncoder().fit_transform(df['Gender'].astype(str))

    # Capture patient identity BEFORE scaling: `patient_id` may itself be a
    # numeric column and would otherwise be rescaled along with the features.
    patient_ids = df['patient_id'].to_numpy()

    # NOTE(review): every numeric column is scaled, which may include
    # identifiers and label columns - confirm that this is intended.
    feature_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    scaler = MinMaxScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])

    n_features = len(feature_cols)
    with open(output_file, 'w', newline='', encoding='utf-8') as handle:
        write_header = True
        # Pad per patient. Copying only the first `max_length` rows of the
        # whole frame would silently discard all remaining rows.
        for _, sequence in df.groupby(patient_ids, sort=False)[feature_cols]:
            padded_data = np.zeros((max_length, n_features))
            sequence_length = min(max_length, len(sequence))
            padded_data[:sequence_length, :] = sequence.iloc[:sequence_length].values
            pd.DataFrame(padded_data, columns=feature_cols).to_csv(
                handle, header=write_header, index=False
            )
            write_header = False

    print(f"Preprocessed data saved to {output_file}")

# Example usage: run the preprocessing pipeline on the module-level `df` and
# write the result to a CSV file at this relative path.
output_file = "preprocessed_data.csv"
preprocess_data(output_file)


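# Captured notebook output:
#   UnboundLocalError: cannot access local variable 'df' where it is not associated with a value
# Python raises this when a function assigns to `df` without declaring it
# `global`; presumably this output comes from a run before `global df` was
# added to preprocess_data.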