In [None]:
import os
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

import get_data

# Fetch the combined records together with the patient id list from the
# project's data loader.
combined_df, patientIds = get_data.get_dataset()

# Cache the frame as a snappy-compressed Parquet file inside the dataset
# directory so it can be re-read quickly.
DATA_PATH = get_data.get_dataset_abspath()
_parquet_name = "raw_combined_data.parquet"
save_path = os.path.join(DATA_PATH, _parquet_name)
combined_df.to_parquet(save_path, compression="snappy")

# Read the cached copy back; from here on the frame comes from disk.
load_path = os.path.join(DATA_PATH, _parquet_name)
combined_df = pd.read_parquet(load_path)

# Quick visual check of the first rows.
print(combined_df.head())

# Largest number of rows recorded for any single patient_id.
_rows_per_patient = combined_df.groupby("patient_id").size()
max_length = _rows_per_patient.max()

def preprocess_data(output_file, *, data=None, seq_len=None, group_col="patient_id"):
    """Encode, scale and zero-pad the dataset per patient, then save it as CSV.

    Steps:
      1. Label-encode the ``Gender`` column, if present.
      2. Min-max scale every numeric column.
      3. Zero-pad (or truncate) each ``group_col`` sequence to ``seq_len`` rows.
      4. Write the stacked sequences to ``output_file``.

    The written CSV has ``n_sequences * seq_len`` rows: every ``seq_len``
    consecutive rows belong to one sequence, so a reader can recover the 3-D
    layout with ``values.reshape(-1, seq_len, n_features)``. Sequences appear
    in order of first occurrence of their id; rows keep their original order.

    Args:
        output_file: Destination path of the CSV file.
        data: DataFrame to preprocess. Defaults to the module-level
            ``combined_df``. The frame is never modified.
        seq_len: Number of rows per sequence. Defaults to the length of the
            longest sequence found in ``data``.
        group_col: Column identifying a sequence. If the column is missing,
            the whole frame is treated as a single sequence.

    Raises:
        ValueError: If the input frame has no rows.
    """
    source = combined_df if data is None else data
    if source.empty:
        raise ValueError("preprocess_data() received an empty DataFrame")

    # Copy first: encoding and scaling below must not overwrite the shared
    # raw frame that other cells keep using.
    frame = source.copy()

    # Grab the raw sequence ids now; once scaled they are no longer usable
    # as grouping keys.
    group_ids = frame[group_col].to_numpy() if group_col in frame.columns else None

    # Encode Gender as integer classes (original note: 0 = Female, 1 = Male;
    # the actual mapping follows the sorted string values — TODO confirm).
    if 'Gender' in frame.columns:
        frame['Gender'] = LabelEncoder().fit_transform(frame['Gender'].astype(str))

    # NOTE(review): every numeric column is scaled, which includes numeric
    # identifier/label columns such as patient_id and SepsisLabel if present.
    feature_cols = frame.select_dtypes(include=[np.number]).columns.tolist()
    frame[feature_cols] = MinMaxScaler().fit_transform(frame[feature_cols])
    values = frame[feature_cols].to_numpy(dtype=float)

    # Map every row to (sequence index, position inside that sequence).
    if group_ids is None:
        codes = np.zeros(len(frame), dtype=np.intp)
        n_groups = 1
    else:
        # factorize numbers ids by first appearance; missing ids become -1.
        codes, uniques = pd.factorize(group_ids)
        n_groups = len(uniques)
    steps = pd.Series(codes).groupby(codes).cumcount().to_numpy()

    has_id = codes >= 0  # rows without an id cannot be assigned to a sequence
    if seq_len is None:
        seq_len = int(steps[has_id].max()) + 1 if has_id.any() else 0

    # Ensure that all patients have the same length: each sequence owns a
    # contiguous block of seq_len rows; unused trailing rows stay zero and
    # rows beyond seq_len are dropped.
    padded = np.zeros((n_groups * seq_len, len(feature_cols)))
    keep = has_id & (steps < seq_len)
    padded[codes[keep] * seq_len + steps[keep]] = values[keep]

    # Save preprocessed data
    pd.DataFrame(padded, columns=feature_cols).to_csv(output_file, index=False)
    print(f"Preprocessed data saved to {output_file}")

# Example usage: run the preprocessing once and write the result to a CSV
# file in the current working directory.
output_file = "preprocessed_data.csv"
preprocess_data(output_file=output_file)

Processing training_setA:   0%|          | 0/20336 [00:00<?, ?it/s]

Processing training_setB:   0%|          | 0/20000 [00:00<?, ?it/s]

     HR  O2Sat   Temp     SBP   MAP    DBP  Resp  EtCO2  BaseExcess  HCO3  \
0  80.0  100.0  36.50  121.00  58.0  41.00  13.5    NaN         1.0  25.0   
1  76.0  100.0  36.25  113.25  61.0  41.50  12.0    NaN         1.0  25.0   
2  80.0  100.0  36.25  132.75  71.5  46.25  12.0    NaN         NaN   NaN   
3  78.0  100.0  36.10  103.50  58.0  43.00  12.0    NaN        -3.0   NaN   
4  74.0  100.0  36.00  128.75  69.5  44.50  12.5    NaN        -3.0   NaN   

   ...  Platelets    Age  Gender  Unit1  Unit2  HospAdmTime  ICULOS  \
0  ...      160.0  77.27       1    0.0    1.0       -69.14       3   
1  ...        NaN  77.27       1    0.0    1.0       -69.14       4   
2  ...        NaN  77.27       1    0.0    1.0       -69.14       5   
3  ...        NaN  77.27       1    0.0    1.0       -69.14       6   
4  ...        NaN  77.27       1    0.0    1.0       -69.14       7   

   SepsisLabel  patient_id  dataset  
0            0           1        A  
1            0           1        

Note: this currently uses the raw datasets directly; it will need to be adapted to read the imputed-data CSV file.