In [12]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import pandas as pd
import gc
import joblib
from scipy.io import loadmat
import h5py
import pickle

In [13]:
def fit_scalers(split="train"):
    """Fit one StandardScaler per feature stream and pickle the result.

    Reads ``../dataset/edaicwoz_labels/{split}_split.csv`` and, for every
    ``Participant_ID`` listed there, loads that participant's feature files
    and feeds each one to the matching scaler with ``partial_fit``. Only one
    feature matrix is held in memory at a time.

    Args:
        split: Name of the split whose label CSV lists the participants to
            fit on (default ``"train"``).

    Returns:
        dict mapping feature name (``"audio_egemaps"``, ``"audio_mfcc"``,
        ``"audio_vgg16"``, ``"audio_densenet"``, ``"visual_of"``,
        ``"visual_resnet"``, ``"visual_vgg"``) to its ``StandardScaler``.
        The same dict is written to
        ``../storage/transform/feature_scalers.pkl``.

    Raises:
        FileNotFoundError: if the split CSV or any participant feature file
            is missing.
    """
    output_dir = "../storage/transform"
    # Create the output directory up front: without it the final dump would
    # fail only after every participant has already been processed.
    os.makedirs(output_dir, exist_ok=True)

    full_path = "../dataset/edaicwoz_labels/" + split + "_split.csv"
    split_file_df = pd.read_csv(full_path)

    def _read_opensmile(path):
        # ';'-separated CSV; the first column is dropped.
        return pd.read_csv(path, sep=";").iloc[:, 1:].values

    def _read_deep_audio(path):
        # The first two columns are dropped.
        return pd.read_csv(path).iloc[:, 2:].values

    def _read_openface(path):
        # Only numeric columns are kept.
        return pd.read_csv(path).select_dtypes(include=np.number).values

    def _read_mat(path):
        # The matrix is stored under the "feature" key of the .mat file.
        return loadmat(path)["feature"]

    # (scaler key, file-name suffix after the participant id, reader)
    feature_sources = (
        ("audio_egemaps", "_OpenSMILE2.3.0_egemaps.csv", _read_opensmile),
        ("audio_mfcc", "_OpenSMILE2.3.0_mfcc.csv", _read_opensmile),
        ("audio_vgg16", "_vgg16.csv", _read_deep_audio),
        ("audio_densenet", "_densenet201.csv", _read_deep_audio),
        ("visual_of", "_OpenFace2.1.0_Pose_gaze_AUs.csv", _read_openface),
        ("visual_resnet", "_CNN_ResNet.mat", _read_mat),
        ("visual_vgg", "_CNN_VGG.mat", _read_mat),
    )

    scalers = {name: StandardScaler() for name, _, _ in feature_sources}

    # Iterate the column directly: DataFrame.iterrows() does not preserve
    # dtypes, so an integer id could come back as a float (e.g. 300.0) and
    # produce a wrong directory name.
    for idx, p_id in enumerate(split_file_df["Participant_ID"]):
        features_dir = f"../dataset/edaicwoz_participant/{p_id}_P/{p_id}_P/features"

        # Load each feature and update its scaler incrementally.
        for name, suffix, read in feature_sources:
            values = read(os.path.join(features_dir, f"{p_id}{suffix}"))
            scalers[name].partial_fit(values)
            del values

        gc.collect()

        if idx % 10 == 0:
            print(f"Fitting scalers: processed {idx+1} participants")

    # Save the scalers for future use
    with open(os.path.join(output_dir, "feature_scalers.pkl"), "wb") as f:
        pickle.dump(scalers, f)
    print("Dumped scalers")

    return scalers


In [14]:
def get_data_normalized(split="train", scalers=None):
    """Normalize every participant's features for a split and store them in HDF5.

    Reads ``../dataset/edaicwoz_labels/{split}_split.csv``; for each row it
    loads the participant's feature files, applies the matching scaler's
    ``transform`` and writes the results to
    ``../storage/transform/edaicwoz_{split}_normalized.h5``. Each participant
    gets one group named after its id, holding an ``info`` dataset
    (``[Participant_ID, PCL-C (PTSD)]``) plus one dataset per feature.
    An existing output file for the split is deleted first.

    Args:
        split: Name of the split to process (default ``"train"``).
        scalers: dict mapping feature name to a fitted scaler, as returned by
            ``fit_scalers``. When ``None`` the dict is loaded from
            ``../storage/transform/feature_scalers.pkl``.

    Returns:
        None. The result is the HDF5 file on disk.

    Raises:
        ValueError: if ``scalers`` is ``None`` and the pickled scalers file
            does not exist.
    """
    # If scalers not provided, try to load them
    if scalers is None:
        try:
            with open("../storage/transform/feature_scalers.pkl", "rb") as f:
                # NOTE(security): pickle.load can execute arbitrary code;
                # only load a scalers file this project produced itself.
                scalers = pickle.load(f)
        except FileNotFoundError as exc:
            # Chain the original error so the missing path stays visible.
            raise ValueError("No scalers provided and couldn't find saved scalers.") from exc

    full_path = "../dataset/edaicwoz_labels/" + split + "_split.csv"
    split_file_df = pd.read_csv(full_path)

    # Ensure directory exists
    os.makedirs("../storage/transform", exist_ok=True)
    h5_filename = f"../storage/transform/edaicwoz_{split}_normalized.h5"

    # Remove file if it exists to avoid append issues
    if os.path.exists(h5_filename):
        os.remove(h5_filename)

    def _read_opensmile(path):
        # ';'-separated CSV; the first column is dropped.
        return pd.read_csv(path, sep=";").iloc[:, 1:].values

    def _read_deep_audio(path):
        # The first two columns are dropped.
        return pd.read_csv(path).iloc[:, 2:].values

    def _read_openface(path):
        # Only numeric columns are kept.
        return pd.read_csv(path).select_dtypes(include=np.number).values

    def _read_mat(path):
        # The matrix is stored under the "feature" key of the .mat file.
        return loadmat(path)["feature"]

    # (scaler / dataset key, file-name suffix after the participant id, reader)
    feature_sources = (
        ("audio_egemaps", "_OpenSMILE2.3.0_egemaps.csv", _read_opensmile),
        ("audio_mfcc", "_OpenSMILE2.3.0_mfcc.csv", _read_opensmile),
        ("audio_vgg16", "_vgg16.csv", _read_deep_audio),
        ("audio_densenet", "_densenet201.csv", _read_deep_audio),
        ("visual_of", "_OpenFace2.1.0_Pose_gaze_AUs.csv", _read_openface),
        ("visual_resnet", "_CNN_ResNet.mat", _read_mat),
        ("visual_vgg", "_CNN_VGG.mat", _read_mat),
    )

    # Iterate the two needed columns directly: DataFrame.iterrows() does not
    # preserve dtypes, so an integer id could come back as a float and end up
    # in the paths and group names as e.g. "300.0".
    participants = zip(split_file_df["Participant_ID"], split_file_df["PCL-C (PTSD)"])

    for idx, (p_id, ptsd_binary) in enumerate(participants):
        features_dir = f"../dataset/edaicwoz_participant/{p_id}_P/{p_id}_P/features"

        # Build the complete set of datasets before touching the file, so a
        # participant whose files fail to load leaves no partial group behind.
        features = {"info": np.array([p_id, ptsd_binary])}
        for name, suffix, read in feature_sources:
            raw = read(os.path.join(features_dir, f"{p_id}{suffix}"))
            features[name] = scalers[name].transform(raw)
            del raw

        # Store normalized features; the file is reopened in append mode and
        # closed again for every participant.
        with h5py.File(h5_filename, 'a') as f:
            grp = f.create_group(str(p_id))
            for key, data in features.items():
                grp.create_dataset(key, data=data)

        # Clean up memory
        del features
        gc.collect()

        if idx % 10 == 0:
            print(f"Stored normalized data for {idx+1} participants")


In [15]:
# Fit on the training split only; the resulting scalers are reused for
# every split below.
print("Fitting scalers on training data...")
scalers = fit_scalers(split="train")

# Apply the same scalers to each split in turn, announcing it first.
for split_name, banner in (
    ("train", "Normalizing training data..."),
    ("dev", "Normalizing validation data..."),
    ("test", "Normalizing test data..."),
):
    print(banner)
    get_data_normalized(split=split_name, scalers=scalers)

print("All data normalized and saved!")

Fitting scalers on training data...
Fitting scalers: processed 1 participants
Fitting scalers: processed 11 participants
Fitting scalers: processed 21 participants
Fitting scalers: processed 31 participants
Fitting scalers: processed 41 participants
Fitting scalers: processed 51 participants
Fitting scalers: processed 61 participants
Fitting scalers: processed 71 participants
Fitting scalers: processed 81 participants
Fitting scalers: processed 91 participants
Fitting scalers: processed 101 participants
Fitting scalers: processed 111 participants
Fitting scalers: processed 121 participants
Fitting scalers: processed 131 participants
Fitting scalers: processed 141 participants
Fitting scalers: processed 151 participants
Fitting scalers: processed 161 participants
Dumped scalers
Normalizing training data...
Stored normalized data for 1 participants
Stored normalized data for 11 participants
Stored normalized data for 21 participants
Stored normalized data for 31 participants
Stored norma