In [1]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import IncrementalPCA, PCA

from tqdm import tqdm 

import os
import pandas as pd
import numpy as np
import time
import datetime

from scipy.io import loadmat
import glob

import h5py
import gc

import joblib

In [25]:
def calculate_target_lengths(split_df, percentile=95):
    """Calculate target temporal lengths for each feature."""
    target_lengths = {}
    
    # First pass to calculate lengths
    print("Calculating target temporal lengths...")
    for _, row in split_df.iterrows():
        p_id = row["Participant_ID"]
        features_dir = f"../dataset/edaicwoz_participant/{p_id}_P/{p_id}_P/features"
        
        # Load one sample of each feature type
        paths = {
            'audio_vgg16': f"{p_id}_vgg16.csv",
            'audio_densenet': f"{p_id}_densenet201.csv",
            'visual_resnet': f"{p_id}_CNN_ResNet.mat",
            'visual_vgg': f"{p_id}_CNN_VGG.mat",
            'audio_mfcc': f"{p_id}_OpenSMILE2.3.0_mfcc.csv",
            'audio_egemaps': f"{p_id}_OpenSMILE2.3.0_egemaps.csv",
            'visual_of': f"{p_id}_OpenFace2.1.0_Pose_gaze_AUs.csv"
        }
        
        for feat_name, path in paths.items():
            full_path = os.path.join(features_dir, path)
            if feat_name.endswith('.mat'):
                data = loadmat(full_path)['feature']
            else:
                data = pd.read_csv(full_path).iloc[:, 1:].values
                
            current_len = data.shape[0]
            if feat_name not in target_lengths:
                target_lengths[feat_name] = []
            target_lengths[feat_name].append(current_len)
    
    # Calculate percentile-based target lengths
    return {
        feat: int(np.percentile(lens, percentile))
        for feat, lens in target_lengths.items()
    }

In [31]:
def preprocess_data(features, pca_models=None, scalers=None, target_lengths=None):
    if pca_models is None:
        pca_models = {}
    if scalers is None:
        scalers = {}
    if target_lengths is None:
        target_lengths = {}

    processed = {}
    
    # Temporal standardization first
    for feat_name in features:
        feat_data = features[feat_name]
        
        # Pad/truncate temporal dimension
        if feat_name in target_lengths:
            current_len = feat_data.shape[0]
            target_len = target_lengths[feat_name]
            
            if current_len >= target_len:
                # Truncate
                feat_data = feat_data[:target_len]
            else:
                # Pad with zeros
                pad_shape = (target_len - current_len,) + feat_data.shape[1:]
                feat_data = np.vstack([feat_data, np.zeros(pad_shape)])
        
        # Apply scaling
        if feat_name not in scalers:
            scalers[feat_name] = RobustScaler()
            scalers[feat_name].fit(feat_data)
        scaled = scalers[feat_name].transform(feat_data)

        # Apply PCA if needed
        if feat_name in pca_models:
            if pca_models[feat_name] is None:
                # Initialize PCA if first time
                pca_models[feat_name] = PCA(n_components=0.95)
                pca_models[feat_name].fit(scaled)
            reduced = pca_models[feat_name].transform(scaled)
            processed[feat_name] = reduced
        else:
            processed[feat_name] = scaled

    return processed, pca_models, scalers, target_lengths

In [33]:
def save_to_h5file(data_dict, split_name):
    filename = f"{split_name}_processed.h5"
    
    with h5py.File(filename, 'a') as hf:
        for pid, features in data_dict.items():
            if str(pid) in hf:
                del hf[str(pid)]
                
            grp = hf.create_group(str(pid))
            for feat_name, feat_data in features.items():
                # Enforce float32 and chunked storage
                grp.create_dataset(feat_name, 
                                 data=feat_data.astype('float32'),
                                 chunks=True,
                                 maxshape=(None,) + feat_data.shape[1:],
                                 compression=None)

In [34]:
def get_data(split="train"):
    full_path = "../dataset/edaicwoz_labels/" + split + "_split.csv"
    split_file_df = pd.read_csv(full_path)
    
    if "train" in split:
        target_lengths = calculate_target_lengths(split_file_df, percentile=95)
    else:
        # For dev/test, use training target lengths (needs to be loaded from disk)
        target_lengths = load_target_lengths()  # Implement this to load from file
        
    pca_models = defaultdict(lambda: None)
    scalers = {}
    
    for idx, row in enumerate(split_file_df.iterrows()):        
        p_id = row[1]["Participant_ID"]
        features_dir = f"../dataset/edaicwoz_participant/{p_id}_P/{p_id}_P/features"
        aggregated_features_for_participant = []
        
        # audio
        boaw_egemaps_path = os.path.join(features_dir, f"{p_id}_BoAW_openSMILE_2.3.0_eGeMAPS.csv") # 1
        boaw_mfcc_path = os.path.join(features_dir, f"{p_id}_BoAW_openSMILE_2.3.0_MFCC.csv") # 2
        egemaps_path = os.path.join(features_dir, f"{p_id}_OpenSMILE2.3.0_egemaps.csv") # 8
        mfcc_path = os.path.join(features_dir, f"{p_id}_OpenSMILE2.3.0_mfcc.csv") # 9
        vgg16_path = os.path.join(features_dir, f"{p_id}_vgg16.csv") # 10
        densenet_path = os.path.join(features_dir, f"{p_id}_densenet201.csv") # 6
        
        # visual
        bovw_openface_pg_path = os.path.join(features_dir, f"{p_id}_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv") # 3
        openface_pg_path = os.path.join(features_dir, f"{p_id}_OpenFace2.1.0_Pose_gaze_AUs.csv") # 7
        resnet_path = os.path.join(features_dir, f"{p_id}_CNN_ResNet.mat") # 4
        vgg_path = os.path.join(features_dir, f"{p_id}_CNN_VGG.mat") # 5

        ptsd_binary = row[1]["PCL-C (PTSD)"]
        
        features = {
            # "audio_boaw_ege": pd.read_csv(boaw_egemaps_path, header=None).iloc[:, 1:].values,
            # "audio_boaw_mfcc": pd.read_csv(boaw_mfcc_path, header=None).iloc[:, 1:].values,
            "audio_egemaps": pd.read_csv(egemaps_path, sep=";").iloc[:, 1:].values,
            "audio_mfcc": pd.read_csv(mfcc_path, sep=";").iloc[:, 1:].values,
            "audio_vgg16": pd.read_csv(vgg16_path).iloc[:, 1:].values,
            "audio_densenet": pd.read_csv(densenet_path).iloc[:, 1:].values,
            "visual_bovw_of": pd.read_csv(bovw_openface_pg_path, header=None).iloc[:, 1:].select_dtypes(include=np.number).values,
            "visual_of": pd.read_csv(openface_pg_path).select_dtypes(include=np.number) .values,
            "visual_resnet": loadmat(resnet_path)["feature"],
            "visual_vgg": loadmat(vgg_path)['feature'],
        }
        
        processed, pca_models, scalers, _ = preprocess_data(
            features, 
            pca_models, 
            scalers,
            target_lengths
        )
        
        save_to_h5file({p_id: processed}, split_base)
        
        del features, processed
        gc.collect() 

        print(f".... {p_id} done [processed and saved] ....")

In [35]:
get_data()

Calculating target temporal lengths...


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc0 in position 133: invalid start byte

In [24]:
with h5py.File('../storage/transform/train_processed.h5', 'r') as hf:
    for i in ['302', '303', '304', '305']:
        print(f"{i} features: ")
        print(f"audio_vgg16: {hf[i]['audio_vgg16'].shape}")
        print(f"audio_densenet: {hf[i]['audio_densenet'].shape}")
        print(f"visual_resnet: {hf[i]['visual_resnet'].shape}")
        print(f"visual_vgg: {hf[i]['visual_vgg'].shape}")
        print(f"audio_mfcc: {hf[i]['audio_mfcc'].shape}")
        print(f"audio_egemaps: {hf[i]['audio_egemaps'].shape}")
        print(f"visual_of: {hf[i]['visual_of'].shape}")

302 features: 
audio_vgg16: (758, 512)
audio_densenet: (758, 512)
visual_resnet: (22766, 512)
visual_vgg: (22766, 512)
audio_mfcc: (75878, 40)
audio_egemaps: (75876, 24)
visual_of: (22766, 53)
303 features: 
audio_vgg16: (985, 512)
audio_densenet: (985, 512)
visual_resnet: (29565, 512)
visual_vgg: (29565, 512)
audio_mfcc: (98528, 40)
audio_egemaps: (98526, 24)
visual_of: (29565, 53)
304 features: 
audio_vgg16: (788, 512)
audio_densenet: (788, 512)
visual_resnet: (23780, 512)
visual_vgg: (23780, 512)
audio_mfcc: (79258, 40)
audio_egemaps: (79256, 24)
visual_of: (23780, 53)
305 features: 
audio_vgg16: (1704, 512)
audio_densenet: (1704, 512)
visual_resnet: (51122, 512)
visual_vgg: (51122, 512)
audio_mfcc: (170398, 40)
audio_egemaps: (170396, 24)
visual_of: (51122, 53)


In [8]:
def get_data(split="train"):
    full_path = "../dataset/edaicwoz_labels/" + split + "_split.csv"
    split_file_df = pd.read_csv(full_path)
    
    for idx, row in enumerate(split_file_df.iterrows()):        
        p_id = row[1]["Participant_ID"]
        features_dir = f"../dataset/edaicwoz_participant/{p_id}_P/{p_id}_P/features"
        aggregated_features_for_participant = []
        
        # audio
        boaw_egemaps_path = os.path.join(features_dir, f"{p_id}_BoAW_openSMILE_2.3.0_eGeMAPS.csv") # 1
        boaw_mfcc_path = os.path.join(features_dir, f"{p_id}_BoAW_openSMILE_2.3.0_MFCC.csv") # 2
        egemaps_path = os.path.join(features_dir, f"{p_id}_OpenSMILE2.3.0_egemaps.csv") # 8
        mfcc_path = os.path.join(features_dir, f"{p_id}_OpenSMILE2.3.0_mfcc.csv") # 9
        vgg16_path = os.path.join(features_dir, f"{p_id}_vgg16.csv") # 10
        densenet_path = os.path.join(features_dir, f"{p_id}_densenet201.csv") # 6
        
        # visual
        bovw_openface_pg_path = os.path.join(features_dir, f"{p_id}_BoVW_openFace_2.1.0_Pose_Gaze_AUs.csv") # 3
        openface_pg_path = os.path.join(features_dir, f"{p_id}_OpenFace2.1.0_Pose_gaze_AUs.csv") # 7
        resnet_path = os.path.join(features_dir, f"{p_id}_CNN_ResNet.mat") # 4
        vgg_path = os.path.join(features_dir, f"{p_id}_CNN_VGG.mat") # 5

        ptsd_binary = row[1]["PCL-C (PTSD)"]
        
        features = {
            "audio_boaw_ege": pd.read_csv(boaw_egemaps_path, header=None).iloc[:, 2:].values,
            "audio_boaw_mfcc": pd.read_csv(boaw_mfcc_path, header=None).iloc[:, 2:].values,
            "audio_egemaps": pd.read_csv(egemaps_path, sep=";").iloc[:, 1:].values,
            "audio_mfcc": pd.read_csv(mfcc_path, sep=";").iloc[:, 1:].values,
            "audio_vgg16": pd.read_csv(vgg16_path).iloc[:, 2:].values,
            "audio_densenet": pd.read_csv(densenet_path).iloc[:, 2:].values,
            "visual_bovw_of": pd.read_csv(bovw_openface_pg_path, header=None).iloc[:, 2:].select_dtypes(include=np.number).values,
            "visual_of": pd.read_csv(openface_pg_path).select_dtypes(include=np.number) .values,
            "visual_resnet": loadmat(resnet_path)["feature"],
            "visual_vgg": loadmat(vgg_path)['feature'],
        }
        
        print(p_id)
        
        print(f"audio_egemaps: {features['audio_egemaps'].shape}")
        print(f"audio_mfcc: {features['audio_mfcc'].shape}")
        print(f"audio_vgg16: {features['audio_vgg16'].shape}")
        print(f"audio_densenet: {features['audio_densenet'].shape}")
        print(f"visual_bovw_of: {features['visual_bovw_of'].shape}")
        print(f"visual_of: {features['visual_of'].shape}")
        print(f"visual_resnet: {features['visual_resnet'].shape}")
        print(f"visual_vgg: {features['visual_vgg'].shape}")
        
        print(f"\ndesnsenet: {features['audio_densenet']}")
        print(f"\nvgg16: {features['audio_vgg16']}")
        print(f"\nvgg: {features['visual_vgg']}")
        
        # print(f"audio_boaw_ege: {features['audio_boaw_ege'].shape}")
        # print(f"audio_boaw_mfcc: {features['audio_boaw_mfcc'].shape}")
        # print(f"visual_bovw_of: {features['visual_bovw_of'].shape}")
        
        del features
        gc.collect()

In [9]:
get_data()

302
audio_egemaps: (75876, 24)
audio_mfcc: (75878, 40)
audio_vgg16: (758, 4096)
audio_densenet: (758, 1920)
visual_bovw_of: (7589, 100)
visual_of: (22766, 53)
visual_resnet: (22766, 2048)
visual_vgg: (22766, 4096)

desnsenet: [[4.13115500e-05 2.62318800e-04 1.19398500e-03 ... 2.32202140e-01
  6.62418370e-01 9.13221800e-01]
 [4.20029320e-05 4.18474300e-04 8.80428900e-04 ... 1.06265634e-01
  7.02257900e-01 6.60984800e-01]
 [3.20921600e-05 2.79641740e-04 4.78916070e-04 ... 1.27977520e-01
  1.58896650e+00 7.55539500e-01]
 ...
 [5.21376240e-05 3.94868750e-04 6.91148700e-04 ... 1.32203280e-01
  1.11729870e+00 4.82479960e-01]
 [2.53436600e-05 4.57050630e-04 1.03011260e-03 ... 2.56852100e-01
  1.04215840e+00 1.11969670e+00]
 [4.05782500e-05 4.35246800e-04 7.67430230e-04 ... 5.02721850e-01
  1.07464660e+00 3.51064830e-01]]

vgg16: [[0.         0.         0.         ... 2.8741732  0.24951726 0.        ]
 [0.         0.         0.         ... 2.2591581  0.         0.        ]
 [0.         0.     

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.