In [6]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from numpy.lib.stride_tricks import as_strided
from tqdm import tqdm
from utils import vggish_input, params
from typing import Tuple, List

# CONFIGURATION
# Dataset directories, all derived from a single data root.
DATA_ROOT = Path("../Data/Experiment_Data")
RAW_DIR = DATA_ROOT.joinpath("1_RawDataset")
SAVE_DIR = DATA_ROOT.joinpath("3_PreprocessDataset_Oversample")

# Activity labels, in a fixed order.
CLASS_LIST = [
    'Tooth_brushing',
    'Washing_hands',
    'Shower',
    'Wiping',
    'Vacuum_Cleaner',
    'Other',
]

# Oversampling multipliers per activity, grouped under string keys
# (presumably participant folder names -- confirm against the raw dataset).
OVERSAMPLE_FACTORS = {
    '202': {
        'Tooth_brushing': 1.4,
        'Washing_hands': 2.4,
        'Wiping': 2.7,
    },
    '206': {
        'Tooth_brushing': 2.5,
    },
    '209': {
        'Washing_hands': 1.5,
    },
}

# UTILITY FUNCTIONS
def add_time_zero_row(df: pd.DataFrame, time_col='Time', unixtime_col='UnixTime') -> pd.DataFrame:
    """
    Ensure a row with elapsed time zero exists; if missing, insert one.

    Works on a copy of ``df``. Both columns are coerced to numeric first
    (unparseable values become NaN). When no row has ``time_col == 0``, a
    copy of the first row is prepended with ``time_col`` set to 0 and
    ``unixtime_col`` back-dated by the first row's elapsed time scaled by
    1000 (i.e. elapsed seconds converted to the millisecond scale that the
    Unix timestamps appear to use).

    Args:
        df: Input table containing ``time_col`` and ``unixtime_col``.
        time_col: Name of the elapsed-time column.
        unixtime_col: Name of the absolute timestamp column.

    Returns:
        A new DataFrame; the caller's frame is never modified.

    Raises:
        IndexError: If ``df`` has no rows.
        ValueError: If the first row's elapsed time is NaN.
    """
    df = df.copy()
    df[time_col] = pd.to_numeric(df[time_col], errors='coerce')
    df[unixtime_col] = pd.to_numeric(df[unixtime_col], errors='coerce')

    if (df[time_col] == 0).any():
        return df

    first = df.iloc[0].copy()
    # Round instead of truncating: binary floating point makes e.g.
    # 1.001 * 1000 evaluate to 1000.9999999999999, which int() alone would
    # cut down to 1000 and shift the synthetic row by one millisecond.
    offset_ms = int(round(float(first[time_col]) * 1000))

    new = first.copy()
    new[time_col] = 0
    new[unixtime_col] = first[unixtime_col] - offset_ms
    return pd.concat([pd.DataFrame([new]), df], ignore_index=True)

def frame_signal(data: np.ndarray, window_length: int, hop_length: int) -> np.ndarray:
    """
    Cut a 2D (rows x columns) array into overlapping windows of rows.

    Inputs with fewer than ``window_length`` rows are zero-padded at the end
    up to exactly one window. The result has shape
    ``(n_frames, window_length, n_columns)`` and is built with stride tricks.
    """
    deficit = window_length - data.shape[0]
    if deficit > 0:
        # Too short for a single window: append zero rows to fill it.
        filler = np.zeros((deficit, data.shape[1]), dtype=data.dtype)
        data = np.vstack([data, filler])

    row_stride = data.strides[0]
    frame_count = (data.shape[0] - window_length) // hop_length + 1
    return as_strided(
        data,
        shape=(frame_count, window_length, data.shape[1]),
        # Moving to the next frame skips ``hop_length`` rows.
        strides=(hop_length * row_stride, *data.strides),
    )

def rebuild_waveform(df: pd.DataFrame) -> Tuple[np.ndarray, float]:
    """
    Flatten the ``AudioData*`` columns into one int16 waveform.

    Columns whose name starts with 'AudioData' are read row by row and
    concatenated in row-major order. The second element of the returned
    tuple is the first value of the 'UnixTime_s' column, as a float.
    """
    audio_columns = list(filter(lambda name: name.startswith('AudioData'), df.columns))
    samples = df[audio_columns].to_numpy(dtype=np.int16)
    unix_seconds = df['UnixTime_s'].to_numpy()

    waveform = samples.flatten()
    start_time = float(unix_seconds[0])
    return waveform, start_time

def generate_mel_chunks(waveform: np.ndarray,
                        sr: int = 16000,
                        lower_edge_hz: int = 10,
                        upper_edge_hz: int = 8000,
                        chunk_secs: int = 600) -> np.ndarray:
    """
    Compute mel spectrogram chunks for large waveform in time slices.

    The waveform is cut into consecutive slices of ``chunk_secs * sr``
    samples (the last slice may be shorter). Each slice is passed to
    ``vggish_input.wavform_to_concat_examples`` and the per-slice results
    are concatenated along axis 0. Progress is reported with tqdm.

    Args:
        waveform: Audio samples to process.
        sr: Sample rate, used for slicing and forwarded to the converter.
        lower_edge_hz: Lower frequency edge forwarded to the converter.
        upper_edge_hz: Upper frequency edge forwarded to the converter.
        chunk_secs: Length of each slice in seconds.

    Returns:
        The converter outputs for all slices, concatenated along axis 0.

    Raises:
        ValueError: If the slice length is not positive or ``waveform``
            holds no samples (there would be nothing to concatenate).
    """
    chunk_samples = int(chunk_secs * sr)
    if chunk_samples <= 0:
        raise ValueError(
            f"chunk_secs * sr must be a positive number of samples, got {chunk_samples}"
        )
    # Fail early with a clear message instead of letting np.concatenate
    # complain about an empty list after the loop.
    if np.size(waveform) == 0:
        raise ValueError("waveform is empty; no mel chunks can be generated")

    # Every start offset is < len(waveform), so each slice is non-empty.
    mel_list = [
        vggish_input.wavform_to_concat_examples(
            waveform[start:start + chunk_samples], lower_edge_hz, upper_edge_hz, sr
        )
        for start in tqdm(range(0, len(waveform), chunk_samples))
    ]

    return np.concatenate(mel_list, axis=0)

def proportional_oversample(sensor_arr: np.ndarray,
                             intervals: List[Tuple[float, float, str]],
                             target_act: str,
                             window_length: int,
                             hop_step: int,
                             factor: float,
                             rng=None) -> np.ndarray:
    """
    Generate additional frames for under-represented activity by adjusting hop length.

    For every interval labelled ``target_act``, the rows of ``sensor_arr``
    whose first column lies within ``[start, end]`` are framed twice: once
    with ``hop_step`` to count the baseline frames ``Ni``, and once with a
    smaller hop (``hop_step / factor``, at least 1) to build a denser pool.
    ``round((factor - 1) * Ni)`` frames are then drawn at random from that
    pool, with replacement only when the pool is too small.

    Args:
        sensor_arr: 2D array whose first column is compared against the
            interval bounds.
        intervals: ``(start, end, activity)`` tuples.
        target_act: Activity label to oversample.
        window_length: Frame length in rows.
        hop_step: Baseline hop between frames, in rows.
        factor: Desired multiplication factor; values giving no extra
            frames leave the interval untouched.
        rng: Optional random source exposing ``choice`` (for example
            ``np.random.default_rng(seed)``) for reproducible sampling.
            When omitted, the global ``np.random`` state is used.

    Returns:
        Array of shape ``(n_extra, window_length, n_columns)``. When nothing
        is sampled, an empty array with ``sensor_arr``'s dtype is returned.
    """
    choose = np.random.choice if rng is None else rng.choice
    extra = []

    for st, et, act in intervals:
        if act != target_act:
            continue

        mask = (sensor_arr[:, 0] >= st) & (sensor_arr[:, 0] <= et)
        seg = sensor_arr[mask]
        # Segments shorter than one window are skipped rather than padded.
        if seg.shape[0] < window_length:
            continue

        orig = frame_signal(seg, window_length, hop_step)
        Ni = orig.shape[0]
        Mi = int(round((factor - 1) * Ni))
        if Mi <= 0:
            continue

        new_hop = max(1, int(hop_step / factor))
        pool = frame_signal(seg, window_length, new_hop)

        # Sample without replacement whenever the pool is large enough.
        replace = pool.shape[0] < Mi
        idx = choose(pool.shape[0], Mi, replace=replace)
        # Fancy indexing copies the selected frames out of the strided view.
        extra.append(pool[idx])

    if extra:
        return np.vstack(extra)
    # Keep the input dtype so the empty result does not upcast later stacks.
    return np.empty((0, window_length, sensor_arr.shape[1]), dtype=sensor_arr.dtype)

# PREPROCESSING PIPELINE
# Per-participant preprocessing: align annotation and prediction timelines,
# derive labelled activity intervals, frame the IMU stream and compute mel
# features with timestamps for the audio stream.
if __name__ == '__main__':
    # Every sub-directory of RAW_DIR is treated as one participant.
    participants = [d.name for d in RAW_DIR.iterdir() if d.is_dir()]

    for pid in participants:
        raw_folder = RAW_DIR / pid
        save_folder = SAVE_DIR / pid
        os.makedirs(save_folder, exist_ok=True)

        # Load annotation & prediction tables.
        # next() takes the first glob match and raises StopIteration when
        # the folder has no matching file.
        anno_file = next(raw_folder.glob('*annotation.csv'))
        pred_file = next(raw_folder.glob('*Predicted_Activity.csv'))
        anno_df = pd.read_csv(anno_file, on_bad_lines='skip')
        pred_df = pd.read_csv(pred_file, on_bad_lines='skip')

        # Harmonize activity names
        anno_df['Activity'] = anno_df['Activity'].str.replace('Toothbrushing', 'Tooth_brushing')
        pred_df['Predict'] = pred_df['Predict'].str.replace(' ', '_')

        # Ensure time zero rows exist
        anno_df = add_time_zero_row(anno_df)
        pred_df = add_time_zero_row(pred_df)

        # Sync UnixTime scales: shift the annotation clock so that its
        # Time == 0 row has the same UnixTime as the prediction's.
        delta = pred_df.loc[pred_df.Time == 0, 'UnixTime'].iat[0] - anno_df.loc[anno_df.Time == 0, 'UnixTime'].iat[0]
        anno_df['UnixTime'] += delta

        # Truncate annotation at prediction end and append stop event.
        # The stop row starts as a copy of the latest prediction row, with
        # Event/Activity/Confirm overwritten.
        end_unix = pred_df['UnixTime'].max()
        anno_df = anno_df[anno_df['UnixTime'] <= end_unix]
        stop = pred_df.loc[pred_df.UnixTime.idxmax()].to_dict()
        stop.update({'Event': 'Session Stop', 'Activity': '', 'Confirm': ''})
        anno_df = pd.concat([anno_df, pd.DataFrame([stop])], ignore_index=True)

        # Convert UnixTime to seconds and compute intervals
        # (UnixTime / 1000 -- presumably milliseconds to seconds).
        anno_df['UnixTime_s'] = anno_df['UnixTime'] / 1000
        intervals = []
        stack = {}  # activity -> start time of its currently open interval
        for _, row in anno_df.query("Event not in ['Session Start','Session Stop']").iterrows():
            t, act = row.UnixTime_s, row.Activity
            if row.Event == 'Start':
                # A repeated 'Start' for the same activity overwrites the open one.
                stack[act] = t
            elif act in stack:
                # Any other event closes the open interval for this activity.
                intervals.append((stack.pop(act), t, act))

        # Process sensor data
        imu_file = next(raw_folder.glob('*SensorData.csv'))
        audio_file = next(raw_folder.glob('*AudioData.csv'))
        sensor_df = pd.read_csv(imu_file, on_bad_lines='skip')
        audio_df = pd.read_csv(audio_file, on_bad_lines='skip')

        # Clean and synchronize data: drop sensor rows with a missing value
        # in any of the first 17 columns, then cut both streams at the
        # session-stop timestamp.
        sensor_df = sensor_df.dropna(subset=sensor_df.columns[:17]).reset_index(drop=True)
        sensor_df = add_time_zero_row(sensor_df)
        audio_df = add_time_zero_row(audio_df)
        stop_ts = anno_df.loc[anno_df.Event == 'Session Stop', 'UnixTime'].iat[0]
        sensor_df = sensor_df[sensor_df.UnixTime <= stop_ts]
        audio_df = audio_df[audio_df.UnixTime <= stop_ts]
        # NOTE(review): this assigns a column on boolean-filtered frames,
        # which may emit pandas' SettingWithCopyWarning -- confirm.
        for df in (sensor_df, audio_df):
            df['UnixTime_s'] = df['UnixTime'] / 1000

        # Frame IMU data: 2.0 s windows with a 0.2 s hop at 50 Hz,
        # i.e. 100 rows per window and 10 rows per hop.
        imu_cols   = ['UnixTime_s','AccX','AccY','AccZ','GyroX','GyroY','GyroZ','RotVecX','RotVecY','RotVecZ']
        imu_arr = sensor_df[imu_cols].to_numpy()
        imu_sr     = 50
        window_sec = 2.0
        hop_sec    = 0.2
        iwlen      = int(window_sec * imu_sr)
        iwstep     = int(hop_sec * imu_sr)

        imu_frames = frame_signal(imu_arr, iwlen, iwstep)

        # Audio processing: rebuild the waveform, compute mel chunks and
        # assign one timestamp per entry along axis 0.
        waveform, audio_start = rebuild_waveform(audio_df)
        mel_chunks = generate_mel_chunks(waveform)
        n_mels = mel_chunks.shape[0]
        hop = params.STFT_HOP_LENGTH_SECONDS
        win = params.STFT_WINDOW_LENGTH_SECONDS
        # NOTE(review): timestamps use the STFT hop/window from params --
        # confirm this matches the spacing of the mel entries.
        timestamps = audio_start + np.arange(n_mels) * hop + win


100%|██████████| 50/50 [03:38<00:00,  4.37s/it]
