In [None]:
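# Raw IMU & Audio CSV → Pickle Preprocessing Pipeline
# (this cell pickles audio features only; IMU data is used for clock alignment and is trimmed but not saved here)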
import os
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.io import wavfile
from tqdm import tqdm

from utils import vggish_input, params

# ── Configuration ──
# Root of the raw recordings; load_csvs() looks for the CSVs in RAW_DIR / <pid>.
RAW_DIR    = Path("../../Data/Pilot_Data/1. RawDataset")
# Output root; window_and_save() writes the pickle into SAVE_DIR / <pid>.
SAVE_DIR   = Path("../../Data/Pilot_Data/3. PreprocessDataset_for_Audio")
# Participant processed by this run; kept as a string because it is used as a folder name.
PID        = "16"
# Forwarded as `sr` to vggish_input.wavform_to_examples — presumably the audio sample rate in Hz.
SR         = 16000
# Frequency band limits read from utils.params, forwarded as lower_edge_hertz / upper_edge_hertz.
LE_HZ      = params.MEL_MIN_HZ
UE_HZ      = params.MEL_MAX_HZ

# ── Helpers ──
def add_time_zero(df, time_col='Time', ux_col='UnixTime'):
    """Return a copy of *df* that contains a row with ``time_col == 0``.

    If such a row already exists, or *df* has no rows at all, an unchanged
    copy is returned.  Otherwise an anchor row is prepended: it is a copy of
    the first row with ``time_col`` set to 0 and ``ux_col`` moved back by
    ``int(first[time_col] * 1000)``.

    Args:
        df: Frame with at least the columns *time_col* and *ux_col*.
        time_col: Name of the relative-time column.
        ux_col: Name of the absolute-timestamp column.

    Returns:
        A new DataFrame; the input is never modified.

    Notes:
        * The ``* 1000`` conversion presumably means *time_col* is in seconds
          and *ux_col* in milliseconds — confirm against the recorder.
        * Every other column of the anchor row duplicates the first row's
          values (sensor/audio samples, event labels, ...).
        * The first row is taken as-is; rows are not sorted by time.
    """
    df = df.copy()
    # An empty frame has no first row to derive the anchor from; previously
    # this fell through to ``df.iloc[0]`` and raised IndexError.
    if len(df) == 0 or (df[time_col] == 0).any():
        return df

    first = df.iloc[0]
    # Elapsed time of the first sample, truncated to a whole number.
    offset = int(first[time_col] * 1000)

    anchor = first.copy()
    anchor[time_col] = 0
    anchor[ux_col] = first[ux_col] - offset
    return pd.concat([pd.DataFrame([anchor]), df], ignore_index=True)


def load_csvs(pid, raw_dir=None):
    """Load the IMU, audio and annotation CSVs of one participant.

    The files are looked up inside ``<raw_dir>/<pid>`` by suffix:
    ``*SensorData.csv``, ``*AudioData.csv`` and ``*annotation_modified.csv``.

    Args:
        pid: Participant ID, used as the sub-folder name.
        raw_dir: Folder containing the per-participant sub-folders.
            Defaults to the module-level ``RAW_DIR``.

    Returns:
        Tuple ``(imu_df, aud_df, ann_df)`` of DataFrames.

    Raises:
        FileNotFoundError: If any of the three files is missing.  (A bare
            ``next(glob(...))`` would leak an uninformative StopIteration.)
    """
    base = (RAW_DIR if raw_dir is None else Path(raw_dir)) / pid

    # Resolve all three paths before reading anything so that a missing file
    # is reported before any (potentially large) CSV is parsed.
    paths = []
    for pattern in ("*SensorData.csv", "*AudioData.csv", "*annotation_modified.csv"):
        # sorted() makes the choice deterministic if several files match;
        # glob order is otherwise filesystem-dependent.
        matches = sorted(base.glob(pattern))
        if not matches:
            raise FileNotFoundError(f"No file matching '{pattern}' found in {base}")
        paths.append(matches[0])

    imu_df, aud_df, ann_df = [pd.read_csv(p) for p in paths]
    return imu_df, aud_df, ann_df


def sync_and_trim(imu_df, aud_df, ann_df):
    """Align annotation timestamps to the IMU clock and cut data at session end.

    Steps:
      1. Make sure each frame has a ``Time == 0`` row (via ``add_time_zero``).
      2. Shift ``ann_df.UnixTime`` by the difference between the IMU and the
         annotation ``UnixTime`` at ``Time == 0``.  The audio timestamps are
         not shifted — presumably audio and IMU share a clock; confirm.
      3. Drop IMU and audio rows later than the last ``session_stop`` event.

    Args:
        imu_df, aud_df, ann_df: Frames with ``Time`` and ``UnixTime`` columns;
            ``ann_df`` additionally needs an ``Event`` column.

    Returns:
        Tuple ``(imu_df, aud_df, ann_df)``; the IMU and audio frames are
        re-indexed from 0.  The inputs are not modified.
    """
    import warnings

    imu_df = add_time_zero(imu_df)
    aud_df = add_time_zero(aud_df)
    ann_df = add_time_zero(ann_df)

    zero_imu = imu_df.loc[imu_df.Time == 0, 'UnixTime'].iloc[0]
    zero_ann = ann_df.loc[ann_df.Time == 0, 'UnixTime'].iloc[0]
    ann_df['UnixTime'] += zero_imu - zero_ann

    end_ts = ann_df.loc[ann_df.Event == 'session_stop', 'UnixTime'].max()
    if pd.isna(end_ts):
        # Without a 'session_stop' row max() is NaN, and `UnixTime <= NaN` is
        # False for every row — trimming would silently discard all data.
        warnings.warn(
            "No 'session_stop' event in annotations; IMU/audio left untrimmed.",
            stacklevel=2,
        )
    else:
        imu_df = imu_df[imu_df.UnixTime <= end_ts]
        aud_df = aud_df[aud_df.UnixTime <= end_ts]

    return imu_df.reset_index(drop=True), aud_df.reset_index(drop=True), ann_df


def extract_intervals(ann_df):
    """Pair up 'start'/'stop' annotation rows into activity intervals.

    Rows are scanned in frame order.  A 'start' records (or overwrites) the
    pending start time of its activity; a 'stop' closes the pending start of
    the same activity, if there is one.  Unmatched starts and stops are
    ignored, and rows with any other ``Event`` value are skipped.

    Args:
        ann_df: Frame with ``UnixTime``, ``Activity`` and ``Event`` columns.

    Returns:
        List of ``(start_time, stop_time, activity)`` tuples, ordered by the
        position of the closing 'stop' row.
    """
    is_marker = ann_df['Event'].isin(['start', 'stop'])
    markers = ann_df[is_marker]

    pending = {}   # activity -> UnixTime of its most recent unmatched 'start'
    closed = []
    rows = zip(markers['UnixTime'], markers['Activity'], markers['Event'])
    for ts, activity, event in rows:
        if event == 'start':
            pending[activity] = ts
            continue
        # Only 'stop' rows reach this point.
        if activity in pending:
            closed.append((pending.pop(activity), ts, activity))
    return closed


def label_audio_runs(aud_df, intervals):
    """Split the audio frame into consecutive runs that share one activity label.

    Every row starts as 'Other'; rows whose ``UnixTime`` lies inside an
    interval (both ends inclusive) take that interval's activity, with later
    intervals overriding earlier ones.  Consecutive rows carrying the same
    label form one run.

    Args:
        aud_df: Frame with a ``UnixTime`` column and one or more columns whose
            names start with ``'AudioData'``.  It is not modified.
        intervals: Iterable of ``(start, stop, activity)`` tuples.

    Returns:
        List of ``(samples, label)`` in row order, where ``samples`` is a 1-D
        int16 array holding the run's ``AudioData*`` values flattened row by
        row.
    """
    labelled = aud_df.copy()
    labelled['Activity'] = 'Other'
    for start_ts, stop_ts, activity in intervals:
        inside = (labelled['UnixTime'] >= start_ts) & (labelled['UnixTime'] <= stop_ts)
        labelled.loc[inside, 'Activity'] = activity

    audio_cols = [name for name in labelled.columns if name.startswith('AudioData')]

    # A new run begins wherever the label differs from the previous row's.
    label_changed = labelled['Activity'].ne(labelled['Activity'].shift())
    labelled['run_id'] = label_changed.cumsum()

    return [
        (run[audio_cols].to_numpy(dtype=np.int16).flatten(), run['Activity'].iat[0])
        for _, run in labelled.groupby('run_id')
    ]

def window_and_save(samples, pid):
    """Convert labelled audio runs into feature windows and pickle them.

    Each waveform is passed to ``vggish_input.wavform_to_examples``; the
    first axis of the returned array is treated as the number of windows, and
    the run's label is repeated once per window.  All windows are stacked and
    written, together with the labels, to
    ``SAVE_DIR/<pid>/<pid>_preprocessing_for_audio.pkl`` as a dict with the
    keys ``'Audio'`` and ``'Activity'``.

    Args:
        samples: Iterable of ``(waveform, label)`` pairs.
        pid: Participant ID; names the output folder and file.

    Notes:
        ``np.vstack`` raises ValueError when *samples* is empty.
        NOTE(review): waveforms arrive here as raw int16 — confirm that
        ``wavform_to_examples`` expects unscaled integer input.
    """
    save_d = SAVE_DIR / pid
    save_d.mkdir(parents=True, exist_ok=True)

    feature_blocks = []
    labels = []
    for waveform, activity in tqdm(samples, desc="Windowing audio"):
        examples = vggish_input.wavform_to_examples(
            waveform,
            lower_edge_hertz=LE_HZ,
            upper_edge_hertz=UE_HZ,
            sr=SR,
        )
        feature_blocks.append(examples)
        # One label per window produced from this run.
        labels.extend([activity] * examples.shape[0])

    X = np.vstack(feature_blocks)
    y = np.array(labels)

    out_path = save_d / f"{pid}_preprocessing_for_audio.pkl"
    with open(out_path, 'wb') as f:
        pickle.dump({'Audio': X, 'Activity': y}, f)
    print(f"Saved {X.shape} + {y.shape} to {save_d}")


# ── Pipeline ──
# 1) Read the IMU, audio and annotation CSVs of participant PID.
imu_df, aud_df, ann_df = load_csvs(PID)
# 2) Shift annotation timestamps onto the IMU clock and cut IMU/audio at the last 'session_stop'.
imu_df, aud_df, ann_df = sync_and_trim(imu_df, aud_df, ann_df)
# 3) Turn matching 'start'/'stop' annotation rows into (start, stop, activity) intervals.
intervals = extract_intervals(ann_df)
# 4) Label audio rows by interval and group consecutive same-label rows into runs.
samples = label_audio_runs(aud_df, intervals)
# 5) Window every run into features and pickle features + labels under SAVE_DIR / PID.
window_and_save(samples, PID)
