In [None]:
import os
import pickle as pkl
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
from numpy.lib.stride_tricks import as_strided
from utils import vggish_input, params

# Configurations
# All paths are relative, so they resolve against the current working directory.
# DATA_DIR: scanned by the main loop; each sub-directory is treated as one
#           participant and must contain a *SensorData.csv and *AudioData.csv.
# ANNO_DIR: holds <pid>/<pid>_Annotation_processed.csv for each participant.
# SAVE_DIR: receives <pid>/<pid>_preprocessing.pkl for each participant.
DATA_DIR = Path("../../Data/Experiment_Data/1. RawDataset")
ANNO_DIR = Path("../../Data/Experiment_Data/2. PreprocessDataset")
SAVE_DIR = Path("../../Data/Experiment_Data/5. PreprocessDataset_Window_Audio")
# Created up front (with parents) because the per-participant mkdir in the
# main loop does not create missing parent directories.
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Parameters
# IMU_SR:      samples per second used to convert WINDOW_SEC/HOP_SEC into IMU
#              sample counts (presumably the sensor's sampling rate -- confirm).
# WINDOW_SEC:  length of one IMU window, in seconds.
# HOP_SEC:     step between the starts of consecutive IMU windows, in seconds.
# SUB_SR:      sample rate handed to the mel front end; SUB_SR // 2 is also
#              used as the upper band edge.
# CHUNK_HOURS: hours of audio converted to mel features per chunk.
IMU_SR = 50
WINDOW_SEC = 2.0
HOP_SEC = 0.2
SUB_SR = 16000
CHUNK_HOURS = 1

# Label vocabulary. 'Other' additionally receives the part of each window
# that is not covered by any annotated interval.
CLASS_LIST = ['Tooth_brushing', 'Washing_hands', 'Shower', 'Wiping', 'Vacuum_Cleaner', 'Other']

# Helpers
def add_time_zero_row(df, tcol='Time', ucol='UnixTime'):
    """Return a copy of ``df`` that contains a row with ``tcol == 0``.

    Both ``tcol`` and ``ucol`` are coerced to numeric first (values that
    cannot be parsed become NaN).  If no row already has ``tcol == 0``, a
    copy of the first row is prepended with ``tcol`` set to 0 and ``ucol``
    moved back by ``int(tcol * 1000)``; the factor 1000 implies ``tcol`` is
    in seconds and ``ucol`` in milliseconds.

    Args:
        df: Input frame.  It is not modified.
        tcol: Name of the elapsed-time column.
        ucol: Name of the absolute-timestamp column.

    Returns:
        A new DataFrame.  When a row is prepended the index is renumbered
        from 0; otherwise the original index is kept.  An empty frame is
        returned as is (after numeric coercion).

    Raises:
        ValueError: If a row must be synthesised but the first row's
            ``tcol`` is NaN, so there is no elapsed time to extrapolate from.
    """
    df = df.copy()
    df[tcol] = pd.to_numeric(df[tcol], errors='coerce')
    df[ucol] = pd.to_numeric(df[ucol], errors='coerce')
    # Nothing to extrapolate from an empty frame, and nothing to add when a
    # zero-time row is already present.
    if df.empty or (df[tcol] == 0).any():
        return df
    # Select with a list so the first row stays a one-row DataFrame.  Going
    # through a Series (``df.iloc[0]``) would merge all column dtypes into
    # one, and the concat below would then upcast every column of the result
    # (to ``object`` as soon as the frame holds a non-numeric column).
    first = df.iloc[[0]].copy()
    offset = int(first[tcol].iloc[0] * 1000)
    first[ucol] = first[ucol] - offset
    first[tcol] = 0
    return pd.concat([first, df], ignore_index=True)


def frame(data, win_len, hop_len):
    """Slice a 2-D array into overlapping windows along its first axis.

    Args:
        data: Array of shape ``(n_samples, n_features)``.
        win_len: Number of samples per window.
        hop_len: Number of samples between the starts of consecutive windows.

    Returns:
        A read-only array of shape ``(n_frames, win_len, n_features)`` where
        ``n_frames = 1 + (n_samples - win_len) // hop_len``.  Trailing
        samples that do not fill a whole window are dropped.  Input shorter
        than ``win_len`` is zero-padded at the end so that exactly one
        window is produced.  The result is a strided view, not a copy.

    Raises:
        ValueError: If ``win_len`` or ``hop_len`` is not positive.
    """
    if win_len <= 0 or hop_len <= 0:
        raise ValueError(
            f"win_len and hop_len must be positive, got {win_len} and {hop_len}")
    n = data.shape[0]
    if n < win_len:
        pad = win_len - n
        data = np.vstack([data, np.zeros((pad, data.shape[1]))])
    n_frames = 1 + (data.shape[0] - win_len) // hop_len
    shape = (n_frames, win_len, data.shape[1])
    strides = (hop_len * data.strides[0],) + data.strides
    # Overlapping windows share memory, so a write through one window would
    # silently change its neighbours and the source array; hand out a
    # read-only view to rule that out.
    return as_strided(data, shape=shape, strides=strides, writeable=False)


def rebuild_waveform(df):
    """Concatenate the per-row audio samples of ``df`` into one waveform.

    Every column whose name starts with ``'AudioData'`` is used, in the
    order the columns appear; the values are cast to int16 and the rows are
    laid end to end.

    Returns:
        A ``(waveform, start)`` pair: the 1-D int16 sample array and the
        ``'UnixTime_s'`` value of the first row.
    """
    sample_columns = []
    for name in df.columns:
        if name.startswith('AudioData'):
            sample_columns.append(name)
    samples = df.loc[:, sample_columns].to_numpy(dtype=np.int16)
    waveform = samples.flatten()
    first_timestamp = df['UnixTime_s'].iloc[0]
    return waveform, first_timestamp


def generate_mel_chunks(wav, sr, low, high, hours):
    """Convert a long waveform to mel examples, one fixed-size chunk at a time.

    The waveform is cut into consecutive pieces of ``hours`` hours each (the
    last piece may be shorter), every piece is passed to
    ``vggish_input.wavform_to_concat_examples`` and the results are
    concatenated along axis 0.

    NOTE(review): each piece is converted on its own, so frames that would
    straddle a chunk boundary are presumably not produced -- confirm against
    ``vggish_input`` before deriving timestamps from the frame index.

    Args:
        wav: 1-D array of audio samples.
        sr: Sample rate of ``wav``.
        low: Forwarded as ``lower_edge_hertz``.
        high: Forwarded as ``upper_edge_hertz``.
        hours: Chunk length in hours.

    Returns:
        The per-chunk outputs concatenated along their first axis.

    Raises:
        ValueError: If the chunk length works out to less than one sample,
            or if ``wav`` is empty.
    """
    samples_per_chunk = int(hours * 3600 * sr)
    if samples_per_chunk <= 0:
        raise ValueError(
            f"chunk length must be at least one sample, got {samples_per_chunk}")
    if len(wav) == 0:
        # Fail with a clear message instead of numpy's "need at least one
        # array to concatenate".
        raise ValueError("cannot generate mel chunks from an empty waveform")
    # Every start offset is below len(wav), so each slice is non-empty.
    mel_list = [
        vggish_input.wavform_to_concat_examples(
            wav[start:start + samples_per_chunk],
            lower_edge_hertz=low, upper_edge_hertz=high, sr=sr)
        for start in tqdm(range(0, len(wav), samples_per_chunk))
    ]
    return np.concatenate(mel_list, axis=0)


# Main loop
# For every participant directory: load and align sensor, audio and
# annotation data, cut the IMU stream into windows, pair each window with the
# mel frames that start at the same time, label it by annotation overlap and
# pickle the result.
for pid in sorted(os.listdir(DATA_DIR)):
    pdir = DATA_DIR / pid
    if not pdir.is_dir():
        continue
    print(f"Processing {pid}...")

    # Load dataframes.  A directory without the raw CSVs is skipped instead
    # of aborting the whole run with StopIteration.
    sensor_csv = next(pdir.glob("*SensorData.csv"), None)
    audio_csv = next(pdir.glob("*AudioData.csv"), None)
    if sensor_csv is None or audio_csv is None:
        print(f"Skipping {pid}: missing SensorData/AudioData CSV")
        continue
    sensor = pd.read_csv(sensor_csv, engine='python')
    audio  = pd.read_csv(audio_csv, engine='python')
    anno   = pd.read_csv(ANNO_DIR / pid / f"{pid}_Annotation_processed.csv")

    # Clean sensor noise: drop rows that miss a value in the first 17 columns
    # or carry any value in the columns beyond them.
    exp = sensor.columns[:17]
    extra = sensor.columns[17:]
    mask = sensor[exp].isnull().any(axis=1) | sensor[extra].notnull().any(axis=1)
    sensor = sensor.loc[~mask].reset_index(drop=True)

    # Zero-time rows
    # NOTE(review): when the audio file has no Time == 0 row, the prepended
    # row is a copy of the first row, so its samples appear twice in the
    # rebuilt waveform -- confirm this is intended.
    sensor = add_time_zero_row(sensor)
    audio  = add_time_zero_row(audio)

    # Sync timestamps: shift audio and annotations so that their Time == 0
    # rows share the sensor's UnixTime.
    t0_s = sensor.loc[sensor.Time==0, 'UnixTime'].iloc[0]
    t0_a = audio .loc[audio .Time==0, 'UnixTime'].iloc[0]
    t0_n = anno  .loc[anno  .Time==0, 'UnixTime'].iloc[0]
    audio['UnixTime'] += (t0_s - t0_a)
    anno ['UnixTime'] += (t0_s - t0_n)

    # Filter by session stop
    endux = anno.loc[anno.Event=='Session Stop','UnixTime'].max()
    sensor = sensor[sensor.UnixTime <= endux].reset_index(drop=True)
    audio  = audio [audio .UnixTime <= endux].reset_index(drop=True)

    # Convert to seconds
    for df in (sensor, audio, anno):
        df['UnixTime_s'] = df['UnixTime'] / 1000.0

    # IMU frames; column 0 of every frame is the timestamp in seconds.
    imu_arr = sensor[['UnixTime_s','AccX','AccY','AccZ','GyroX','GyroY','GyroZ','RotVecX','RotVecY','RotVecZ']].to_numpy()
    wlen = int(WINDOW_SEC * IMU_SR)
    hlen = int(HOP_SEC * IMU_SR)
    imu_frames = frame(imu_arr, wlen, hlen)

    # Build intervals: pair each 'Start' with the next 'End' of the same
    # activity.  A repeated 'Start' overwrites the pending one; an 'End'
    # without a pending 'Start' is ignored.
    intervals = []
    stack = {}
    for _, r in anno.query("Event!='Session Start' and Event!='Session Stop'").iterrows():
        t, ev, act = r.UnixTime_s, r.Event, r.Activity
        if ev == 'Start':
            stack[act] = t
        elif ev=='End' and act in stack:
            intervals.append((stack.pop(act), t, act))

    # Rebuild waveform & mel chunks
    wav, start = rebuild_waveform(audio)
    mel = generate_mel_chunks(wav, SUB_SR, 10, SUB_SR//2, CHUNK_HOURS)

    # Timestamp for mel frames: frame k is stamped
    # start + k * mel_hop + mel_win.
    mel_hop = params.STFT_HOP_LENGTH_SECONDS_2sec
    mel_win = params.STFT_WINDOW_LENGTH_SECONDS_2sec
    n_mels = mel.shape[0]
    ts = start + np.arange(n_mels)*mel_hop + mel_win

    # Windowed examples & labels
    ex_len = int(params.EXAMPLE_WINDOW_SECONDS_2sec / mel_hop)
    X, Y = [], []
    for win in imu_frames:
        s_t, e_t = win[0,0], win[-1,0]
        i0 = np.searchsorted(ts, s_t)
        i1 = i0 + ex_len
        # Windows whose mel segment would run past the end are dropped, so
        # every kept segment has exactly ex_len rows and needs no padding.
        if i1 > n_mels:
            continue
        seg = mel[i0:i1]
        # Label by overlap: seconds of overlap per activity, the remainder of
        # the window goes to 'Other'.  On a tie the class listed first in
        # CLASS_LIST wins.  An annotated activity that is not in CLASS_LIST
        # raises KeyError here.
        counts = {c:0 for c in CLASS_LIST}
        for st, et, ac in intervals:
            counts[ac] += max(0, min(e_t, et)-max(s_t, st))
        covered = sum(counts.values())
        counts['Other'] += max(0, (e_t-s_t)-covered)
        label = max(counts, key=counts.get)
        X.append(seg)
        Y.append(label)

    # np.stack cannot handle an empty list; skip the participant rather than
    # crash the whole run.
    if not X:
        print(f"Skipping {pid}: no window has a complete mel segment")
        continue

    X = np.stack(X)
    Y = np.array(Y)
    print(pid, '→', X.shape, Y.shape)

    # Save
    out = SAVE_DIR / pid
    out.mkdir(exist_ok=True)
    with open(out / f"{pid}_preprocessing.pkl", 'wb') as fh:
        pkl.dump({'Audio':X, 'Activity':Y}, fh)
