# In [None]:
## Preprocess pickle files: add noise & convert to VGGish examples
import os
import pickle
import numpy as np
from pathlib import Path
from tqdm import tqdm
from utils import vggish_input

# Sample rate in Hz that the audio is subsampled to and that is passed to the
# VGGish front end as `sr`; also used as the last component of the output path.
SUB_SR = 16_000
# Name of the sub-directory processed inside each participant directory.
HAND = 'Right'
# Input root: iterated for one directory per participant.
BASE_DIR = Path("../../Data/Train_Data/2. TrainingDataset")
# Output root: results are written under <participant>/<HAND>/<SUB_SR>/.
OUTPUT_ROOT = Path("../../Data/Train_Data/6. AudioExamples_noise")
SNR_RANGE_DB = (8, 12)  # (low, high) in dB; one SNR is drawn uniformly per chunk
CHUNK_SECONDS = 10  # seconds per chunk

# Create the output root up front; no error if it already exists.
OUTPUT_ROOT.mkdir(parents=True, exist_ok=True)


def add_noise(wav: np.ndarray, snr_db: float) -> np.ndarray:
    """Add zero-mean Gaussian noise to `wav` at the requested SNR.

    The noise standard deviation is derived from the signal RMS as
    ``rms(wav) / 10 ** (snr_db / 20)``. Noise is drawn from NumPy's global
    random state, so seed it with ``np.random.seed`` for reproducible output.

    Args:
        wav: Audio samples. The result is clipped to the int16 range, so the
            samples are expected to be on an int16 amplitude scale.
        snr_db: Desired signal-to-noise ratio in decibels.

    Returns:
        An int16 array with the same shape as `wav`.
    """
    signal = wav.astype(np.float32)
    if signal.size == 0:
        # Nothing to perturb; also avoids taking the mean of an empty array,
        # which yields NaN and a RuntimeWarning.
        return signal.astype(np.int16)

    rms = np.sqrt(np.mean(signal ** 2))
    noise_rms = rms / (10 ** (snr_db / 20))
    noise = np.random.normal(0, noise_rms, size=signal.shape)
    noisy = signal + noise
    # Round to nearest before the integer cast: a bare astype() truncates
    # toward zero, which pulls every noisy sample toward zero by up to one LSB.
    return np.clip(np.rint(noisy), -32768, 32767).astype(np.int16)


def process_file(pkl_path: Path, out_dir: Path, source_sr: int = 16_000):
    """Convert one recording pickle into noisy VGGish examples and save them.

    The pickle's ``"Audio"`` entry is cast to int16, decimated towards
    ``SUB_SR``, and cut into chunks of ``CHUNK_SECONDS`` seconds. Each chunk
    gets Gaussian noise at an SNR drawn uniformly from ``SNR_RANGE_DB`` and is
    passed to ``vggish_input.wavform_to_examples``. The per-chunk results are
    stacked with ``np.vstack`` and pickled to ``out_dir / pkl_path.name``.
    Nothing is written when the recording contains no samples.

    Args:
        pkl_path: Pickle file holding a mapping with an ``"Audio"`` array.
        out_dir: Directory for the output pickle; created on demand.
        source_sr: Assumed sample rate of the stored audio in Hz.

    Raises:
        ValueError: If ``SUB_SR`` is higher than ``source_sr``.
    """
    # NOTE: pickle.load can execute arbitrary code embedded in the file; only
    # run this on trusted data.
    with open(pkl_path, "rb") as f:
        data = pickle.load(f)
    audio = data["Audio"].astype(np.int16)
    if len(audio) == 0:
        # An empty recording yields no chunks, so there is nothing to save.
        return

    # Subsample by plain integer decimation (no anti-aliasing filter). When
    # source_sr is not a multiple of SUB_SR the effective rate is
    # source_sr / factor rather than exactly SUB_SR.
    factor = source_sr // SUB_SR
    if factor < 1:
        raise ValueError(
            f"SUB_SR ({SUB_SR}) must not exceed the source sample rate ({source_sr})"
        )
    audio = audio[::factor]

    chunk_size = SUB_SR * CHUNK_SECONDS
    examples_list = []
    # audio is non-empty here, so this loop produces at least one chunk.
    # NOTE(review): the final chunk may be much shorter than CHUNK_SECONDS;
    # confirm wavform_to_examples tolerates very short input.
    for start in range(0, len(audio), chunk_size):
        chunk = audio[start:start + chunk_size]
        snr = np.random.uniform(*SNR_RANGE_DB)
        noisy_chunk = add_noise(chunk, snr)
        feats = vggish_input.wavform_to_examples(
            noisy_chunk,
            lower_edge_hertz=10,
            upper_edge_hertz=SUB_SR // 2,
            sr=SUB_SR
        )
        examples_list.append(feats)

    out_path = out_dir / pkl_path.name
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_path, "wb") as f:
        pickle.dump(np.vstack(examples_list), f)


# Walk every participant directory and convert the recordings of the selected
# hand. Participants without a HAND sub-directory are skipped.
for participant_dir in sorted(BASE_DIR.iterdir()):
    hand_dir = participant_dir / HAND
    if not hand_dir.is_dir():
        continue
    out_subdir = OUTPUT_ROOT / participant_dir.name / HAND / str(SUB_SR)
    # Filter and sort before handing the files to tqdm: a list gives the
    # progress bar a total that counts only files actually processed, and a
    # fixed order keeps seeded noise reproducible across runs.
    pkl_files = sorted(
        pkl for pkl in hand_dir.glob("*.pkl") if "Other" not in pkl.stem
    )
    for pkl_file in tqdm(pkl_files, desc=f"Processing {participant_dir.name}"):
        process_file(pkl_file, out_subdir)