In [11]:
!pip install kagglehub librosa soundfile pandas tqdm scikit-learn

import os
from pathlib import Path

import kagglehub
import librosa
import soundfile as sf
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split



In [12]:
# Download latest version of the dataset
path = kagglehub.dataset_download("sayuksh/denoising-audio-collection")
RAW_ROOT = Path(path)

print("RAW_ROOT:", RAW_ROOT)

print("\nTop-level contents:")
for p in RAW_ROOT.iterdir():
    print(" ", p.name)

Using Colab cache for faster access to the 'denoising-audio-collection' dataset.
RAW_ROOT: /kaggle/input/denoising-audio-collection

Top-level contents:
  trainset_56spk_txt
  noisy_trainset_56spk_wav
  noisy_testset_wav
  logfiles
  clean_trainset_28spk_wav
  clean_trainset_56spk_wav
  clean_testset_wav
  noisy_trainset_28spk_wav
  trainset_28spk_txt
  testset_txt
  license_text


In [25]:
TRAIN_CLEAN_ROOT = RAW_ROOT / "clean_trainset_28spk_wav"
TRAIN_NOISY_ROOT = RAW_ROOT / "noisy_trainset_28spk_wav"
TEST_CLEAN_ROOT  = RAW_ROOT / "clean_testset_wav"
TEST_NOISY_ROOT  = RAW_ROOT / "noisy_testset_wav"

print("TRAIN_CLEAN_ROOT:", TRAIN_CLEAN_ROOT)
print("TRAIN_NOISY_ROOT:", TRAIN_NOISY_ROOT)
print("TEST_CLEAN_ROOT :", TEST_CLEAN_ROOT)
print("TEST_NOISY_ROOT :", TEST_NOISY_ROOT)

# recursive search for .wav
train_clean_files = sorted(list(TRAIN_CLEAN_ROOT.rglob("*.wav")))
train_noisy_files = sorted(list(TRAIN_NOISY_ROOT.rglob("*.wav")))
test_clean_files  = sorted(list(TEST_CLEAN_ROOT.rglob("*.wav")))
test_noisy_files  = sorted(list(TEST_NOISY_ROOT.rglob("*.wav")))

print("\n# train clean:", len(train_clean_files))
print("# train noisy:", len(train_noisy_files))
print("# test clean :", len(test_clean_files))
print("# test noisy :", len(test_noisy_files))

print("\nSample train clean files:")
for f in train_clean_files[:5]:
    print(" ", f)

TRAIN_CLEAN_ROOT: /kaggle/input/denoising-audio-collection/clean_trainset_28spk_wav
TRAIN_NOISY_ROOT: /kaggle/input/denoising-audio-collection/noisy_trainset_28spk_wav
TEST_CLEAN_ROOT : /kaggle/input/denoising-audio-collection/clean_testset_wav
TEST_NOISY_ROOT : /kaggle/input/denoising-audio-collection/noisy_testset_wav

# train clean: 11572
# train noisy: 11572
# test clean : 824
# test noisy : 824

Sample train clean files:
  /kaggle/input/denoising-audio-collection/clean_trainset_28spk_wav/clean_trainset_28spk_wav/p226_001.wav
  /kaggle/input/denoising-audio-collection/clean_trainset_28spk_wav/clean_trainset_28spk_wav/p226_002.wav
  /kaggle/input/denoising-audio-collection/clean_trainset_28spk_wav/clean_trainset_28spk_wav/p226_003.wav
  /kaggle/input/denoising-audio-collection/clean_trainset_28spk_wav/clean_trainset_28spk_wav/p226_004.wav
  /kaggle/input/denoising-audio-collection/clean_trainset_28spk_wav/clean_trainset_28spk_wav/p226_005.wav


In [19]:
TARGET_SR = 16000      # Whisper sample rate
SEGMENT_SEC = 2.0      # seconds per chunk

OUT_ROOT = Path("/content/data_16k")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

def load_mono_resample(path, target_sr=16000):
    """Load audio, convert to mono, resample, normalize to [-1, 1]."""
    y, sr = librosa.load(path, sr=None, mono=True)
    if sr != target_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
        sr = target_sr
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    return y, sr

def segment_audio(y, sr, segment_sec=None):
    """Split waveform into fixed-length segments, padding last one if needed."""
    if segment_sec is None:
        return [y]
    seg_len = int(sr * segment_sec)
    n = len(y)
    segments = []
    if n <= seg_len:
        padded = np.zeros(seg_len, dtype=y.dtype)
        padded[:n] = y
        return [padded]
    for start in range(0, n, seg_len):
        end = start + seg_len
        chunk = y[start:end]
        if len(chunk) < seg_len:
            padded = np.zeros(seg_len, dtype=y.dtype)
            padded[:len(chunk)] = chunk
            chunk = padded
        segments.append(chunk)
    return segments

In [20]:
def make_pairs_recursive(clean_root: Path, noisy_root: Path):
    """
    Recursively find .wav files under clean_root and noisy_root,
    and pair them by filename.
    """
    clean_files = list(clean_root.rglob("*.wav"))
    noisy_files = list(noisy_root.rglob("*.wav"))

    print(f"Found {len(clean_files)} clean and {len(noisy_files)} noisy wav files")

    # Map noisy files by filename
    noisy_by_name = {p.name: p for p in noisy_files}

    pairs = []
    missing = 0
    for cpath in clean_files:
        fname = cpath.name
        if fname in noisy_by_name:
            pairs.append((str(cpath), str(noisy_by_name[fname])))
        else:
            missing += 1
    print(f"Paired {len(pairs)} files, missing matches for {missing} clean files")
    return pairs

train_pairs_full = make_pairs_recursive(TRAIN_CLEAN_ROOT, TRAIN_NOISY_ROOT)
test_pairs       = make_pairs_recursive(TEST_CLEAN_ROOT,  TEST_NOISY_ROOT)

if len(train_pairs_full) == 0 or len(test_pairs) == 0:
    raise RuntimeError("No pairs found — check dataset structure.")

# Split train into train / val
train_pairs, val_pairs = train_test_split(
    train_pairs_full, test_size=0.1, random_state=42
)

print("Train pairs:", len(train_pairs))
print("Val pairs  :", len(val_pairs))
print("Test pairs :", len(test_pairs))

Found 11572 clean and 11572 noisy wav files
Paired 11572 files, missing matches for 0 clean files
Found 824 clean and 824 noisy wav files
Paired 824 files, missing matches for 0 clean files
Train pairs: 10414
Val pairs  : 1158
Test pairs : 824


In [21]:
def process_split(pairs, split_name, out_root, target_sr=16000, segment_sec=2.0):
    split_root = out_root / split_name
    clean_out = split_root / "clean"
    noisy_out = split_root / "noisy"
    clean_out.mkdir(parents=True, exist_ok=True)
    noisy_out.mkdir(parents=True, exist_ok=True)

    records = []

    for clean_path, noisy_path in tqdm(pairs, desc=f"Processing {split_name}"):
        clean_wave, sr = load_mono_resample(clean_path, target_sr)
        noisy_wave, _ = load_mono_resample(noisy_path, target_sr)

        min_len = min(len(clean_wave), len(noisy_wave))
        clean_wave = clean_wave[:min_len]
        noisy_wave = noisy_wave[:min_len]

        clean_segs = segment_audio(clean_wave, sr, segment_sec)
        noisy_segs = segment_audio(noisy_wave, sr, segment_sec)

        base_id = Path(clean_path).stem

        for i, (cseg, nseg) in enumerate(zip(clean_segs, noisy_segs)):
            seg_id = f"{base_id}_seg{i:03d}"
            clean_out_path = clean_out / f"{seg_id}.wav"
            noisy_out_path = noisy_out / f"{seg_id}.wav"

            sf.write(clean_out_path, cseg, sr)
            sf.write(noisy_out_path, nseg, sr)

            records.append({
                "id": seg_id,
                "split": split_name,
                "clean_path": str(clean_out_path),
                "noisy_path": str(noisy_out_path),
                "sr": sr,
                "duration_sec": len(cseg) / sr,
            })

    return pd.DataFrame(records)

df_train = process_split(train_pairs, "train", OUT_ROOT, TARGET_SR, SEGMENT_SEC)
df_val   = process_split(val_pairs,   "val",   OUT_ROOT, TARGET_SR, SEGMENT_SEC)
df_test  = process_split(test_pairs,  "test",  OUT_ROOT, TARGET_SR, SEGMENT_SEC)

df_all = pd.concat([df_train, df_val, df_test], ignore_index=True)
metadata_path = OUT_ROOT / "metadata.csv"
df_all.to_csv(metadata_path, index=False)

print("Saved metadata to:", metadata_path)
df_all.head()

Processing train: 100%|██████████| 10414/10414 [08:23<00:00, 20.69it/s]
Processing val: 100%|██████████| 1158/1158 [00:49<00:00, 23.43it/s]
Processing test: 100%|██████████| 824/824 [00:32<00:00, 25.34it/s]


Saved metadata to: /content/data_16k/metadata.csv


Unnamed: 0,id,split,clean_path,noisy_path,sr,duration_sec
0,p270_362_seg000,train,/content/data_16k/train/clean/p270_362_seg000.wav,/content/data_16k/train/noisy/p270_362_seg000.wav,16000,2.0
1,p270_362_seg001,train,/content/data_16k/train/clean/p270_362_seg001.wav,/content/data_16k/train/noisy/p270_362_seg001.wav,16000,2.0
2,p268_271_seg000,train,/content/data_16k/train/clean/p268_271_seg000.wav,/content/data_16k/train/noisy/p268_271_seg000.wav,16000,2.0
3,p268_271_seg001,train,/content/data_16k/train/clean/p268_271_seg001.wav,/content/data_16k/train/noisy/p268_271_seg001.wav,16000,2.0
4,p227_138_seg000,train,/content/data_16k/train/clean/p227_138_seg000.wav,/content/data_16k/train/noisy/p227_138_seg000.wav,16000,2.0


In [22]:
import torch
from torch.utils.data import Dataset

class DenoisingDataset(Dataset):
    """Returns (noisy, clean) waveforms as [1, T] float tensors."""
    def __init__(self, metadata_csv, split="train"):
        self.df = pd.read_csv(metadata_csv)
        self.df = self.df[self.df["split"] == split].reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        noisy, sr_n = librosa.load(row["noisy_path"], sr=None, mono=True)
        clean, sr_c = librosa.load(row["clean_path"], sr=None, mono=True)
        assert sr_n == sr_c

        if np.max(np.abs(noisy)) > 0:
            noisy = noisy / np.max(np.abs(noisy))
        if np.max(np.abs(clean)) > 0:
            clean = clean / np.max(np.abs(clean))

        noisy_t = torch.from_numpy(noisy).float().unsqueeze(0)
        clean_t = torch.from_numpy(clean).float().unsqueeze(0)
        return noisy_t, clean_t

train_ds = DenoisingDataset(metadata_path, split="train")
noisy_ex, clean_ex = train_ds[0]
print("Noisy shape:", noisy_ex.shape)
print("Clean shape:", clean_ex.shape)

Noisy shape: torch.Size([1, 32000])
Clean shape: torch.Size([1, 32000])


In [27]:
# FINAL DATASET VALIDATION

print("Loading one sample from each split...\n")

train_ds = DenoisingDataset(metadata_path, split="train")
val_ds   = DenoisingDataset(metadata_path, split="val")
test_ds  = DenoisingDataset(metadata_path, split="test")

noisy_tr, clean_tr = train_ds[0]
noisy_val, clean_val = val_ds[0]
noisy_te, clean_te = test_ds[0]

print("---- TRAIN SAMPLE ----")
print("Noisy shape:", noisy_tr.shape)
print("Clean shape:", clean_tr.shape)
print("Noisy min/max:", noisy_tr.min().item(), noisy_tr.max().item())
print("Clean min/max:", clean_tr.min().item(), clean_tr.max().item())

print("\n---- VAL SAMPLE ----")
print("Noisy shape:", noisy_val.shape)
print("Clean shape:", clean_val.shape)

print("\n---- TEST SAMPLE ----")
print("Noisy shape:", noisy_te.shape)
print("Clean shape:", clean_te.shape)

# Confirm lengths match
assert noisy_tr.shape == clean_tr.shape, "Train noisy/clean length mismatch!"
assert noisy_val.shape == clean_val.shape, "Val noisy/clean length mismatch!"
assert noisy_te.shape == clean_te.shape, "Test noisy/clean length mismatch!"

# Confirm normalization
assert noisy_tr.max() <= 1.0 and noisy_tr.min() >= -1.0, "Noisy not normalized!"
assert clean_tr.max() <= 1.0 and clean_tr.min() >= -1.0, "Clean not normalized!"

print("\n DATASET CHECK PASSED")
print("Output format: [1, T]")
print("Values normalized to [-1, 1]")
print("Clean/Noisy aligned")

Loading one sample from each split...

---- TRAIN SAMPLE ----
Noisy shape: torch.Size([1, 32000])
Clean shape: torch.Size([1, 32000])
Noisy min/max: -0.9681386947631836 1.0
Clean min/max: -0.7856379747390747 1.0

---- VAL SAMPLE ----
Noisy shape: torch.Size([1, 32000])
Clean shape: torch.Size([1, 32000])

---- TEST SAMPLE ----
Noisy shape: torch.Size([1, 32000])
Clean shape: torch.Size([1, 32000])

 DATASET CHECK PASSED
Output format: [1, T]
Values normalized to [-1, 1]
Clean/Noisy aligned
