This notebook creates additional classes: silence and unknown.

In [3]:
import os
import shutil

In [2]:
# Target command words: folders with these names are never merged into 'unknown'.
LABELS = {
    "yes", "no",
    "up", "down",
    "left", "right",
    "on", "off",
    "stop", "go",
}
# The two additional classes this notebook creates.
OTHER_LABELS = ["silence", "unknown"]

# Dataset partitions; each one is looked up as a sub-directory of the data root.
SPLITS = ["train", "val", "test"]


In [3]:
def consolidate_unknown(root_dir: str, keep_labels=None, splits=None):
    """
    For each split under root_dir, move the contents of every class folder
    that is not a kept label into that split's 'unknown' folder, then remove
    the emptied folder.

    Args:
        root_dir: Directory that contains one sub-directory per split.
        keep_labels: Iterable of folder names to leave untouched. Defaults to
            LABELS plus OTHER_LABELS, so an already generated 'silence'
            folder is not merged into 'unknown' when this is (re-)run.
        splits: Iterable of split sub-directory names. Defaults to SPLITS.

    A split directory that does not exist is reported and skipped. When a
    file name is already taken inside 'unknown', the moved file gets a
    numeric suffix (name_1.ext, name_2.ext, ...) instead of overwriting.
    """
    if keep_labels is None:
        keep_labels = set(LABELS) | set(OTHER_LABELS)
    else:
        keep_labels = set(keep_labels)
    if splits is None:
        splits = SPLITS

    for split in splits:
        split_path = os.path.join(root_dir, split)
        if not os.path.isdir(split_path):
            print(f"Warning: '{split_path}' does not exist, skipping.")
            continue

        unknown_path = os.path.join(split_path, "unknown")
        os.makedirs(unknown_path, exist_ok=True)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # collision suffixes reproducible from run to run.
        for entry in sorted(os.listdir(split_path)):
            entry_path = os.path.join(split_path, entry)
            if not os.path.isdir(entry_path):
                continue
            # The destination itself and every kept class stay where they are.
            if entry == "unknown" or entry in keep_labels:
                continue

            print(f"  Moving '{split}/{entry}/' → '{split}/unknown/'")
            for filename in sorted(os.listdir(entry_path)):
                src_file = os.path.join(entry_path, filename)
                dst_file = os.path.join(unknown_path, filename)
                base, ext = os.path.splitext(filename)
                counter = 1
                # Different source folders can contain identical file names;
                # pick the first free suffixed name rather than overwrite.
                while os.path.exists(dst_file):
                    dst_file = os.path.join(
                        unknown_path, f"{base}_{counter}{ext}"
                    )
                    counter += 1
                shutil.move(src_file, dst_file)
            # Everything was moved out above, so the folder is empty now.
            os.rmdir(entry_path)

In [4]:
# Merge every non-target class folder of each split into that split's 'unknown' folder.
consolidate_unknown("../data-no-noise-no-silence")

  Moving 'train/bed/' → 'train/unknown/'
  Moving 'train/bird/' → 'train/unknown/'
  Moving 'train/cat/' → 'train/unknown/'
  Moving 'train/dog/' → 'train/unknown/'
  Moving 'train/eight/' → 'train/unknown/'
  Moving 'train/five/' → 'train/unknown/'
  Moving 'train/four/' → 'train/unknown/'
  Moving 'train/happy/' → 'train/unknown/'
  Moving 'train/house/' → 'train/unknown/'
  Moving 'train/marvin/' → 'train/unknown/'
  Moving 'train/nine/' → 'train/unknown/'
  Moving 'train/one/' → 'train/unknown/'
  Moving 'train/seven/' → 'train/unknown/'
  Moving 'train/sheila/' → 'train/unknown/'
  Moving 'train/six/' → 'train/unknown/'
  Moving 'train/three/' → 'train/unknown/'
  Moving 'train/tree/' → 'train/unknown/'
  Moving 'train/two/' → 'train/unknown/'
  Moving 'train/wow/' → 'train/unknown/'
  Moving 'train/zero/' → 'train/unknown/'
  Moving 'val/bed/' → 'val/unknown/'
  Moving 'val/bird/' → 'val/unknown/'
  Moving 'val/cat/' → 'val/unknown/'
  Moving 'val/dog/' → 'val/unknown/'
  Moving 

In [1]:
import librosa
import librosa.display  # explicit import: specshow lives here and older librosa releases do not auto-load it
import matplotlib.pyplot as plt
import noisereduce as nr
import numpy as np
import soundfile as sf

In [4]:
# Length, in seconds, of each audio chunk that becomes one spectrogram image.
CHUNK_DURATION_S = 1.0
# Folder holding the background-noise recordings. It can be overridden with the
# BG_NOISE_DIR environment variable so the notebook is not tied to one machine;
# the default is the original local path.
BG_DIR = os.environ.get(
    "BG_NOISE_DIR",
    "C:/Users/SPCX/Downloads/tensorflow-speech-recognition-challenge/train/train/audio/_background_noise_",
)
# Every recording is assigned to exactly one split.
SPLIT_FILES = {
    "train": ["doing_the_dishes.wav",
              "dude_miaowing.wav",
              "exercise_bike.wav",
              "pink_noise.wav"],
    "val":   ["running_tap.wav"],
    "test":  ["white_noise.wav"],
}

# Fixed colour-scale limits (dB) so all images share the same intensity mapping.
global_min_db = -40
global_max_db = 60
sampling_rate = 44100
# NOTE(review): frame_length / hop_length are not passed to librosa.stft below,
# which therefore runs with its library defaults — confirm this is intended.
frame_length = 1024
hop_length = 512

for split, files in SPLIT_FILES.items():
    out_dir = os.path.join("../data-no-noise-no-silence", split, "silence")
    os.makedirs(out_dir, exist_ok=True)

    for fname in files:
        print(f"Processing {fname}...")
        src_path = os.path.join(BG_DIR, fname)
        # Loading with sr=sampling_rate resamples the file to that rate.
        audio, sr = librosa.load(src_path, sr=sampling_rate)
        # NOTE(review): chunks are cut from the raw recording. The noise-reduction
        # and silence-trimming results computed here previously were never used,
        # so that (expensive, potentially failing) work is no longer performed.
        total_samples = len(audio)
        samples_per_chunk = int(sr * CHUNK_DURATION_S)
        # Integer division: a trailing partial chunk is dropped.
        n_chunks = total_samples // samples_per_chunk
        base = os.path.splitext(fname)[0]
        print(f"  Splitting into {n_chunks} chunks of {samples_per_chunk} samples each...")
        for i in range(n_chunks):
            out_name = f"{base}_{i:03d}.png"
            start = i * samples_per_chunk
            end = start + samples_per_chunk
            chunk = audio[start:end]
            stft = librosa.stft(chunk)
            stft_db = librosa.amplitude_to_db(np.abs(stft))
            # One grayscale image per chunk, without axes or padding.
            fig = plt.figure(figsize=(14, 5), dpi=400)
            librosa.display.specshow(stft_db, sr=sr, x_axis=None, y_axis=None,
                                    vmin=global_min_db, vmax=global_max_db, cmap='gray')
            plt.axis('off')
            plt.savefig(os.path.join(out_dir, out_name), bbox_inches='tight', pad_inches=0)
            # Close each figure so memory does not grow across hundreds of chunks.
            plt.close(fig)


Processing doing_the_dishes.wav...
  Splitting into 95 chunks of 44100 samples each...
Processing dude_miaowing.wav...
  Splitting into 61 chunks of 44100 samples each...
Processing exercise_bike.wav...
  Splitting into 61 chunks of 44100 samples each...
Processing pink_noise.wav...
  Splitting into 60 chunks of 44100 samples each...
Processing running_tap.wav...
  Splitting into 61 chunks of 44100 samples each...
Processing white_noise.wav...
  Splitting into 60 chunks of 44100 samples each...


- doing_the_dishes: 95s
- dude_miaowing: 61s
- exercise_bike: 61s
- pink_noise: 60s
- running_tap: 61s
- white_noise: 60s