In [1]:
import torch
import soundfile as sf    # writing .wav file

In [22]:
def dsp_denoise_waveform(
    y: torch.Tensor,
    sr: int = 48000,
    noise_start: float = 0.2,
    noise_duration: float = 0.8,
    n_fft: int = 4096,
    hop_length: int = 128,
    win_length: int = 4096,
    alpha: float = 5,
    gain_floor: float = 0,
) -> torch.Tensor:
    """Suppress stationary noise in a waveform with a Wiener-like spectral gain.

    A noise power spectrum is estimated from a noise-only time window of ``y``;
    every STFT bin of the full signal is then scaled by
    ``(S / (S + N)) ** alpha`` (clamped from below by ``gain_floor``), where
    ``N`` is the estimated noise power and ``S = max(|Y|^2 - N, 0)``.

    Args:
        y: Waveform tensor. If it has more than one dimension it is averaged
            over dim 0 (assumed to be the channel axis) to obtain mono audio.
        sr: Sample rate in Hz, used to convert the noise window to samples.
        noise_start: Start of the noise-only window, in seconds (>= 0).
        noise_duration: Length of the noise-only window, in seconds (>= 0).
            The window is truncated at the end of the signal.
        n_fft: FFT size of the STFT.
        hop_length: Hop between STFT frames, in samples.
        win_length: Length of the Hann analysis/synthesis window.
        alpha: Exponent applied to the Wiener gain; larger values suppress
            low-SNR bins more aggressively.
        gain_floor: Minimum gain applied to any bin.

    Returns:
        The denoised mono waveform with the same number of samples as the
        (mono) input. It is divided by its peak only when the peak exceeds 1.

    Raises:
        ValueError: If ``noise_start`` or ``noise_duration`` is negative, if
            the noise window is empty, or if it is too short for the STFT.
    """
    device = y.device

    # 1. mono audio only
    if y.ndim > 1:
        y = y.mean(dim=0)  # average across dim 0 (assumed channel axis)

    # 2. pick a segment that only contains noise
    # Negative times would become negative slice indices and silently select
    # samples counted from the END of the signal, so reject them up front.
    if noise_start < 0 or noise_duration < 0:
        raise ValueError(
            "noise_start and noise_duration must be non-negative, got "
            f"noise_start={noise_start}, noise_duration={noise_duration}."
        )

    noise_start_sample = int(noise_start * sr)
    noise_end_sample = min(noise_start_sample + int(noise_duration * sr), len(y))

    noise_clip = y[noise_start_sample:noise_end_sample]

    if noise_clip.numel() == 0:  # check number of elements in tensor
        raise ValueError("Noise segment is empty. Check noise_start/noise_duration.")

    # torch.stft (center=True, reflect padding) pads n_fft // 2 samples on each
    # side, which requires the input to be strictly longer than that padding.
    if noise_clip.numel() <= n_fft // 2:
        raise ValueError(
            f"Noise segment has only {noise_clip.numel()} samples but the STFT "
            f"needs more than n_fft // 2 = {n_fft // 2}. "
            "Increase noise_duration or reduce n_fft."
        )

    # 3. STFT of the full signal and of the noise segment
    window = torch.hann_window(win_length, device=device)
    stft_kwargs = dict(
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        return_complex=True,
    )
    D = torch.stft(y, **stft_kwargs)
    D_noise = torch.stft(noise_clip, **stft_kwargs)

    # 4. Estimate noise power spectral density from the noise-only region
    noise_power = (torch.abs(D_noise) ** 2).mean(dim=1, keepdim=True)  # [freq_bins, 1]

    # 5. Estimate signal power and compute Wiener-like gain
    eps = 1e-10  # small constant to avoid divide-by-zero
    noisy_power = torch.abs(D) ** 2  # instantaneous power per bin

    # estimate clean signal power (spectral subtraction, floored at zero)
    signal_power_est = torch.clamp(noisy_power - noise_power, min=0.0)

    # Wiener gain: S / (S + N)
    gain = signal_power_est / (signal_power_est + noise_power + eps)

    # alpha sharpens the gain curve; gain_floor is the minimum gain
    gain = torch.clamp(gain ** alpha, min=gain_floor)

    # 6. Apply the real-valued gain directly to the complex STFT. This scales
    #    the magnitude and keeps the noisy phase, without an explicit
    #    magnitude/phase decomposition.
    D_denoised = D * gain

    # 7. Invert back to the time domain
    y_denoised = torch.istft(
        D_denoised,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window=window,
        length=y.shape[-1],
    )

    # 8. Normalize output to avoid clipping (only when the peak exceeds 1)
    max_val = torch.max(torch.abs(y_denoised)) + eps
    if max_val > 1.0:
        y_denoised = y_denoised / max_val

    return y_denoised

def dsp_denoise_file(
    input_path: str,
    output_path: str,
    noise_start: float = 0.2,
    noise_duration: float = 0.8,
    target_sr: int = 48000,
    num_passes: int = 12,
    alpha: float = 5,
    gain_floor: float = 0,
    device: str = "cpu",
):
    """Denoise an audio file with repeated passes of ``dsp_denoise_waveform``.

    The file is read with ``soundfile``, down-mixed to mono, denoised
    ``num_passes`` times in a row and written to ``output_path`` at the
    file's own sample rate. Each pass re-estimates the noise profile from the
    same time window of the previous pass's output. STFT settings are the
    defaults of ``dsp_denoise_waveform``.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path of the audio file to write.
        noise_start: Start of the noise-only window, in seconds.
        noise_duration: Length of the noise-only window, in seconds.
        target_sr: Expected sample rate in Hz. No resampling is performed; a
            warning is emitted when the file's rate differs from this value.
        num_passes: Number of consecutive denoising passes.
        alpha: Gain exponent forwarded to ``dsp_denoise_waveform``.
        gain_floor: Minimum gain forwarded to ``dsp_denoise_waveform``.
        device: Torch device on which the waveform is processed.
    """
    import warnings  # local import keeps this block self-contained

    # 1. Load audio and down-mix to mono by averaging axis 1 of the array
    #    returned by soundfile.
    y_np, sr = sf.read(input_path)
    if y_np.ndim > 1:
        y_np = y_np.mean(axis=1)

    # target_sr is not used for resampling; make a mismatch visible instead of
    # silently processing (and saving) at a different rate than requested.
    if sr != target_sr:
        warnings.warn(
            f"{input_path} has sample rate {sr} Hz, not target_sr={target_sr} Hz; "
            "no resampling is performed and the file's own rate is used.",
            stacklevel=2,
        )

    # 2. Convert to a float32 torch tensor on the chosen device
    y = torch.from_numpy(y_np).to(device).float()

    # 3. Apply DSP denoise, feeding each pass's output into the next one
    for _ in range(num_passes):
        y = dsp_denoise_waveform(
            y=y,
            sr=sr,
            noise_start=noise_start,
            noise_duration=noise_duration,
            alpha=alpha,
            gain_floor=gain_floor,
        )

    # 4. Save result at the file's original sample rate
    sf.write(output_path, y.cpu().numpy(), sr)

In [23]:
# Paths of the noisy input recording and of the denoised file to be written.
input_path = "noisy_audio.wav"
output_path = "denoised.wav"

# Denoise the recording. The noise profile is estimated from the 0.7 s window
# that starts at 0.2 s; num_passes, alpha, gain_floor and device are left at
# the defaults of dsp_denoise_file.
# NOTE(review): noise_duration is 0.7 s here, while dsp_denoise_file's own
# default is 0.8 s.
dsp_denoise_file(
    input_path=input_path,
    output_path=output_path,
    noise_start=0.2,
    noise_duration=0.7,
    target_sr=48000,
)

print("Done! Saved to", output_path)

Done! Saved to denoised.wav


In [24]:
from transformers import pipeline

# Build a speech-recognition pipeline around the openai/whisper-small
# checkpoint; `pipe` is reused by the later cells.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small"
)

# Transcribe the noisy recording, the denoised output and the clean recording
# so the three transcripts can be compared line by line.
noisy_result = pipe("noisy_audio.wav")
print(noisy_result["text"])

denoised_result = pipe("denoised.wav")
print(denoised_result["text"])

true_result = pipe("clean_audio.wav")
print(true_result["text"])

Device set to use cuda:0


 Six spins of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob.
 6 bins of fresh snow peas, 5-6 slabs of blue cheese, and maybe a snack for her brother Bob.
 Six spoons of fresh snow peas, five thick slabs of blue cheese and maybe a snack for her brother Bob.


In [25]:
# Grid search to determine alpha and gain_floor (scored by WER against the Whisper transcript of the clean audio)
import soundfile as sf
import numpy as np
import torch
import re

# Process tensors on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Get "ground truth" transcript from clean audio using Whisper.
#    The reference is therefore the ASR output for the clean file, so the WER
#    computed against it measures agreement with that transcript.
true_result = pipe("clean_audio.wav")
ref_text = true_result["text"]
print("Reference transcript:", ref_text)

# 2. Small helper: normalize text & compute WER
def normalize_text(t):
    """Return ``t`` lower-cased, stripped of punctuation and whitespace-collapsed.

    Every character other than ``a-z``, ``0-9``, whitespace and the apostrophe
    is replaced by a space; runs of whitespace are then squeezed into a single
    space and the ends are trimmed.
    """
    lowered = t.lower()
    words_only = re.sub(r"[^a-z0-9\s']", " ", lowered)
    return re.sub(r"\s+", " ", words_only).strip()

def wer(ref, hyp):
    """Word error rate of ``hyp`` against ``ref``.

    Both strings are passed through ``normalize_text`` and split on
    whitespace. The result is the word-level Levenshtein distance divided by
    the number of reference words (at least 1), so it can exceed 1.0 when the
    hypothesis is much longer than the reference.
    """
    ref_words = normalize_text(ref).split()
    hyp_words = normalize_text(hyp).split()

    # Rolling-row Levenshtein: prev_row[j] is the distance between the
    # reference words consumed so far and the first j hypothesis words.
    prev_row = list(range(len(hyp_words) + 1))
    for i, ref_word in enumerate(ref_words, start=1):
        curr_row = [i]
        for j, hyp_word in enumerate(hyp_words, start=1):
            mismatch = 0 if ref_word == hyp_word else 1
            curr_row.append(
                min(
                    prev_row[j] + 1,             # deletion
                    curr_row[j - 1] + 1,         # insertion
                    prev_row[j - 1] + mismatch,  # substitution
                )
            )
        prev_row = curr_row

    return prev_row[-1] / max(len(ref_words), 1)


# 3. Evaluate a given (alpha, gain_floor) on one file
def evaluate_params(
    alpha,
    gain_floor,
    noisy_path="noisy_audio.wav",
    noise_start=0.2,
    noise_duration=0.8,
):
    """Denoise one recording with the given settings and score its transcript.

    The recording is loaded, down-mixed to mono, denoised with a single pass
    of ``dsp_denoise_waveform`` and transcribed with the module-level ``pipe``.
    The transcript is compared with the module-level ``ref_text`` using
    ``wer``.

    Args:
        alpha: Gain exponent forwarded to ``dsp_denoise_waveform``.
        gain_floor: Minimum gain forwarded to ``dsp_denoise_waveform``.
        noisy_path: Audio file to evaluate.
        noise_start: Start of the noise-only window, in seconds.
        noise_duration: Length of the noise-only window, in seconds.

    Returns:
        ``(wer_value, hypothesis_text)`` for the denoised audio.
    """
    # load noisy audio and down-mix to mono
    y_np, sr = sf.read(noisy_path)
    if y_np.ndim > 1:
        y_np = y_np.mean(axis=1)
    y = torch.from_numpy(y_np).float().to(device)

    # denoise (inference only, so autograd bookkeeping is disabled)
    with torch.no_grad():
        y_den = dsp_denoise_waveform(
            y=y,
            sr=sr,
            noise_start=float(noise_start),
            noise_duration=float(noise_duration),
            alpha=float(alpha),
            gain_floor=float(gain_floor),
        )

    y_den_np = y_den.cpu().numpy()

    # run whisper on denoised audio (pass numpy array directly)
    asr_out = pipe({"array": y_den_np, "sampling_rate": sr})
    hyp_text = asr_out["text"]

    return wer(ref_text, hyp_text), hyp_text


# 4. Grid search over alpha and gain_floor
alpha_values = np.linspace(5, 40, 8)      # 8 evenly spaced values: 5, 10, ..., 40
gain_values  = np.linspace(0.0, 0.8, 9)   # 9 evenly spaced values: 0.0, 0.1, ..., 0.8

# Best-so-far bookkeeping: lowest WER and the parameters/transcript behind it.
best_score = float("inf")
best_alpha = None
best_gain  = None
best_hyp   = None

# Exhaustive sweep: every (alpha, gain_floor) pair is evaluated once.
for a in alpha_values:
    for g in gain_values:
        score, hyp = evaluate_params(a, g)
        print(f"alpha={a:.2f}, gain_floor={g:.2f}, WER={score:.4f}")
        # Strict "<" keeps the first pair that reached a given WER on ties.
        if score < best_score:
            best_score = score
            best_alpha = a
            best_gain  = g
            best_hyp   = hyp

# Report the winning configuration and its transcript.
print("\nBest params:")
print("  alpha     =", best_alpha)
print("  gain_floor=", best_gain)
print("  best WER  =", best_score)
print("  hyp text  =", best_hyp)


Reference transcript:  Six spoons of fresh snow peas, five thick slabs of blue cheese and maybe a snack for her brother Bob.
alpha=5.00, gain_floor=0.00, WER=1.0000
alpha=5.00, gain_floor=0.10, WER=1.0000
alpha=5.00, gain_floor=0.20, WER=1.0000
alpha=5.00, gain_floor=0.30, WER=14.8000
alpha=5.00, gain_floor=0.40, WER=18.3500
alpha=5.00, gain_floor=0.50, WER=18.3500
alpha=5.00, gain_floor=0.60, WER=18.3500
alpha=5.00, gain_floor=0.70, WER=18.3500
alpha=5.00, gain_floor=0.80, WER=18.3500
alpha=10.00, gain_floor=0.00, WER=1.0000
alpha=10.00, gain_floor=0.10, WER=1.0000
alpha=10.00, gain_floor=0.20, WER=1.0000
alpha=10.00, gain_floor=0.30, WER=18.3500
alpha=10.00, gain_floor=0.40, WER=18.3500
alpha=10.00, gain_floor=0.50, WER=18.3500
alpha=10.00, gain_floor=0.60, WER=18.3500
alpha=10.00, gain_floor=0.70, WER=11.1500
alpha=10.00, gain_floor=0.80, WER=18.3500
alpha=15.00, gain_floor=0.00, WER=1.0000
alpha=15.00, gain_floor=0.10, WER=1.0000
alpha=15.00, gain_floor=0.20, WER=14.8000
alpha=15.0