In [None]:
import numpy as np
import librosa            # audio utilities
import soundfile as sf    # writing .wav file

In [None]:
def dsp_denoise_waveform(
    y: np.ndarray,
    sr: int = 48000,
    noise_start: float = 0.2,
    noise_duration: float = 0.8,  # length (s) of the region that contains noise only
    n_fft: int = 4096,
    hop_length: int = 128,
    win_length: int = 4096,
    alpha: float = 1.8,
    gain_floor: float = 0.1,
) -> np.ndarray:
    """Denoise a waveform with a Wiener-like spectral gain.

    A noise power spectrum is estimated from a region of ``y`` that is assumed
    to contain noise only. A per-bin gain ``S / (S + N)`` is then derived from
    it, sharpened with ``alpha``, floored at ``gain_floor`` and applied to the
    STFT of the whole signal before inverting back to the time domain.

    Parameters
    ----------
    y : np.ndarray
        Input audio. Arrays with more than one dimension are downmixed with
        ``librosa.to_mono`` (NOTE(review): that helper is expected to take a
        channels-first array -- confirm against the caller's layout).
    sr : int
        Sampling rate in Hz; must be positive.
    noise_start : float
        Start of the noise-only region in seconds; must be non-negative.
    noise_duration : float
        Length of the noise-only region in seconds. The region is clipped to
        the end of the signal.
    n_fft, hop_length, win_length : int
        STFT parameters forwarded to ``librosa.stft`` / ``librosa.istft``.
    alpha : float
        Exponent applied to the gain (> 1.0 = stronger suppression,
        < 1.0 = softer).
    gain_floor : float
        Lower bound of the gain applied to every time-frequency bin.

    Returns
    -------
    np.ndarray
        Denoised mono signal with the same number of samples as the (mono)
        input. It is rescaled by its peak only when that peak exceeds 1.0.

    Raises
    ------
    ValueError
        If ``sr`` is not positive, ``noise_start`` is negative, or the
        selected noise region is empty.
    """
    # 0. Reject values that would turn into negative slice indices below:
    #    those would silently pick the "noise" profile from the END of the
    #    signal instead of failing.
    if sr <= 0:
        raise ValueError("sr must be a positive sampling rate.")
    if noise_start < 0:
        raise ValueError("noise_start must be non-negative.")

    # 1. mono audio only
    if y.ndim > 1:
        y = librosa.to_mono(y)
    n_samples = len(y)

    # 2. pick a region that only contains noise (clipped to the signal end)
    noise_start_sample = int(noise_start * sr)
    noise_end_sample = min(noise_start_sample + int(noise_duration * sr), n_samples)
    noise_clip = y[noise_start_sample:noise_end_sample]

    if len(noise_clip) == 0:
        raise ValueError("Noise segment is empty. Check noise_start/noise_duration.")

    # 3. STFT of the full signal and of the noise segment (identical settings
    #    so the frequency bins line up)
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
    D_noise = librosa.stft(
        noise_clip, n_fft=n_fft, hop_length=hop_length, win_length=win_length
    )

    # 4. Noise power per frequency bin, averaged over the noise frames
    noise_power = np.mean(np.abs(D_noise) ** 2, axis=1, keepdims=True)  # [freq_bins, 1]

    # 5. Estimate clean-signal power by spectral subtraction, clamped at zero
    eps = 1e-10  # small constant to avoid divide-by-zero
    noisy_power = np.abs(D) ** 2
    signal_power_est = np.maximum(noisy_power - noise_power, 0.0)

    # 6. Wiener gain S / (S + N), sharpened by alpha and bounded from below
    gain = signal_power_est / (signal_power_est + noise_power + eps)
    gain = np.maximum(gain ** alpha, gain_floor)

    # 7. The gain is real and non-negative, so scaling the complex STFT
    #    directly scales the magnitude and keeps the noisy phase untouched.
    D_denoised = gain * D

    # 8. Back to the time domain; `length` makes istft return exactly as many
    #    samples as the input, so no manual trimming/padding is needed.
    y_denoised = librosa.istft(
        D_denoised, hop_length=hop_length, win_length=win_length, length=n_samples
    )

    # 9. Normalize only when the output would clip
    peak = np.max(np.abs(y_denoised)) + eps
    if peak > 1.0:
        y_denoised = y_denoised / peak

    return y_denoised

def dsp_denoise_file(
    input_path: str,
    output_path: str,
    noise_start: float = 0.2,
    noise_duration: float = 0.8,
    target_sr: int = 48000,
):
    """Denoise an audio file on disk and write the result to ``output_path``.

    The file at ``input_path`` is loaded as mono via ``librosa.load`` with
    ``sr=target_sr``, passed through ``dsp_denoise_waveform`` (STFT settings
    are left at that function's defaults), and the result is written with
    ``soundfile.write`` at the sampling rate returned by the loader.

    Parameters
    ----------
    input_path : str
        Path of the audio file to read.
    output_path : str
        Path the denoised audio is written to.
    noise_start : float
        Start (seconds) of the noise-only region, forwarded to the denoiser.
    noise_duration : float
        Length (seconds) of the noise-only region, forwarded to the denoiser.
    target_sr : int
        Sampling rate requested from ``librosa.load``.

    Returns
    -------
    None
    """
    # Load: mono, at the requested rate. Keep the rate the loader reports and
    # use it for every later step.
    audio, rate = librosa.load(input_path, sr=target_sr, mono=True)

    # Denoise using the caller-supplied noise window.
    cleaned = dsp_denoise_waveform(
        y=audio,
        sr=rate,
        noise_start=noise_start,
        noise_duration=noise_duration,
    )

    # Persist the result.
    sf.write(output_path, cleaned, rate)

In [None]:



# Run the denoiser on the sample recording and report where the result went.
input_path = "sample_data/test.wav"
output_path = "sample_data/outputfinal.wav"

# The window from 0.2 s lasting 0.7 s is used as the noise-only reference.
dsp_denoise_file(
    input_path,
    output_path,
    noise_start=0.2,
    noise_duration=0.7,
    target_sr=48000,
)

print("Done! Saved to", output_path)

Done! Saved to sample_data/output.wav


In [None]:
import librosa

# Sanity check: with sr=None the loader is asked not to resample, so the
# printed value is the rate reported for the file itself.
y, sr = librosa.load(path="sample_data/test.wav", sr=None)
print(sr)

48000
