In [None]:
import numpy as np
import warnings
from IPython.display import Audio, display
from scipy.io import wavfile
import matplotlib.pyplot as plt
import librosa, librosa.display
import numpy as np
from scipy.signal import sawtooth

# --- Configuration -----------------------------------------------------------
og_len = 4000  # 4 seconds (presumably milliseconds -- not used in this cell; TODO confirm)
channels = 2   # Stereo audio (not used in this cell)
audio_path = "../../resources/vocals/withers1.wav"

# --- Load --------------------------------------------------------------------
# Silence scipy's warning about WAV chunks it does not understand.
warnings.simplefilter("ignore", wavfile.WavFileWarning)
sr_loaded, y = wavfile.read(audio_path)  # y has shape (samples, channels) if stereo

# Use the rate stored in the file for everything downstream. A hard-coded
# 44100 would play back (and pitch-track) at the wrong speed for any file
# recorded at a different rate.
sr = sr_loaded

# Convert to float *before* taking the absolute value: on raw int16 PCM,
# np.abs(-32768) wraps back to -32768 and the peak would be understated.
y_float = y.T.astype(np.float32)  # (channels, samples) layout for Audio()
peak = np.max(np.abs(y_float))

# Peak-normalize to [-1, 1]; leave an all-zero (silent) file untouched instead
# of dividing by zero.
waveform = y_float / peak if peak > 0 else y_float

display(Audio(waveform, rate=sr))


In [None]:
# Downmix to mono by averaging the channel axis (axis 1 of the loaded array).
if y.ndim > 1:
    y = np.mean(y, axis=1)

# Normalize to float in [-1, 1]. A silent clip (peak == 0) is left as-is so we
# do not divide by zero and hand an all-NaN signal to the pitch tracker.
y = y.astype(np.float32)
peak = np.max(np.abs(y))
if peak > 0:
    y /= peak

# Framing shared by pitch tracking, RMS and the time axis. Passing the same
# values everywhere keeps f0, rms and times aligned frame-for-frame instead of
# relying on three separate library defaults happening to agree.
frame_length = 2048
hop_length = frame_length // 4  # 512, the value pyin and rms default to here

# Probabilistic YIN pitch tracking; f0 is NaN on frames judged unvoiced.
f0, voiced_flag, voiced_probs = librosa.pyin(
    y,
    fmin=librosa.note_to_hz('C2'),
    fmax=librosa.note_to_hz('C6'),
    frame_length=frame_length,
    hop_length=hop_length,
    sr=sr
)

# Per-frame RMS energy, used later as the amplitude envelope.
rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
times = librosa.times_like(f0, sr=sr, hop_length=hop_length)

plt.figure(figsize=(12, 4))
plt.plot(times, f0, label='Pitch (Hz)')
# RMS is scaled by 1000 only so it is visible on the same axis as the pitch.
plt.plot(times, rms * 1000, label='Amplitude x1000')
plt.xlabel("Time (s)")
plt.ylabel("Pitch (Hz) / scaled amplitude")
plt.legend()
plt.title("Detected Pitch and Amplitude")
plt.show()

# Oscillator resynthesis

In [None]:


# Drive a sawtooth oscillator with the tracked pitch and shape it with the
# tracked RMS envelope.

# Same sampling rate (sr) and hop length as the analysis cell.
hop_length = 512  # frame_length // 4 for frame_length=2048, i.e. what pyin used
duration = len(y) / sr

# Time stamp of every audio sample: sample i sits at i / sr seconds.
# (np.linspace(0, duration, len(y)) would include the endpoint and stretch the
# spacing to duration / (N - 1), drifting by one sample over the clip.)
t = np.arange(len(y)) / sr

# Interpolate frame-rate f0 and RMS up to the audio sample rate.
times = librosa.times_like(f0, sr=sr, hop_length=hop_length)
valid_idx = ~np.isnan(f0)  # pyin marks unvoiced frames with NaN
if not valid_idx.any():
    # np.interp cannot work with zero sample points; fail with a clear message.
    raise ValueError("No voiced frames detected; cannot build a pitch contour to resynthesize.")
# Unvoiced gaps are bridged by linear interpolation between voiced neighbours.
interp_f0 = np.interp(t, times[valid_idx], f0[valid_idx])
interp_rms = np.interp(t, times, rms)

# Integrate instantaneous frequency to get oscillator phase in radians.
phase = 2 * np.pi * np.cumsum(interp_f0) / sr
carrier = sawtooth(phase)

# Apply amplitude envelope (RMS)
# NOTE(review): IPython's Audio peak-normalizes by default, so this gain may
# not change playback loudness -- confirm.
resynth = carrier * interp_rms * 2.5  # scale factor for audibility

display(Audio(resynth, rate=sr))


# Additive synthesis: add harmonics

In [None]:
# Build the tone as a sum of sine partials at integer multiples of the
# fundamental phase, each weighted by 1/order.
num_harmonics = 5
harmonics = np.zeros_like(resynth)
for order in range(1, num_harmonics + 1):
    partial = np.sin(order * phase)
    # Accumulate in place, exactly like `harmonics += ...`.
    np.add(harmonics, (1.0 / order) * partial, out=harmonics)

# Shape the harmonic stack with the interpolated RMS envelope, then audition.
envelope_gain = 0.6
resynth_harm = harmonics * interp_rms * envelope_gain
display(Audio(resynth_harm, rate=sr))

In [None]:
from scipy.signal import iirpeak, lfilter

def apply_formant_filter(signal, sr, formants):
    """Apply a simple parallel formant filter bank to a mono signal.

    Each formant becomes one second-order peaking filter (``iirpeak``). The
    input is run through every filter independently and the outputs are
    averaged.

    Parameters
    ----------
    signal : array_like
        Input samples. Integer input is processed in float64; floating-point
        (and complex) input keeps its dtype in the result.
    sr : int or float
        Sampling rate of ``signal``, in the same unit as the formant
        frequencies.
    formants : iterable of (f, bw)
        Center frequency ``f`` and bandwidth ``bw`` of each resonance. The
        filter quality factor is ``f / bw``. ``f`` must lie strictly between
        0 and ``sr / 2``.

    Returns
    -------
    numpy.ndarray
        Filtered signal with the same shape as ``signal``. All zeros when
        ``formants`` is empty.

    Raises
    ------
    ValueError
        Propagated from ``iirpeak`` when a center frequency is outside
        ``(0, sr / 2)``.
    """
    x = np.asarray(signal)
    # Accumulate in a float buffer: adding float filter output into an integer
    # array in place is rejected by NumPy's casting rules.
    out_dtype = x.dtype if np.issubdtype(x.dtype, np.inexact) else np.float64
    out = np.zeros(x.shape, dtype=out_dtype)

    formants = list(formants)
    if not formants:
        # Nothing to sum; avoid the 0/0 that averaging would produce.
        return out

    for f, bw in formants:
        b, a = iirpeak(f / (sr / 2), f / bw)  # normalized center frequency, Q = f/bw
        out += lfilter(b, a, x)

    # Average the parallel branches so adding formants does not raise the level.
    out /= len(formants)
    return out

# Resonances for the vowel "ah", one (center frequency, bandwidth) pair per
# formant, in the order the filter bank will apply them.
formants_ah = [
    (730, 80),
    (1090, 90),
    (2440, 120),
]

# Colour the harmonic resynthesis with the vowel filter bank and audition it.
resynth_vowel = apply_formant_filter(signal=resynth_harm, sr=sr, formants=formants_ah)
display(Audio(data=resynth_vowel, rate=sr))
