In [1]:
import numpy as np
from IPython.display import Audio, display
from scipy.io import wavfile


In [3]:
# Load the source clip and normalise it to a float32 array of shape (channels, samples).
audio_path = "resources/audio1_pitchshift.wav"

# wavfile.read returns (sample_rate, data); data is 1-D for mono files and
# (samples, channels) for multi-channel files.
sr_loaded, y = wavfile.read(audio_path)
sr = sr_loaded  # sample rate in Hz, taken from the file instead of assuming 44100

# Convert to float32 *before* taking abs(): on integer PCM the most negative
# value has no positive counterpart, so np.abs() on the raw array can wrap.
# np.atleast_2d turns a mono (samples,) array into (1, samples).
y = np.atleast_2d(y.T).astype(np.float32)

# Peak-normalise to [-1, 1]; leave an all-zero (silent) clip untouched
# rather than dividing by zero and filling the array with NaNs.
peak = np.max(np.abs(y))
if peak > 0:
    y = y / peak

waveform = y
channels, og_len = waveform.shape  # channel count and length in samples

display(Audio(waveform, rate=sr))

In [6]:
# Time-stretch the clip by `scaling` with windowed overlap-add, then play/save it
# at a sample rate raised by the same factor.
win_len = 5000  # window length in samples (~113 ms at 44.1 kHz)
win_f = np.hanning(win_len)

anls_hop_len = 2000  # analysis hop in samples (~45 ms at 44.1 kHz)
scaling = 2 ** (5 / 12)  # 5 semitones
# Nominal synthesis hop. Frames are actually placed at int(i * scaling) below,
# so the per-frame truncation here does not accumulate into the placement.
synth_hop_len = int(anls_hop_len * scaling)

# Size the buffer from the untruncated product: the last frame starts at
# int(i_last * scaling) and may span win_len samples. Using
# n_frames * synth_hop_len instead under-allocates on long inputs because the
# truncation in synth_hop_len adds up frame after frame.
n_frames = int(np.ceil(og_len / anls_hop_len))
new_waveform_len = int(n_frames * anls_hop_len * scaling) + win_len
new_waveform = np.zeros((waveform.shape[0], new_waveform_len))  # (channels, samples)
new_scales = np.zeros(new_waveform_len)

# Loop through the analysis windows
for i in range(0, og_len, anls_hop_len):
    # the final windows may run past the end of the input
    clipped_len = min(win_len, og_len - i)

    # start position of this frame in the stretched output
    idx = int(i * scaling)

    # add the windowed frame to the new waveform
    new_waveform[:, idx:idx + clipped_len] += waveform[:, i:i + clipped_len] * win_f[None, :clipped_len]

    # add up windowing weights for normalization
    new_scales[idx:idx + clipped_len] += win_f[:clipped_len]

# Divide by the accumulated window weight; positions no window touched
# (weight exactly 0) are divided by 1 and stay 0.
new_waveform = new_waveform / np.where(new_scales == 0, 1, new_scales)

# One integer output rate shared by the saved file and the inline player.
out_sr = int(round(sr * scaling))

wav_to_save = new_waveform.T  # wavfile.write expects (samples, channels)
out_peak = np.max(np.abs(wav_to_save))
# Guard against an all-zero signal so the int16 conversion never sees NaN.
wav_int16 = np.int16(wav_to_save / (out_peak if out_peak > 0 else 1.0) * 32767)
wavfile.write("out/audio1_shifted.wav", out_sr, wav_int16)

display(Audio(new_waveform, rate=out_sr))