In [3]:
import torch
import torchaudio

# torchaudio.load returns a (channels, samples) tensor by default.
audio, sample_rate = torchaudio.load('audio_16k/Basta_16k.wav')
# F.interpolate treats dim 0 as batch, dim 1 as channels, and resamples every
# remaining dim. With a single batch dim and 1-D 'linear' mode only the time
# axis is rescaled. The earlier 4-D 'bilinear' call applied the scale factor to
# the channel axis as well, which mixes channels (and, for a 2-channel file,
# collapses the output to a single row at factor 0.8).
audio = audio.unsqueeze(0)
audio_08 = torch.nn.functional.interpolate(audio, scale_factor=0.8, mode='linear')
audio_12 = torch.nn.functional.interpolate(audio, scale_factor=1.2, mode='linear')
# Saved with the ORIGINAL sample rate on purpose: the sample count changes but
# the playback rate does not, so both speed and pitch shift.
torchaudio.save('outputs/interpolation_0_8.wav', audio_08.squeeze(0), sample_rate)
torchaudio.save('outputs/interpolation_1_2.wav', audio_12.squeeze(0), sample_rate)

The interpolation results in a vector of a different size, but the sampling rate is not changed.
Listening to the files, we observed that interpolation with a factor of 0.8 resulted in faster audio, and interpolation with a factor of 1.2 resulted in slower audio. In addition, the pitch of the audio changed: the audio with a factor of 0.8 had a higher pitch, and the audio with a factor of 1.2 had a lower pitch. We understand this phenomenon as a result of the interpolation process: with a factor of 1.2 the waveform is stretched in time, which increases the distance between the signal's peaks and therefore decreases its frequency (resulting in a lower pitch) when played back at the unchanged sampling rate. The opposite is true for interpolation with a factor of 0.8.

In [5]:
import librosa
import torch
import torchaudio


def naive_tempo_shift(wav, factor, n_fft=2048, hop_length=512):
    """Naively change tempo by re-synthesising an STFT with a different hop.

    The signal is analysed with ``hop_length`` and inverted with
    ``int(hop_length / factor)``. Frames are left untouched (no phase
    correction), so this is only a baseline for comparison.

    Args:
        wav: waveform as a NumPy array or tensor; time is the last axis
            (1-D, or 2-D as ``(channels, samples)``).
        factor: speed factor. Values below 1 give a longer (slower) output,
            values above 1 a shorter (faster) one.
        n_fft: FFT / frame size used for both analysis and synthesis.
        hop_length: analysis hop in samples.

    Returns:
        Tensor with the same leading dims as ``wav`` and a rescaled last dim.

    Raises:
        ValueError: if ``factor`` yields a synthesis hop outside ``[1, n_fft]``.
    """
    # Guard the derived hop up front: 0 (huge factor) or > n_fft (tiny factor)
    # would otherwise fail deep inside torch.istft with an opaque error.
    synthesis_hop = int(hop_length / factor) if factor > 0 else 0
    if not 0 < synthesis_hop <= n_fft:
        raise ValueError(
            f"factor={factor!r} gives synthesis hop {synthesis_hop}; "
            f"it must be between 1 and n_fft={n_fft}"
        )

    # as_tensor shares memory with a NumPy input and passes tensors through.
    wav_tensor = torch.as_tensor(wav)

    # Explicit rectangular window: numerically the same as omitting the window,
    # but it avoids the "window was not provided" warning in recent torch.
    window = torch.ones(n_fft, dtype=wav_tensor.dtype, device=wav_tensor.device)

    # Complex STFT (magnitude and phase), not just a magnitude spectrogram.
    spec = torch.stft(wav_tensor, n_fft=n_fft, hop_length=hop_length,
                      window=window, return_complex=True)

    # Overlap-add the same frames at a different spacing to stretch/compress.
    return torch.istft(spec, n_fft=n_fft, hop_length=synthesis_hop, window=window)


# Load at 16 kHz, keeping the file's channel layout (mono=False).
wav, sr = librosa.load('audio_16k/Basta_16k.wav', sr=16000, mono=False)
# Stretch with both factors first, then write the two results out.
wav_08, wav_12 = naive_tempo_shift(wav, 0.8), naive_tempo_shift(wav, 1.2)
torchaudio.save('outputs/naive_pitch_shift_0_8.wav', wav_08, sr)
torchaudio.save('outputs/naive_pitch_shift_1_2.wav', wav_12, sr)



In [6]:
import math

import numpy as np
import torch
import torchaudio
import librosa


def construct_hann_window(win_size):
    """Return a 1-D Hann window of length ``win_size`` (torch.hann_window defaults)."""
    return torch.hann_window(win_size)


def get_complex_stft(signal, win_size, hop, window):
    """Compute the complex (x + jy) STFT of ``signal``.

    ``signal`` must already be a tensor. ``win_size`` is used as both the FFT
    size and the window length; ``hop`` is the stride between frames.
    """
    return torch.stft(
        signal,
        win_size,
        hop_length=hop,
        win_length=win_size,
        window=window,
        return_complex=True,
    )


def get_acc_phase_delta_1(stft_left, stft_right):
    """Accumulate the phase advance between two complex STFTs along time.

    Args:
        stft_left: complex STFT of the earlier-positioned signal.
        stft_right: complex STFT of the later-positioned signal, same shape.

    Returns:
        Real tensor of the same shape: the running sum over the last (frame)
        axis of ``angle(stft_right) - angle(stft_left)``, wrapped into
        ``[-pi, pi]``.
    """
    # Per-bin angular distance between the two STFTs.
    phase_delta = torch.angle(stft_right) - torch.angle(stft_left)

    # phase[0] = delta[0]; phase[i] = delta[i] + phase[i - 1].
    # cumsum implements this recursion directly, so no pre-filled buffer is
    # needed (and the input is no longer forced to be exactly 3-D).
    phase = torch.cumsum(phase_delta, dim=-1)

    # Wrap by subtracting the nearest multiple of 2*pi.
    two_pi = 2 * np.pi
    return phase - two_pi * torch.round(phase / two_pi)


def get_acc_phase_delta(stft_left, stft_right):
    """Accumulate the phase advance between two complex STFTs along time.

    Args:
        stft_left: complex STFT of the earlier-positioned signal.
        stft_right: complex STFT of the later-positioned signal, same shape.

    Returns:
        Real tensor of the same shape: the running sum over the last (frame)
        axis of ``angle(stft_right) - angle(stft_left)``, wrapped into
        ``[-pi, pi]``.
    """
    # phase_delta = angle(stft_right) - angle(stft_left)
    phase_delta = torch.angle(stft_right) - torch.angle(stft_left)

    # Recursive accumulation:
    #   phase[0] = phase_delta[0]
    #   phase[i] = phase_delta[i] + phase[i - 1]
    # cumsum is exactly this recursion, so no pre-filled buffer is needed.
    phase = torch.cumsum(phase_delta, dim=-1)

    # Wrap the ACCUMULATED phase (not the per-frame delta) by removing the
    # nearest multiple of 2*pi. Rounding phase_delta instead leaves the running
    # sum unbounded. The two differ only by whole turns, so cos/sin of the
    # result are unchanged.
    two_pi = 2 * math.pi
    return phase - two_pi * torch.round(phase / two_pi)


def get_re_im_from_phase(phase):
    """Return ``(cos(phase), sin(phase))``: the real and imaginary parts of a unit phasor."""
    return torch.cos(phase), torch.sin(phase)


def time_stretch(signal, factor, win_size=1024, hop=1024 // 4):
    """Phase-vocoder time stretch of a multi-channel waveform.

    Two STFTs are taken with analysis hop ``int(hop * factor)`` from copies of
    the signal offset by ``hop`` samples. Their phase difference is accumulated
    over frames, combined with the magnitude of the later STFT, and inverted
    with synthesis hop ``hop``.

    Args:
        signal: 2-D tensor indexed as ``[channel, sample]``.
        factor: speed factor. Below 1 gives a longer (slower) output, above 1
            a shorter (faster) one.
        win_size: FFT size and Hann window length.
        hop: synthesis hop, and the offset between the two analysed copies.

    Returns:
        Tensor of shape ``(channels, new_length)``.

    Raises:
        ValueError: if ``int(hop * factor)`` is smaller than 1.
    """
    # A zero or negative analysis hop would otherwise fail inside torch.stft.
    new_hop = int(hop * factor)
    if new_hop < 1:
        raise ValueError(
            f"factor={factor!r} with hop={hop} gives analysis hop {new_hop}; it must be >= 1"
        )

    # create window
    hann_window = construct_hann_window(win_size)

    # draw two complex STFTs, `hop` samples apart
    stft_left = get_complex_stft(signal[:, :-hop], win_size, new_hop, hann_window)
    stft_right = get_complex_stft(signal[:, hop:], win_size, new_hop, hann_window)

    # calculate accumulated phase delta and reconstruct the unit phasor from it
    phase = get_acc_phase_delta(stft_left, stft_right)
    re, im = get_re_im_from_phase(phase)

    # Scale each channel's phasor by THAT channel's magnitude. Stacking
    # [re, im] on dim 0 and multiplying by the (channels, freq, frames)
    # magnitude would pair the real part with channel 0 and the imaginary part
    # with channel 1. Element-wise products keep the channels separate and work
    # for any channel count.
    magnitude = torch.abs(stft_right)
    new_stft = torch.complex(magnitude * re, magnitude * im)

    # istft accepts a leading batch dim, so all channels are inverted at once.
    return torch.istft(new_stft, n_fft=win_size, hop_length=hop,
                       win_length=win_size, window=hann_window)


# Load the clip, stretch it with both factors, then save the two results.
wav, sr = torchaudio.load('audio_16k/Basta_16k.wav')
wav_08, wav_12 = time_stretch(wav, 0.8), time_stretch(wav, 1.2)
torchaudio.save("outputs/phase_vocoder_0_8.wav", wav_08, sr)
torchaudio.save('outputs/phase_vocoder_1_2.wav', wav_12, sr)
