In [1]:
import torch
import torchaudio
import torch.nn as nn
from typing import Optional, Callable
import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import sys; sys.path.insert(0, '../../src')
import temporal_audio_vae
import pghipy

In [2]:
def plot_spectrogram(specgram, title=None, ylabel="freq_bin", ax=None):
    if ax is None:
        _, ax = plt.subplots(1, 1)
    if title is not None:
        ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.imshow(
        librosa.power_to_db(specgram),
        origin="lower",
        aspect="auto",
        interpolation="nearest",
    )


def plot_spectrogram_diff(a, b, title=None):
    fig, ax = plt.subplots(1, 3)
    fig.set_size_inches(20, 5)
    if title:
        fig.suptitle(title)
    ax[0].set_title("A")
    ax[1].set_title("B")
    ax[2].set_title("diff")
    for i, s in enumerate([a, b, a - b]):
        image = ax[i].imshow(
            librosa.power_to_db(s),
            origin="lower",
            aspect="auto",
            interpolation="nearest",
        )
        fig.colorbar(image, ax=ax[i], orientation="horizontal", fraction=0.1)

In [11]:
FS = 48000

dataset = temporal_audio_vae.datasets.LoopDataset("../../data/loops/")

x = torch.cat([dataset[i] for i in range(48,49)])

transform = temporal_audio_vae.transforms.Log1pMelSpec(
    sample_rate=44100,
    n_mels=128,
    n_fft=1024,
    win_length=512,  # win_length * 2
    hop_length=256,  # => 1024 frames
    griffin_lim_iter=128,
)

ipd.display(ipd.Audio(x, rate=FS))

# transfo forward
specgram = transform.spectrogram(x)
mag, phase = torch.abs(specgram), torch.angle(specgram)
mel_specgram = transform.mel_scale(mag)  # dimension: (…, n_mels, time)

print(specgram.size())
print(mag.size())

# transfo backward
mag_hat = transform.inv_mel_scale(mel_specgram)
specgram_hat = mag * torch.exp(1j * phase)
x_hat_copy = transform.inv_spectrogram(specgram_hat)
x_hat_griffinlim = transform.griffin_lim(mag_hat)

# # Create Gaussian windows
# winpghi, gamma = pghipy.get_default_window(1024)
# winsynth = pghipy.calculate_synthesis_window(1024, 64, winpghi)

# # Estimate phase
# phase = pghi(S,win_length=NFFT,hop_length=HOP,gamma=gamma)

# # Invert
# S = S*np.exp(1.0j*phase)
# y_inv = istft(S,win_length=NFFT,hop_length=HOP,synthesis_window=winsynth)

# plot_spectrogram_diff(specgram, specgram_hat, "spectrogram")
print("copyphase")
ipd.display(ipd.Audio(x_hat_copy, rate=FS))
print("griffinlim")
ipd.display(ipd.Audio(x_hat_griffinlim, rate=FS))



torch.Size([513, 257])
torch.Size([513, 257])
copyphase


griffinlim


In [10]:

2**4

16

In [None]:
# test: forward then backward
html = """<table><thead><tr>
<td>id</td>
<td>original</td>
<td>copyphase</td>
<td>copyphase diff</td>
<td>griffinlim</td>
<td>griffinlim diff</td>
</tr></thead>"""
for i in [7809, 2016, 8888, 1234]:
    spec, phase = t.forward(dataset[i])
    # print(spec.shape, phase.shape)
    # plot_spectrogram(spec.cpu())
    t_copyphase = t.backward(spec, phase)
    t_griffinlim = t.backward(spec)
    html += "<tr>"
    html += f"<td>{i}</td>"
    html += "<td>" + ipd.Audio(dataset[i], rate=LoopDataset.FS)._repr_html_() + "</td>"
    html += "<td>" + ipd.Audio(t_copyphase.cpu(), rate=LoopDataset.FS)._repr_html_() + "</td>"
    html += "<td>" + ipd.Audio(t_copyphase.cpu() - dataset[i], rate=LoopDataset.FS)._repr_html_() + "</td>"
    html += "<td>" + ipd.Audio(t_griffinlim.cpu(), rate=LoopDataset.FS)._repr_html_() + "</td>"
    html += "<td>" + ipd.Audio(t_griffinlim.cpu() - dataset[i], rate=LoopDataset.FS)._repr_html_() + "</td>"
    html += "</tr>"
html += "</table>"

ipd.display(ipd.HTML(html))

NameError: name 't' is not defined