## Vanilla inference script

In [None]:
# Vanilla inference script
from processing import *

# Run `inference` (star-imported from `processing`) with a fixed seed and a
# single sample. The following cell reads the result as spec["latents"][0, 0].
# NOTE(review): return_intermediate_steps=True is requested, but this section
# only reads spec["latents"] -- confirm the intermediates are needed here.
spec = inference(
    model_repo="darinchau/comp5421-project-sage-lake-20-comp5421-mel-spectrogram-step-2560",
    seed = 1,
    sample_count=1,
    verbose=True,
    return_intermediate_steps=True
)

In [None]:
from IPython.display import Audio
# Convert the generated spectrogram (first sample, first channel index) to a
# waveform at TARGET_SR.
audio = mel_to_audio(spec["latents"][0, 0], TARGET_SR)

# Peak-normalise so the loudest sample has magnitude 1. Skip the division for
# an all-zero signal: dividing by a zero peak would fill the array with NaNs.
peak = np.max(np.abs(audio))
if peak > 0:
    audio /= peak

# Must stay the last expression of the cell so the notebook renders the player.
Audio(audio, rate=TARGET_SR)

In [None]:
# Save audio as mp3
import os
from scipy.io import wavfile
os.makedirs("figures", exist_ok=True)
wavfile.write("figures/output.wav", TARGET_SR, audio)
!ffmpeg -i figures/output.wav figures/output.mp3

## Generate denoising animation

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import matplotlib
from processing import *

# Re-run `inference` with sample_steps=100 and intermediate steps returned.
# `make_animation` further down reads spec['intermediates'][0] to build one
# video frame per entry, and the plotting cell reads spec['latents'][0, 0].
# Note this rebinds the notebook-level `spec` from the previous section.
spec = inference(
    model_repo="darinchau/comp5421-project-sage-lake-20-comp5421-mel-spectrogram-step-2560",
    seed = 1,
    sample_count=1,
    verbose=True,
    sample_steps=100,
    return_intermediate_steps=True
)

In [None]:
import librosa
import matplotlib.pyplot as plt

def show_mel(mel, fig, ax):
    """Draw a mel spectrogram on ``ax`` and attach a dB colorbar to ``fig``.

    Args:
        mel: Array handed straight to ``librosa.display.specshow``. The
            colorbar is labelled in dB, so this is presumably a dB-scaled
            mel spectrogram -- confirm against the callers.
        fig: Figure that receives the colorbar.
        ax: Axes the spectrogram is drawn on.
    """
    # A bare ``import librosa`` does not expose ``librosa.display`` on older
    # librosa releases, so import the submodule explicitly.
    import librosa.display

    # Pass ``ax`` explicitly. Without it specshow draws on pyplot's *current*
    # axes, which matches the caller's axes only by coincidence.
    img = librosa.display.specshow(
        mel, sr=TARGET_SR, x_axis='time', y_axis='mel', ax=ax
    )
    fig.colorbar(img, ax=ax, format='%+2.0f dB')

def show_mel_normalized(mel, fig, ax):
    """Plot ``mel`` after a round trip through peak-normalised audio.

    The input is treated as dB values: it is clipped to [-80, 80], converted
    to amplitude, projected back to linear frequency bins with the
    pseudo-inverse of the mel filterbank, turned into a waveform with
    Griffin-Lim, scaled to unit peak, and finally re-analysed into a mel
    spectrogram that is drawn with ``show_mel``.

    Args:
        mel: Array exposing ``copy()`` and ``clip()``, with 128 rows to match
            the filterbank built here.
        fig: Figure forwarded to ``show_mel``.
        ax: Axes forwarded to ``show_mel``.
    """
    n_fft, hop, n_mels = 2048, 512, 128

    clipped_db = mel.copy().clip(min=-80., max=80.)
    mel_amplitude = librosa.db_to_amplitude(clipped_db)

    # Approximate inversion of the mel projection via the pseudo-inverse.
    filterbank = librosa.filters.mel(sr=TARGET_SR, n_fft=n_fft, n_mels=n_mels)
    linear_mag = np.dot(np.linalg.pinv(filterbank), mel_amplitude)

    # NOTE(review): Griffin-Lim is fed the *squared* magnitude here -- confirm
    # this matches the scaling used by the rest of the pipeline.
    waveform = librosa.griffinlim(linear_mag**2, hop_length=hop, n_iter=32)
    waveform /= np.max(np.abs(waveform))

    magnitude = np.abs(librosa.stft(waveform, n_fft=n_fft, hop_length=hop))
    remel = librosa.feature.melspectrogram(
        sr=TARGET_SR, S=magnitude**2, n_mels=n_mels
    )
    # NOTE(review): ``remel`` comes from a squared magnitude but is converted
    # with amplitude_to_db -- confirm this is the intended dB convention.
    show_mel(librosa.amplitude_to_db(remel), fig, ax)

# Plot the generated spectrogram (first sample, first channel index) on a
# fresh figure.
fig, ax = plt.subplots()
show_mel(spec['latents'][0, 0], fig, ax)

In [None]:
import os
import imageio

def make_animation():
    """Render every intermediate spectrogram in ``spec`` as one video frame.

    Reads the notebook-level ``spec['intermediates'][0]``, iterates over its
    first axis, draws each entry with ``show_mel`` under a "Step k" title and
    appends the rendered figure to ``figures/animation.mp4`` at 30 fps.
    """
    os.makedirs('figures', exist_ok=True)
    data = spec['intermediates'][0]

    # The context manager finalises the video file even if a frame fails.
    with imageio.get_writer('figures/animation.mp4', fps=30) as writer:
        for step, frame_mel in enumerate(data, start=1):
            fig, ax = plt.subplots()
            try:
                show_mel(frame_mel, fig, ax)
                ax.set_title(f"Step {step}")

                # Rasterise the figure, then read the pixels back.
                # tostring_rgb() was deprecated in Matplotlib 3.8 and removed
                # in 3.10; buffer_rgba() already has shape (height, width, 4),
                # so no reshape based on get_width_height() is needed.
                fig.canvas.draw()
                rgba = np.asarray(fig.canvas.buffer_rgba())

                # Drop alpha and copy out of the canvas buffer before the
                # figure is closed.
                writer.append_data(np.ascontiguousarray(rgba[..., :3]))
            finally:
                # Always release the figure, otherwise one leaks per step.
                plt.close(fig)


make_animation()

## Conversion test

In [None]:
from IPython.display import Audio

# Round trip: audio file -> mel spectrogram (audio2mel) -> waveform
# (mel_to_audio), then play the reconstruction.
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
audio_path = "D:/audio-dataset-v3/audio/dQw4w9WgXcQ.wav"
mel = audio2mel(audio_path)
audio = mel_to_audio(mel, TARGET_SR)
Audio(audio, rate=TARGET_SR)

In [None]:
# Play the original file from disk for comparison with the reconstruction.
Audio(audio_path)

In [None]:
# Save these two songs
import os
from scipy.io import wavfile
os.makedirs("figures", exist_ok=True)

# Input of the round trip: the original recording, loaded as mono at
# TARGET_SR and trimmed to the first 432*512-1 samples.
# NOTE(review): 432*512 presumably matches the mel length (432 frames at a
# hop of 512) -- confirm against audio2mel.
x, sr = librosa.load(audio_path, sr=TARGET_SR, mono=True)
start, end = 0, 432*512-1
wavfile.write("figures/convinput.wav", TARGET_SR, x[start:end])

# Output of the round trip: the waveform rebuilt from the mel spectrogram.
# The two file names were previously swapped (the reconstruction was saved as
# "convinput" and the original clip as "convoutput").
wavfile.write("figures/convoutput.wav", TARGET_SR, audio)

In [None]:
# Plot the mel spectrogram computed from the audio file on a fresh figure.
fig, ax = plt.subplots()
show_mel(mel, fig, ax)

## Denoise an existing song

In [None]:
from IPython.display import Audio
from processing import *

# NOTE(review): absolute, machine-specific path -- adjust before running on
# another machine.
audio_path = "D:/audio-dataset-v3/audio/dQw4w9WgXcQ.wav"
mel = audio2mel(audio_path)
# Run 10 sampling steps with sample_step_start=100.
# NOTE(review): `mel` is computed above but never passed to `inference`, so as
# written this call does not reference the existing song at all -- confirm how
# the starting spectrogram is supposed to be supplied.
spec = inference(
    model_repo="darinchau/comp5421-project-sage-lake-20-comp5421-mel-spectrogram-step-2560",
    seed = 1,
    sample_count=1,
    verbose=True,
    sample_steps=10,
    sample_step_start=100
)

In [None]:
# Convert the resulting spectrogram to a waveform and play it.
Audio(mel_to_audio(spec['latents'][0, 0], TARGET_SR), rate=TARGET_SR)

In [None]:
# Inspect the shape of the spectrogram at index [0, 0] of the result.
spec['latents'][0, 0].shape

In [None]:
# Play the reconstruction of the source song with samples limited to
# [-32768, 32768].
# NOTE(review): if an int16 range was intended, the upper bound should be
# 32767 -- confirm.
Audio(mel_to_audio(mel, TARGET_SR).clip(-32768, 32768), rate=TARGET_SR)

In [None]:
# Mean absolute sample value of the reconstruction (quick level check).
np.abs(mel_to_audio(mel, TARGET_SR)).mean()