# Audio Diffusion Models


In [None]:
from pathlib import Path
import torch
import numpy as np
import sys
from torch import Tensor
from typing import Callable

%load_ext autoreload
%autoreload 2

## Utility functions

In [None]:
from pathlib import Path
import torchaudio as ta
import matplotlib.pyplot as plt
from tqdm import tqdm
from einops import rearrange, reduce
from IPython.display import Audio, Markdown
from ipywidgets import widgets
from ipywidgets import HBox, VBox
from audio_diffusion_pytorch import KarrasSchedule

# Device shared by every cell below: the first CUDA GPU when one is available,
# otherwise the CPU, so the notebook still runs on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def display_audio(signal: torch.Tensor, sr: int):
    """Show an inline audio player for `signal`, played back at sample rate `sr`.

    The leading dimension of `signal` is squeezed away (when it has size 1) and
    the tensor is moved to the CPU before being handed to IPython's `Audio`.
    """
    waveform = signal.squeeze(0).cpu()
    player = Audio(waveform, rate=sr)
    display(player)

def display_md(text: str):
    """Render `text` as Markdown in the notebook output area."""
    rendered = Markdown(text)
    display(rendered)

# Sine-waves example


## Model definition
The model we will be using is a 1-dimensional UNet, improved with attention layers.

In [None]:
# NOTE: audio_diffusion_pytorch needs to be version 0.0.43
from audio_diffusion_pytorch import AudioDiffusionModel, LogNormalDistribution

# Hyper-parameters of the 1-D UNet. The remarks below are read off the
# argument names -- confirm against the audio_diffusion_pytorch 0.0.43 docs.
model = AudioDiffusionModel(
    in_channels=1,  # single-channel input: training waves are (batch, 1, length)
    channels=64,
    patch_factor=16,
    patch_blocks=1,
    resnet_groups=8,
    kernel_multiplier_downsample=2,
    kernel_sizes_init=[1, 3, 7],
    multipliers=[1, 2, 4, 4, 4],
    factors=[4, 4, 2, 2],
    num_blocks=[2, 2, 2, 2],
    attentions=[False, False, False, True],  # presumably attention on the last level only
    attention_heads=4,
    attention_features=64,
    attention_multiplier=2,
    use_nearest_upsample=False,
    use_skip_scale=True,
    use_attention_bottleneck=True,
    # Distribution the training noise levels (sigmas) are drawn from
    diffusion_sigma_distribution=LogNormalDistribution(mean=-3.0, std=1.0),
    diffusion_sigma_data=0.2,
    diffusion_dynamic_threshold=0.0,
)

# Use the notebook-wide `device` rather than a second hard-coded "cuda:0", so
# the model always lives where the training and sampling tensors are created.
# The trailing semicolon keeps the notebook from echoing the module repr.
model.to(device);

## Training

In [None]:
def train_model(model, num_samples: int, batch_size: int, sample_length: int, device: "str | torch.device", learning_rate: float):
    """Train `model` in place to denoise randomly generated sine waves.

    Every iteration draws a fresh batch of sine waves with a random period and
    phase shift, corrupts it with Gaussian noise whose scale is sampled from
    `model.diffusion.sigma_distribution`, and takes one Adam step on the
    weighted mean-squared error between the denoised output and the clean wave.

    Args:
        model: Diffusion model exposing `diffusion.sigma_distribution`,
            `diffusion.denoise_fn` and `diffusion.loss_weight`.
        num_samples: Number of optimisation steps (batches), not individual
            waves; `num_samples * batch_size` waves are seen in total.
        batch_size: Number of sine waves generated per step.
        sample_length: Number of points per wave, evenly spaced over [0, 1].
        device: Device the training batch is moved to and the sigmas are
            sampled on.
        learning_rate: Learning rate of the Adam optimizer.
    """
    # Create optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(0.9, 0.99))

    # The time axis is identical for every batch, so build it once
    time_axis = torch.linspace(0, 1, sample_length)

    # Training loop
    model.train()
    for _ in tqdm(range(num_samples)):

        # Draw one period and one shift per wave, each of shape (batch_size, 1, 1)
        period, shift = torch.rand(size=(2, batch_size, 1, 1))

        # torch.rand samples from [0, 1) and can return exactly 0; dividing by
        # it would give an infinite frequency, a NaN wave and, after one
        # optimizer step, NaN weights. Clamping to machine epsilon only moves
        # that degenerate tail and leaves the RNG stream unchanged.
        period = period.clamp_min(torch.finfo(period.dtype).eps)

        # Broadcasts to a batch of shape (batch_size, 1, sample_length)
        x = torch.sin(2 * np.pi / period * (time_axis + shift)).to(device)

        # Add noise to input (x + sigma * eps)
        sigmas = model.diffusion.sigma_distribution(num_samples=batch_size, device=device)
        x_noisy = x + sigmas.view(-1, 1, 1) * torch.randn_like(x)

        # Zero your gradients for every batch!
        optimizer.zero_grad()

        # Compute denoised values
        x_denoised = model.diffusion.denoise_fn(x_noisy, sigmas=sigmas)

        # Per-wave MSE, weighted by the model's own loss_weight(sigmas)
        losses = torch.nn.functional.mse_loss(x_denoised, x, reduction="none")
        losses = reduce(losses, "b ... -> b", "mean")
        losses = losses * model.diffusion.loss_weight(sigmas)
        loss = losses.mean()

        # Compute the loss gradients
        loss.backward()

        # Adjust learning weights
        optimizer.step()

We can now run the training loop. In total we use 256000 sine-wave examples (1000 iterations with a batch of 256 waves each), every one with a random phase shift and a random period of at most one:

In [None]:
# Seed PyTorch's random number generator before training
torch.manual_seed(0)

# Run the optimisation: 1000 steps with 256 freshly generated waves per step
train_model(
    model=model,
    device=device,
    learning_rate=1e-4,
    num_samples=1000,
    batch_size=256,
    sample_length=4096,
)

# Report completion in the notebook output
display_md("**Training Complete!**")

## Sampling
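For sampling we can, as in Noise Conditional Score Networks (NCSN), use annealed Langevin dynamics: a fixed number of Langevin steps is run at each noise level, going through the noise levels in the order given by the schedule: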

In [None]:
@torch.no_grad()
def sample_with_annealed_langevin(noise: Tensor, denoise_fn: Callable, sigmas: Tensor, T: int = 100, eps: float = 2e-5):
    """Draw samples with annealed Langevin dynamics driven by a denoiser.

    Args:
        noise: Starting noise; it is scaled by the first entry of `sigmas`.
        denoise_fn: Callable invoked as `denoise_fn(x, sigma=...)` that returns
            a denoised estimate of `x`.
        sigmas: Noise levels, visited in the order given.
        T: Number of Langevin steps taken at every noise level.
        eps: Base step size; the step at level `sigma` is
            `eps * sigma**2 / sigmas[-1]**2`.

    Returns:
        The iterate after the last Langevin step of the last noise level.
    """
    sigma_last = sigmas[-1]

    # Start the chain from noise scaled to the first noise level
    sample = sigmas[0] * noise

    for sigma in tqdm(sigmas):
        # Step size relative to the last noise level
        step_size = eps * sigma**2 / sigma_last**2
        # Constant across the inner loop, so take the square root once
        noise_scale = torch.sqrt(step_size)

        for _ in range(T):
            # Score estimate from the denoiser (slightly different formulation from NCSN)
            denoised = denoise_fn(sample, sigma=sigma)
            score = (denoised - sample) / sigma**2

            # Langevin update: half a step along the score plus fresh Gaussian noise
            sample = sample + 0.5 * step_size * score + torch.randn_like(sample) * noise_scale

    return sample

In [None]:
# Seed PyTorch's random number generator before sampling
torch.manual_seed(1)

# Put the model in inference mode (no dropout, etc.)
model.eval()

# Initial Gaussian noise, laid out as (batch-size, num-sources, num-elements)
noise = torch.randn((4, 1, 4096), device=device)

# Denoiser of the trained model; the sampler derives its score estimate
# (the approximate grad-log-prob) from this function's output
gradlogp_fn = model.diffusion.denoise_fn

# Noise levels used by the sampler, from a 10-step Karras schedule
timesteps = KarrasSchedule(sigma_min=0.01, sigma_max=10.0, rho=5.0)(10, device)  # < faster convergence

# Sample from the learned distribution; the last schedule entry is left out
# (presumably a trailing zero sigma -- confirm against KarrasSchedule)
y1, y2, y3, y4 = sample_with_annealed_langevin(noise, gradlogp_fn, timesteps[:-1])


# Plot the four generated waveforms on a 2x2 grid
plt.figure(figsize=(10, 10))
for panel, waveform in enumerate((y1, y2, y3, y4), start=1):
    plt.subplot(2, 2, panel)
    plt.plot(waveform.view(-1).cpu())
plt.show()