# Multi-source Diffusion Models


## Applications
* **Music Generation** - Generate audio tracks by sampling from the learned distribution
* **Source Separation** - Separate audio tracks from a mixture
* **Source Inpainting** - Given a single instrument track, generate coherent tracks for the other instruments

In [None]:
from pathlib import Path
import torch

# Parent of the current working directory, resolved to an absolute path.
root_path = Path("..").resolve().absolute()
# NOTE: machine-specific absolute paths; adjust them to the local setup before running.
ckpts_path = Path("/home/irene/Documents/audio-diffusion-pytorch-trainer/logs/ckpts")
dataset_path = Path("/home/irene/Documents/audio-diffusion-pytorch-trainer/data/Slack/test/")
# Use the first GPU when CUDA is available; otherwise fall back to the CPU so
# the notebook still runs (more slowly) on machines without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

sampling_rate = 22050  # sample rate handed to load_audio / display_audio
length_samples = 262144  # number of samples per audio chunk (2 ** 18)
num_steps = 200  # number of steps requested from the noise schedule
num_sources = 2  # number of sources, used as the model's in_channels

In [None]:
from misc import hparams, display_audio
import main.module_base

# load the diffusion model; "in_channels" is overridden so that it matches the
# number of sources
model = main.module_base.Model(**{**hparams, "in_channels": num_sources})
# map_location remaps the stored tensors onto the configured device, so the
# checkpoint also loads when the device it was saved from is not available
model.load_state_dict(
    torch.load(ckpts_path/"drums_piano_epoch=358.pt", map_location=device)
)
model.to(device)
# the model is only used for sampling below: switch to inference mode
# (the trailing semicolon suppresses the notebook's echo of the module repr)
model.eval();

# Music Generation
A simple generation algorithm using the Euler solver:

In [None]:
@torch.no_grad()
def sample_with_euler(noise, denoise_fn, sigmas):
    """Integrate from scaled noise to a sample with deterministic Euler steps.

    Args:
        noise: Starting noise tensor; it is multiplied by ``sigmas[0]``.
        denoise_fn: Callable invoked as ``denoise_fn(x, sigma=sigma)``.
        sigmas: Sequence of noise levels, visited in order. Every level
            except the last is used as a divisor, so it must be non-zero.

    Returns:
        The state reached after one Euler step per consecutive pair of
        noise levels in ``sigmas``.
    """
    sample = sigmas[0] * noise

    # Walk over consecutive (current, next) noise levels.
    for sigma_cur, sigma_nxt in zip(sigmas[:-1], sigmas[1:]):
        denoised = denoise_fn(sample, sigma=sigma_cur)
        # Derivative of the state with respect to the noise level.
        slope = (sample - denoised) / sigma_cur
        # Explicit Euler update towards the next noise level.
        sample = sample + slope * (sigma_nxt - sigma_cur)

    return sample

Usually the addition of randomness helps with generation:

In [None]:
@torch.no_grad()
def sample_with_euler(noise, denoise_fn, sigmas, s_churn):
    """Euler sampler that injects fresh noise before every step.

    Args:
        noise: Starting noise tensor; it is multiplied by ``sigmas[0]``.
        denoise_fn: Callable invoked as ``denoise_fn(x, sigma=sigma)``.
        sigmas: Sequence of noise levels, visited in order.
        s_churn: Amount of stochasticity. The per-step relative increase of
            the noise level is ``s_churn / (len(sigmas) - 1)``, capped at
            ``sqrt(2) - 1``; ``0`` adds no noise.

    Returns:
        The state reached after one stochastic Euler step per consecutive
        pair of noise levels in ``sigmas``.
    """
    x = sigmas[0] * noise
    num_steps = len(sigmas) - 1
    max_gamma = 2 ** 0.5 - 1

    for sigma, sigma_next in zip(sigmas[:-1], sigmas[1:]):
        # Raise the noise level from sigma to sigma_hat ...
        gamma = min(s_churn / num_steps, max_gamma)
        sigma_hat = sigma * (gamma + 1)
        # ... and add exactly the variance needed to reach it.
        extra_std = (sigma_hat ** 2 - sigma ** 2) ** 0.5
        x_hat = x + torch.randn_like(x) * extra_std

        # Derivative evaluated at the raised noise level.
        denoised = denoise_fn(x_hat, sigma=sigma_hat)
        d = (x_hat - denoised) / sigma_hat

        # Explicit Euler update from sigma_hat to the next noise level.
        x = x_hat + d * (sigma_next - sigma_hat)

    return x

Using the above algorithms it is then possible to generate an audio chunk from the learned distribution:

In [None]:
from audio_diffusion_pytorch import KarrasSchedule

# initial Gaussian noise, shaped (batch-size, num-sources, num-elements)
noise = torch.randn(1, num_sources, length_samples, device=device)

# denoise function of the loaded diffusion model; the sampler calls it as
# denoise_fn(x, sigma=...)
gradlogp_fn = model.model.diffusion.denoise_fn

# noise levels visited by the solver, taken from a Karras schedule
# (much faster than a linear one)
timesteps = KarrasSchedule(1e-4, 10.0, rho=9.0)(num_steps, device)

# draw one sample from the learned distribution with the stochastic Euler sampler
y = sample_with_euler(
    noise,
    gradlogp_fn,
    timesteps,
    s_churn=10.0,
)

# play the sampled track (LEFT: drums, RIGHT: piano)
display_audio(y, sampling_rate)

# Source Separation
Choose and load an audio track from the Slakh2100 test split

In [None]:
def choose_audio_track(tracks_dir: Path):
    """Build the widgets used to pick an audio track and a start offset.

    Args:
        tracks_dir: Directory whose ``*.wav`` file names are offered as
            options.

    Returns:
        A ``(track_selector, start_selector)`` tuple: a ``Combobox`` listing
        the file names (created with ``ensure_option=True``) and a
        ``FloatText`` initialised to ``100.0``.
    """
    from ipywidgets import widgets

    wav_names = [wav.name for wav in tracks_dir.glob("*.wav")]

    track_selector = widgets.Combobox(
        options=wav_names,
        placeholder='Track00001.wav',
        description='Audio track:',
        ensure_option=True,
        disabled=False,
    )
    start_selector = widgets.FloatText(description="Start second:", value=100.0)

    return track_selector, start_selector

# NOTE(review): the options are listed from the "bass" folder, while the loading
# cell reads the chosen file name from "drums/" and "piano/" - confirm that all
# instrument folders share the same track file names.
track_widget, start_widget = choose_audio_track(dataset_path / "bass")
# render both widgets, track selector first
display(track_widget, start_widget)

In [None]:
from misc import load_audio, display_audio

# turn the chosen start (in seconds) into a sample index and take a window of
# length_samples samples from there
start = round(start_widget.value * sampling_rate)
end = start + length_samples

# read the same window of the selected track for each instrument and move it
# straight to the target device
signal_drums = load_audio(
    dataset_path / f'drums/{track_widget.value}', sampling_rate, start, end
).to(device)
signal_piano = load_audio(
    dataset_path / f'piano/{track_widget.value}', sampling_rate, start, end
).to(device)

# the mixture to separate is the sum of the two sources
mixture = signal_drums + signal_piano

# listen to each source and to the mixture
print("Drums Track:")
display_audio(signal_drums, sampling_rate)

print("\nPiano Track:")
display_audio(signal_piano, sampling_rate)

print("\nMixture:")
display_audio(mixture, sampling_rate)

Separation algorithm:

In [None]:
@torch.no_grad()
def separate_with_euler(noise, denoise_fn, sigmas, mixture, s_churn=1.0):
    """Separate a two-source mixture with a mixture-conditioned Euler sampler.

    Channel 0 of the state is integrated with Euler steps. Channel 1 is not
    free: before every derivative evaluation it is overwritten with the
    (noised) mixture minus channel 0, and after the last step it is set to
    ``mixture - channel 0`` so that the returned sources sum to the mixture.

    Args:
        noise: Starting noise tensor, indexed as ``[batch, source, sample]``
            with two sources; it is multiplied by ``sigmas[0]``.
        denoise_fn: Callable invoked as ``denoise_fn(x, sigma=sigma)``.
        sigmas: Sequence of noise levels, visited in order.
        mixture: Mixture signal; it must broadcast against ``x[:, 0, :]``.
        s_churn: Amount of stochasticity. The per-step relative increase of
            the noise level is ``s_churn / (len(sigmas) - 1)``, capped at
            ``sqrt(2) - 1``.

    Returns:
        Tensor shaped like ``noise`` whose channel 0 is the sampled first
        source and whose channel 1 is ``mixture`` minus channel 0.
    """
    # initial noise
    x = sigmas[0] * noise

    for i in range(len(sigmas) - 1):
        sigma, sigma_next = sigmas[i], sigmas[i + 1]

        # inject randomness: raise the noise level from sigma to sigma_hat
        gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1)
        sigma_hat = sigma * (gamma + 1)
        x_hat = x + torch.randn_like(x) * (sigma_hat ** 2 - sigma ** 2) ** 0.5

        # condition on the mixture: channel 1 becomes the noised mixture minus
        # channel 0
        # NOTE(review): the mixture is noised with sigma, not sigma_hat -
        # confirm this is intended.
        x_hat[:, 1, :] = mixture + torch.randn_like(mixture) * sigma - x_hat[:, 0, :]

        # conditioned derivative: difference between the two sources' terms
        score = (x_hat - denoise_fn(x_hat, sigma=sigma_hat)) / sigma_hat
        score_drums, score_piano = score[:, :1], score[:, 1:]
        d = score_drums - score_piano

        # Euler method. d has a single channel and broadcasts over both;
        # only channel 0 matters, since channel 1 is overwritten above on the
        # next iteration and below after the last one.
        x = x_hat + d * (sigma_next - sigma_hat)

    # Enforce the mixture constraint on the result. Without this, channel 1
    # would still carry the noise injected above plus the Euler step of
    # channel 0, and the two returned sources would not sum to the mixture.
    x[:, 1, :] = mixture - x[:, 0, :]

    return x

Separation of input audio mixture:

In [None]:
from main.separation import separate

# initial Gaussian noise, shaped (batch-size, num-sources, num-elements)
noise = torch.randn(1, num_sources, length_samples, device=device)

# denoise function of the loaded diffusion model; the sampler calls it as
# denoise_fn(x, sigma=...)
gradlogp_fn = model.model.diffusion.denoise_fn

# noise levels visited by the solver, taken from a Karras schedule
timesteps = KarrasSchedule(1e-4, 10.0, rho=9.0)(num_steps, device)

# run the mixture-conditioned sampler, then split the result into one
# single-channel tensor per source (channel 0: drums, channel 1: piano)
y = separate_with_euler(noise, gradlogp_fn, timesteps, mixture)
y_drums = y[:, :1, :]
y_piano = y[:, 1:, :]

# listen to the separated sources
print("Separated Drums Track:")
display_audio(y_drums, sampling_rate)

print("\nSeparated Piano Track:")
display_audio(y_piano, sampling_rate)