# music2latent

In [None]:
# basic imports
import numpy as np
import IPython
import librosa

Initialize the EncoderDecoder model.

In [None]:
# EncoderDecoder provides the encode()/decode() methods used in the rest of
# this notebook.
from music2latent import EncoderDecoder

# Single shared instance, built with the default constructor arguments; every
# later cell goes through this `encdec` object.
encdec = EncoderDecoder()

In [None]:
# Re-import the music2latent package in place (presumably a development aid so
# that edits to the package source are picked up without restarting the
# kernel).
# NOTE(review): importlib.reload() does not rebind names that were imported
# earlier with `from music2latent import ...`, nor does it update existing
# instances such as `encdec` - re-run the initialization cell afterwards if
# the reloaded code is actually needed.
import importlib
import music2latent
importlib.reload(music2latent)

Let's load an audio file for this tutorial:

In [None]:
# Choose the clip used throughout this tutorial.
# Prefer the local drum recording when it is present; otherwise fall back to
# librosa's 'trumpet' example so the notebook still runs on machines that do
# not have that file. Checking first also avoids fetching the example when it
# would not be used.
import os

local_audio_path = "/app/input/drum.wav"
if os.path.isfile(local_audio_path):
    audio_path = local_audio_path
else:
    audio_path = librosa.example('trumpet')

# Load the clip, resampled to 44.1 kHz; `sr` is the sample rate returned by
# librosa and is reused by the playback widgets below.
wv, sr = librosa.load(audio_path, sr=44100)

# Render an inline audio player for the loaded waveform.
IPython.display.display(IPython.display.Audio(wv, rate=sr))

# Encode

To encode an audio sample into latents, you need to provide the waveform as input, with shape [audio_channels, waveform_samples] or simply [waveform_samples,]:

In [None]:
# Read the tutorial clip again at 44.1 kHz so this cell can be run on its own
# once `audio_path` is defined.
wv, sr = librosa.load(audio_path, sr=44_100)
print(f'waveform samples: {wv.shape}')

# Turn the waveform into latents and report their shape.
latent = encdec.encode(wv)
print(f'Shape of latents: {latent.shape}')

You can also process a batch of waveforms. Just use a numpy array with shape [batch_size, waveform_samples] as input:

In [None]:
# Read the tutorial clip at 44.1 kHz.
wv, sr = librosa.load(audio_path, sr=44_100)

# Build a batch by stacking three copies of the same waveform along a new
# leading axis (axis 0 is np.stack's default).
wv_batched = np.stack((wv,) * 3)
print(f'batch of waveforms shape: {wv_batched.shape}')

# Encode the whole batch with a single call.
latent_batched = encdec.encode(wv_batched)
print(f'Shape of batched latents: {latent_batched.shape}')

# Decode

To decode latent embeddings back to a waveform, make sure the latents have shape [batch_size/audio_channels, latent_dim, latent_length]:

In [None]:
# Map the latents from the encode step back to a waveform.
wv_rec = encdec.decode(latent)
print(f'Shape of decoded waveform: {wv_rec.shape}')

# Input and reconstruction shapes side by side.
print(wv.shape, wv_rec.shape)

# Show one audio player per signal. The reconstruction is squeezed, moved to
# the CPU and converted to a NumPy array before it is handed to the widget.
reconstructed = wv_rec.squeeze().cpu().numpy()
for caption, samples in (('Original', wv), ('Reconstructed', reconstructed)):
    print(caption)
    IPython.display.display(IPython.display.Audio(samples, rate=sr))

You can also specify how many denoising steps to perform via the `denoising_steps` argument (default is 1). However, we do not notice any improvement in audio quality when increasing `denoising_steps`.

In [None]:
# Decode the same latents again, this time asking for three denoising steps.
wv_rec = encdec.decode(latent, denoising_steps=3)
print(f'Shape of decoded waveform: {wv_rec.shape}')

# Player for the input clip.
print('Original')
player = IPython.display.Audio(wv, rate=sr)
IPython.display.display(player)

# Player for the reconstruction (squeezed, moved to CPU, converted to NumPy).
print('Reconstructed')
player = IPython.display.Audio(wv_rec.squeeze().cpu().numpy(), rate=sr)
IPython.display.display(player)

# Keeping GPU memory under control

The autoencoder model needs plenty of memory to encode and decode samples.
We offer a way to keep the memory usage under control.

You can specify both the __max_batch_size__ and __max_waveform_length__ to use for encoding or decoding samples.

If they are not specified, the default values from hparams_inference.py are used (__max_batch_size__=1, __max_waveform_length__=44100*10).

If the waveform sample to encode or to reconstruct is longer than __max_waveform_length__, the spectrogram representation will be split into multiple samples, processed sequentially, and then concatenated back together.

In [None]:
# Read the tutorial clip at 44.1 kHz and report its size.
wv, sr = librosa.load(audio_path, sr=44_100)
print(f'waveform samples: {wv.shape}')

# Cap the processed length at one second of audio (44100 samples): the
# spectrogram is split into 1 second chunks, each chunk is processed
# separately and the results are concatenated, for much lower memory usage.
chunking = {'max_waveform_length': 44100 * 1}

latent = encdec.encode(wv, **chunking)
print(f'Shape of latents: {latent.shape}')

# Use the same chunk length on the way back.
wv_rec = encdec.decode(latent, **chunking)
print(f'Shape of decoded waveform: {wv_rec.shape}')

# One audio player per signal; the reconstruction is squeezed, moved to the
# CPU and converted to a NumPy array first.
reconstructed = wv_rec.squeeze().cpu().numpy()
for caption, samples in (('Original', wv), ('Reconstructed', reconstructed)):
    print(caption)
    IPython.display.display(IPython.display.Audio(samples, rate=sr))

If you need to encode/decode batches of samples in parallel you can increase the __max_batch_size__ argument until you reach your maximum memory budget:

In [None]:
# Read the tutorial clip at 44.1 kHz.
wv, sr = librosa.load(audio_path, sr=44100)

# create a batch of waveforms (three copies of the same clip)
wv_batched = np.stack([wv]*3, axis=0)
print(f'batch of waveforms shape: {wv_batched.shape}')

# Raise max_batch_size to the batch size so the three items may be processed
# together instead of one at a time.
latent_batched = encdec.encode(wv_batched, max_batch_size=3)
print(f'Shape of batched latents: {latent_batched.shape}')

# Decode the whole batch, so the reconstruction corresponds to the latents
# that were just encoded in this cell.
wv_rec = encdec.decode(latent_batched, max_batch_size=3)
print(f'Shape of decoded waveform: {wv_rec.shape}')

print('Original')
IPython.display.display(IPython.display.Audio(wv, rate=sr))
print('Reconstructed')
# Every batch entry is a copy of the same clip, so playing the first one is
# enough; it is squeezed, moved to the CPU and converted to NumPy first.
IPython.display.display(IPython.display.Audio(wv_rec[0].squeeze().cpu().numpy(), rate=sr))

# Keep in Mind:

When using the latents for generation tasks using diffusion-type models, make sure to properly normalize the latents according to the chosen diffusion framework. The latents extracted with this library are rescaled to have unit standard deviation for a reference music dataset, but ensure that the latents are properly normalized for your specific use case.