In [1]:
import torch
import random
import numpy as np
from PIL import Image
from datasets import load_dataset
from IPython.display import Audio
from diffusers import AutoencoderKL, AudioDiffusionPipeline, Mel

2023-05-26 06:09:53.858068: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-26 06:09:53.886487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Mel helper: used further down to convert images back to audio
# (image_to_audio) and to obtain the playback sample rate (get_sample_rate).
mel = Mel()
# Pretrained VAE loaded from a model directory given relative to this notebook.
vae = AutoencoderKL.from_pretrained('../models/autoencoder-kl')
# Alternatives kept for reference: loading weights from a training checkpoint.
# NOTE(review): the original machine-specific absolute paths were replaced by a
# <repo> placeholder; whether a default-configured AutoencoderKL() matches the
# checkpoint's state_dict is not verifiable from this notebook — confirm before use.
# vae = AutoencoderKL.from_pretrained('<repo>/lightning_logs/version_0/checkpoints/epoch=4-step=90.ckpt')
# vae = AutoencoderKL()
# vae.load_state_dict(state_dict=torch.load('<repo>/lightning_logs/version_0/checkpoints/epoch=4-step=90.ckpt')['state_dict'])

In [3]:
# Show the loaded VAE's configuration (channel counts, block layout,
# sample size, scaling factor) as the cell output.
vae.config

FrozenDict([('in_channels', 1),
            ('out_channels', 1),
            ('down_block_types',
             ['DownEncoderBlock2D',
              'DownEncoderBlock2D',
              'DownEncoderBlock2D',
              'DownEncoderBlock2D']),
            ('up_block_types',
             ['UpDecoderBlock2D',
              'UpDecoderBlock2D',
              'UpDecoderBlock2D',
              'UpDecoderBlock2D']),
            ('block_out_channels', [128, 256, 512, 512]),
            ('layers_per_block', 2),
            ('act_fn', 'silu'),
            ('latent_channels', 1),
            ('norm_num_groups', 32),
            ('sample_size', [256, 256]),
            ('scaling_factor', 0.18215),
            ('_class_name', 'AutoencoderKL'),
            ('_diffusers_version', '0.16.1'),
            ('_name_or_path', '../models/autoencoder-kl')])

In [4]:
# Fetch the image dataset by its hub id (downloaded and cached on first use).
dataset_id = 'teticio/audio-diffusion-256'
ds = load_dataset(dataset_id)

Downloading metadata:   0%|          | 0.00/708 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/660 [00:00<?, ?B/s]

Downloading and preparing dataset None/None (download: 910.09 MiB, generated: 911.42 MiB, post-processed: Unknown size, total: 1.78 GiB) to /home/genzorr/.cache/huggingface/datasets/teticio___parquet/teticio--audio-diffusion-256-09d174a5724b0931/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

### Reconstruct audio

In [None]:
# Draw one random training example, show its image, then play it as audio.
image = random.choice(ds['train'])['image']
display(image)
waveform = mel.image_to_audio(image)
Audio(data=waveform, rate=mel.get_sample_rate())

In [None]:
# encode
# Unpack the PIL image into an (H, W, 1) uint8 array. The reshape assumes one
# byte per pixel (single-channel image) and raises otherwise.
input_image = np.frombuffer(image.tobytes(), dtype="uint8").reshape(
    (image.height, image.width, 1))
# Rescale 0..255 -> -1..1 and move the channel axis first: (1, H, W).
input_image = ((input_image / 255) * 2 - 1).transpose(2, 0, 1)
# Inference only: no_grad avoids recording an autograd graph through the encoder.
with torch.no_grad():
    # Add the batch axis in NumPy instead of wrapping the array in a Python
    # list, which sends torch.tensor down its slow list-of-ndarrays path.
    posterior = vae.encode(
        torch.tensor(input_image[np.newaxis],
                     dtype=torch.float32)).latent_dist
    latents = posterior.sample()

In [None]:
# reconstruct
# Inference only: run the decoder and the tensor post-processing without
# recording an autograd graph (so no detach is needed afterwards).
with torch.no_grad():
    decoded = vae.decode(latents)['sample']
    decoded = torch.clamp(decoded, -1., 1.)
    decoded = (decoded + 1.0) / 2.0  # -1,1 -> 0,1; b,c,h,w
# Scale to 0..255 bytes, go channels-last, keep first batch item / channel 0.
pixels = (decoded.cpu().numpy() *
          255).round().astype("uint8").transpose(0, 2, 3, 1)[0, :, :, 0]
output_image = Image.fromarray(pixels)
display(output_image)
Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())

### Random sample from latent space
(Don't expect interesting results!)

In [None]:
# sample
# Decode standard-normal noise shaped like the encoded latents.
# Inference only: run the decoder and the tensor post-processing without
# recording an autograd graph (so no detach is needed afterwards).
with torch.no_grad():
    decoded = vae.decode(torch.randn_like(latents))['sample']
    decoded = torch.clamp(decoded, -1., 1.)
    decoded = (decoded + 1.0) / 2.0  # -1,1 -> 0,1; b,c,h,w
# Scale to 0..255 bytes, go channels-last, keep first batch item / channel 0.
pixels = (decoded.cpu().numpy() *
          255).round().astype("uint8").transpose(0, 2, 3, 1)[0, :, :, 0]
output_image = Image.fromarray(pixels)
display(output_image)
Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate())

### Interpolate between two audio samples in latent space

In [None]:
# Draw a second random training example, show its image, then play it as audio.
image2 = random.choice(ds['train'])['image']
display(image2)
waveform2 = mel.image_to_audio(image2)
Audio(data=waveform2, rate=mel.get_sample_rate())

In [None]:
# encode
# Unpack the second PIL image into an (H, W, 1) uint8 array. The reshape
# assumes one byte per pixel (single-channel image) and raises otherwise.
input_image2 = np.frombuffer(image2.tobytes(), dtype="uint8").reshape(
    (image2.height, image2.width, 1))
# Rescale 0..255 -> -1..1 and move the channel axis first: (1, H, W).
input_image2 = ((input_image2 / 255) * 2 - 1).transpose(2, 0, 1)
# Inference only: no_grad avoids recording an autograd graph through the encoder.
with torch.no_grad():
    # Add the batch axis in NumPy instead of wrapping the array in a Python
    # list, which sends torch.tensor down its slow list-of-ndarrays path.
    posterior2 = vae.encode(
        torch.tensor(input_image2[np.newaxis],
                     dtype=torch.float32)).latent_dist
    latents2 = posterior2.sample()

In [None]:
# interpolate
# alpha is the interpolation weight passed to slerp (slider range 0..1).
alpha = 0.5  #@param {type:"slider", min:0, max:1, step:0.1}
# Inference only: blend the two latents, decode and post-process without
# recording an autograd graph (so no detach is needed afterwards).
with torch.no_grad():
    blended_latents = AudioDiffusionPipeline.slerp(latents, latents2, alpha)
    decoded = vae.decode(blended_latents)['sample']
    decoded = torch.clamp(decoded, -1., 1.)
    decoded = (decoded + 1.0) / 2.0  # -1,1 -> 0,1; b,c,h,w
# Scale to 0..255 bytes, go channels-last, keep first batch item / channel 0.
pixels = (decoded.cpu().numpy() *
          255).round().astype("uint8").transpose(0, 2, 3, 1)[0, :, :, 0]
output_image = Image.fromarray(pixels)
display(output_image)
# Play both source clips followed by the interpolated result for comparison.
display(Audio(data=mel.image_to_audio(image), rate=mel.get_sample_rate()))
display(Audio(data=mel.image_to_audio(image2), rate=mel.get_sample_rate()))
display(
    Audio(data=mel.image_to_audio(output_image), rate=mel.get_sample_rate()))