In [1]:
!nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1

NVIDIA GeForce RTX 4090


In [13]:
import io
import time
import torch
import numpy as np
import PIL
import torchaudio
import matplotlib.pyplot as plt
import einops
from IPython.display import Audio
from types import SimpleNamespace
from torchvision.transforms import ToPILImage, PILToTensor
from datasets import load_dataset, Image
from autocodec.codec import AutoCodecND, latent_to_pil, pil_to_latent

In [2]:
MUSDB = load_dataset("danjacobellis/musdb_segments", split='validation')

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/22 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/21 [00:00<?, ?it/s]

In [11]:
device = 'cuda'
checkpoint = torch.load('../../hf/autocodec/musdb_stereo_f512c16.pth', map_location="cpu",weights_only=False)
config = checkpoint['config']
state_dict = checkpoint['state_dict']
model = AutoCodecND(
    dim=1,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = config.encoder_depth,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
).to(device).to(torch.bfloat16)
model.load_state_dict(state_dict)
model.eval();

In [4]:
def pad(audio, p=2**16):
    B,C,L = audio.shape
    padding_size = (p - (L % p)) % p
    if padding_size > 0:
        audio = torch.nn.functional.pad(audio, (0, padding_size), mode='constant', value=0)
    return audio

In [36]:
def autocodec_compress(sample):
    with torch.no_grad():
        x, fs = torchaudio.load(sample['audio_mix']['bytes'],normalize=False)
        x = x.to(torch.float)
        x = x - x.mean()
        max_abs = x.abs().max()
        x = x / (max_abs + 1e-8)
        x = x/2
        L = x.shape[-1]
    
        t0 = time.time()
        x_padded = pad(x.unsqueeze(0), 2**16).to(device).to(torch.bfloat16)
        z = model.encode(x_padded)
        latent = model.quantize.compand(z).round().cpu()
        latent_reshaped = einops.rearrange(latent, 'b c (h w) -> b c h w', h=32)
        latent_img = latent_to_pil(latent_reshaped, n_bits=8, C=4)
        buff = io.BytesIO()
        latent_img[0].save(buff, format='TIFF', compression='tiff_adobe_deflate')
        tiff_bytes = buff.getbuffer()
        encode_time = time.time() - t0
        t0 = time.time()
        latent_decoded = pil_to_latent([PIL.Image.open(buff)], N=16, n_bits=8, C=4)
        latent_decoded = einops.rearrange(latent_decoded, 'b c h w -> b c (h w)')
        x_hat = model.decode(latent_decoded.to(device).to(torch.bfloat16))
        x_hat = x_hat.clamp(-1,1)
        decode_time = time.time() - t0
        x_hat = x_hat.to("cpu").to(torch.float)[0]
        CR = x.numel()/len(tiff_bytes)
        mse = torch.nn.functional.mse_loss(x,x_hat)
        PSNR = -10*mse.log10().item() + 6.02
        
    return {
        'compressed': tiff_bytes,
        'encode_time': encode_time,
        'decode_time': decode_time,
        'CR': CR,
        'L': L,
        'PSNR': PSNR,
    }

In [37]:
device = "cuda"
model = model.to(device)
gpu = MUSDB.map(
    autocodec_compress,
    writer_batch_size=16,
)

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

In [41]:
for metric in ['encode_time','decode_time','CR','PSNR']:
    μ = np.mean(gpu[metric])
    print(f"{metric}: {μ}")

encode_time: 0.011013405013630408
decode_time: 0.005401287370055686
CR: 194.48654358290898
PSNR: 36.56908681461829


In [45]:
(torch.tensor(gpu['L'])/torch.tensor(gpu['encode_time'])).mean()/1e6

tensor(199.2718)

In [46]:
(torch.tensor(gpu['L'])/torch.tensor(gpu['decode_time'])).mean()/1e6

tensor(414.9786)