Music (stereo) – Stable Audio
---
---

In [1]:
import time
import numpy as np
import torch
from diffusers.models.autoencoders import AutoencoderOobleck
codec = AutoencoderOobleck.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    subfolder='vae',
    torch_dtype=torch.float
)
codec.eval();

for L in [2**12, 2**16]:
    encode_time = []
    for i_trial in range(101):
        x = torch.randn((1,2,L)).clamp(-1,1).to(torch.float)
        t0 = time.time()
        z = codec.encode(x).latent_dist.mode().to(torch.float16).to("cpu")
        torch.save(z,'temp.pth')
        encode_time.append(time.time() - t0)
    print(f'L: {L}; {L/np.median(encode_time)/1e3}')

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
  WeightNorm.apply(module, name, dim)


L: 4096; 88.73395201718911
L: 65536; 229.36807058662532


---
Music (stereo) – LiveAction
---
---

In [1]:
import io
import time
import torch
import numpy as np
import einops
from types import SimpleNamespace
from datasets import load_dataset, Image
from autocodec.codec import AutoCodecND, latent_to_pil, pil_to_latent

device = 'cpu'
checkpoint = torch.load('../../hf/autocodec/musdb_stereo_f512c16.pth', map_location="cpu",weights_only=False)
config = checkpoint['config']
state_dict = checkpoint['state_dict']
model = AutoCodecND(
    dim=1,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = config.encoder_depth,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
).to(device)
model.load_state_dict(state_dict)
model.eval();

for L in [2**12, 2**16]:
    encode_time = []
    for i_trial in range(101):
        x = torch.randn((1,2,L)).clamp(-1,1)
        t0 = time.time()
        z = model.quantize.compand(model.encode(x)).round().cpu()
        latent_img = latent_to_pil(z.unsqueeze(0), n_bits=8, C=1)
        buff = io.BytesIO()
        latent_img[0].save(buff, format='TIFF', compression='tiff_adobe_deflate')
        tiff_bytes = buff.getbuffer()
        encode_time.append(time.time() - t0)
    print(f'L: {L}; {L/np.median(encode_time)/1e3}')

L: 4096; 323.75752268958234
L: 65536; 5012.178748842129


---
RGB Image – LiveAction F16C48
---
---

In [1]:
import io
import time
import torch
import PIL.Image
import numpy as np
from types import SimpleNamespace
from autocodec.codec import AutoCodecND, latent_to_pil, pil_to_latent

device = "cpu"
checkpoint = torch.load('../../hf/autocodec/rgb_f16c48_ft.pth', map_location="cpu",weights_only=False)
config = checkpoint['config']
state_dict = checkpoint['state_dict']
model = AutoCodecND(
    dim=2,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = config.encoder_depth,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
).to(device)
model.load_state_dict(state_dict)
model.eval();

for S in [, 2**16]:
    encode_time = []
    for i_trial in range(5):
        x = torch.randn((1,2,L)).clamp(-1,1)
        t0 = time.time()
        z = model.quantize.compand(model.encode(x)).round().cpu()
        latent_img = latent_to_pil(z.unsqueeze(0), n_bits=8, C=1)
        buff = io.BytesIO()
        latent_img[0].save(buff, format='TIFF', compression='tiff_adobe_deflate')
        tiff_bytes = buff.getbuffer()
        encode_time = time.time() - t0
    print(f'L: {L}; {L/np.median(encode_time)/1e3}')

