In [1]:
import io
import time
import torch
import datasets
import PIL.Image
import numpy as np
import torch.nn as nn
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from huggingface_hub import snapshot_download
from cosmos_tokenizer.image_lib import ImageTokenizer
from torchvision.transforms.v2 import Pad, CenterCrop
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

In [2]:
# Every model and metric network below is placed on this device.
device = "cuda"
# Reconstruction-quality metrics from piq, moved to the evaluation device.
lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)
# Validation splits of the three image datasets used in this notebook.
kodak = datasets.load_dataset("danjacobellis/kodak", split='validation')
lsdir = datasets.load_dataset("danjacobellis/LSDIR_val", split='validation')
inet = datasets.load_dataset("timm/imagenet-1k-wds",split='validation')
# Fetch the Cosmos DI8x8 tokenizer snapshot and wrap its TorchScript encoder and
# decoder checkpoints in two separate ImageTokenizer objects.
model_path = snapshot_download(repo_id='nvidia/Cosmos-Tokenizer-DI8x8')
encoder = ImageTokenizer(checkpoint_enc=f'{model_path}/encoder.jit').to(device)
decoder = ImageTokenizer(checkpoint_dec=f'{model_path}/decoder.jit').to(device)



Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

In [3]:
def _sync_if_cuda(tensor):
    """Block until pending CUDA work on ``tensor``'s device is done (no-op on CPU)."""
    if tensor.is_cuda:
        torch.cuda.synchronize(tensor.device)


def evaluate_quality_h1024(sample):
    """Encode/decode one image at height 1024 and report rate, quality and timing.

    The image is converted to RGB and resized (Lanczos) to a height of 1024 with
    the width following the aspect ratio, rounded down to a multiple of 16. It
    is then passed through the module-level ``encoder`` and ``decoder`` and the
    reconstruction is compared against the resized original.

    Args:
        sample: mapping whose ``'jpg'`` entry is a PIL image.

    Returns:
        dict with the keys
        ``encode_time`` / ``decode_time`` (seconds, wall clock, each covering
        only its own stage), ``bpp`` (bits per pixel of the resized image),
        ``PSNR`` (dB, computed on values scaled to [0, 1]), ``LPIPS_dB`` and
        ``DISTS_dB`` (``-10 * log10`` of the respective loss) and ``SSIM``
        (``1 - SSIMLoss``).

    Note:
        Relies on the notebook globals ``device``, ``encoder``, ``decoder``,
        ``lpips_loss``, ``dists_loss`` and ``ssim_loss``.
    """
    img = sample['jpg'].convert("RGB")
    aspect = img.width/img.height
    # Round the width down to a multiple of 16, but never below 16: an extremely
    # tall image would otherwise round to a zero width and make resize() fail.
    width = max(16, int(16*(1024*aspect//16)))
    img = img.resize((width,1024),resample=PIL.Image.Resampling.LANCZOS)
    # uint8 [0, 255] -> float [-1, 1], with a leading batch dimension.
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.float) / 127.5 - 1.0

    with torch.no_grad():
        # CUDA kernels are launched asynchronously, so synchronise on both sides
        # of each timed region; otherwise the clock measures launch overhead and
        # leftover work from earlier calls rather than the stage itself.
        _sync_if_cuda(x_orig)
        t0 = time.perf_counter()
        z, _ = encoder.encode(x_orig)
        _sync_if_cuda(x_orig)
        encode_time = time.perf_counter() - t0

        # Rate is counted as 2 bytes per element of z.
        # NOTE(review): presumably one 16-bit code index per latent position;
        # confirm against the tokenizer's codebook size.
        size_bytes = 2*z.numel()

        # Decode the codes obtained above. Re-running the encoder here would
        # fold encode time into decode_time and double the encoder's work.
        t0 = time.perf_counter()
        x_hat = decoder.decode(z).to(torch.float).clamp(-1,1)
        _sync_if_cuda(x_hat)
        decode_time = time.perf_counter() - t0

    # Map both images from [-1, 1] to [0, 1] for the metrics.
    x_orig_01 = x_orig / 2 + 0.5
    x_hat_01 = x_hat / 2 + 0.5

    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    # Both tensors already live on `device`, where the metric modules were
    # placed, so no further device transfer is needed.
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01, x_hat_01).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01, x_hat_01).item())
    SSIM = 1 - ssim_loss(x_orig_01, x_hat_01).item()

    return {
        'encode_time': encode_time,
        'decode_time': decode_time,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
# Evaluate every example of the ImageNet validation split; the keys of the dict
# returned by evaluate_quality_h1024 are read back as columns of
# `results_dataset` in the summary cell.
# NOTE(review): the recorded run of this cell aborted with a CUDA out-of-memory
# error raised inside the encoder (see the traceback in this cell's output).
results_dataset = inet.map(evaluate_quality_h1024)



Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
  File "code/__torch__/torch/nn/modules/container/___torch_mangle_138396.py", line 14, in forward
    quant_conv = self.quant_conv
    encoder = self.encoder
    _0 = (quant_conv).forward((encoder).forward(input, ), )
                               ~~~~~~~~~~~~~~~~ <--- HERE
    _1, _2, _3, = (quantizer).forward(_0, )
    return (_1, _2, _3)
  File "code/__torch__/projects/edify_tokenizer/v1/module/layers2d/___torch_mangle_138391.py", line 55, in forward
    _6 = (_00).forward((downsample).forward(_4, ), )
    _3 = (_1).forward((_0).forward((_11).forward(_6, ), ), )
    _7 = (attn_1).forward((block_1).forward(_3, ), )
          ~~~~~~~~~~~~~~~ <--- HERE
    _8 = (norm_out).forward((block_2).forward(_7, ), )
    input0 = torch.mul(_8, torch.sigmoid(_8))
  File "code/__torch__/projects/edify_tokenizer/v1/module/layers2d/___torch_mangle_138380.py", line 40, in forward
    k0 = torch.reshape(_2, [_6, _10, int(torch.mul(h, w))])
    w_ = torch.bmm(q1, k0)
    input = torch.mul(w_, CONSTANTS.c2)
            ~~~~~~~~~ <--- HERE
    w_0 = torch.softmax(input, 2)
    v0 = torch.reshape(_3, [_5, _9, int(torch.mul(h, w))])

Traceback of TorchScript, original code (most recent call last):
/lustre/fsw/portfolios/nvr/projects/nvr_picasso/freda/projects/edify_tokenizer1/cosmos/DI1024_FSQ_cosmos_8x8_1118b_adv/projects/edify_tokenizer/v1/module/layers2d.py(112): forward
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl
/lustre/fsw/portfolios/nvr/projects/nvr_picasso/freda/projects/edify_tokenizer1/cosmos/DI1024_FSQ_cosmos_8x8_1118b_adv/projects/edify_tokenizer/v1/module/layers2d.py(205): forward
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/container.py(218): forward
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1542): _slow_forward
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1561): _call_impl
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py(1552): _wrapped_call_impl
/usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(1274): trace_module
/usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(694): _trace_impl
/usr/local/lib/python3.10/dist-packages/torch/jit/_trace.py(999): trace
/usr/local/lib/python3.10/dist-packages/torch/_dynamo/eval_frame.py(574): _fn
/lustre/fsw/portfolios/nvr/projects/nvr_picasso/freda/projects/edify_tokenizer1/cosmos/DI1024_FSQ_cosmos_8x8_1118b_adv/projects/edify_tokenizer/v1/checkpointer.py(285): _get_ema_jit
/lustre/fsw/portfolios/nvr/projects/nvr_picasso/freda/projects/edify_tokenizer1/cosmos/DI1024_FSQ_cosmos_8x8_1118b_adv/projects/edify_tokenizer/v1/checkpointer.py(96): save
/lustre/fsw/portfolios/nvr/projects/nvr_picasso/freda/projects/edify_tokenizer1/cosmos/DI1024_FSQ_cosmos_8x8_1118b_adv/projects/edify_tokenizer/v1/trainer.py(152): train
/lustre/fsw/portfolios/nvr/projects/nvr_picasso/freda/projects/edify_tokenizer1/cosmos/DI1024_FSQ_cosmos_8x8_1118b_adv/scripts/train.py(45): launch
/usr/local/lib/python3.10/dist-packages/loguru/_logger.py(1277): catch_wrapper
/lustre/fsw/portfolios/nvr/projects/nvr_picasso/freda/projects/edify_tokenizer1/cosmos/DI1024_FSQ_cosmos_8x8_1118b_adv/scripts/train.py(84): <module>
/usr/lib/python3.10/runpy.py(86): _run_code
/usr/lib/python3.10/runpy.py(196): _run_module_as_main
RuntimeError: CUDA out of memory. Tried to allocate 14.03 GiB. GPU 0 has a total capacity of 23.65 GiB of which 7.67 GiB is free. Including non-PyTorch memory, this process has 15.94 GiB memory in use. Of the allocated memory 14.53 GiB is allocated by PyTorch, and 974.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


In [None]:
# Report the average of each rate/quality column over the evaluated dataset.
print("mean\n---")
for column in ('bpp', 'PSNR', 'LPIPS_dB', 'DISTS_dB', 'SSIM'):
    average = np.mean(results_dataset[column])
    print(f"{column}: {average}")