| Model (training dataset)             |   bpp   |  PSNR  | LPIPS (dB) | DISTS (dB) |  SSIM  |
|--------------------------------------|--------:|-------:|-----------:|-----------:|-------:|
| Cosmos di16×16 (Proprietary)         | 0.0625  | 21.7743 | 5.3784     | 10.6189    | 0.6449 |
| LiVeAction f16c12 (LSDIR)            | 0.1507  | 26.7708 | 4.5229     |  8.9960    | 0.7295 |
| LiVeAction f16c12 (ImageNet)         | 0.1849  | 26.9727 | 4.7563     |  9.3797    | 0.7394 |
| Cosmos di8×8 (Proprietary)           | 0.2500  | 25.9193 | 7.7112     | 13.2647    | 0.8558 |
| WaLLoC f8c12 (LSDIR)                 | 0.6171  | 30.5576 | 6.5138     | 13.2437    | 0.9501 |
| LiVeAction f16c48 (LSDIR)            | 0.6456  | 30.8464 | 6.7503     | 13.4228    | 0.8296 |
| LiVeAction f16c48 (ImageNet)         | 0.7803  | 31.0352 | 7.0449     | 13.8776    | 0.8351 |
| WaLLoC f8c48 (LSDIR)                 | 2.5436  | 37.3370 | 11.6739    | 18.2942    | 0.9873 |

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from autocodec.codec import AutoCodecND, latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/autocodec",
    filename="rgb_f16c48_ft.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
config = checkpoint['config']
codec = AutoCodecND(
    dim=2,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = config.encoder_depth,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
).to(device).to(torch.bfloat16)
codec.load_state_dict(checkpoint['state_dict'])
codec.eval();

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.bfloat16) / 127.5 - 1.0
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = codec.encode(x_orig)
        latent = codec.quantize.compand(z).round()
    webp = latent_to_pil(latent.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    size_bytes = len(buff.getbuffer())
    latent_decoded = pil_to_latent(webp, N=config.latent_dim, n_bits=8, C=3).to(device).to(torch.bfloat16)
    with torch.no_grad():
        x_hat = codec.decode(latent_decoded).clamp(-1,1)
    x_orig_01 = x_orig / 2 + 0.5
    x_hat_01 = x_hat / 2 + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 0.7802615695529515
PSNR: 31.03515625
LPIPS_dB: 7.04490827424515
DISTS_dB: 13.877594294056875
SSIM: 0.8351236979166666


---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from autocodec.codec import AutoCodecND, latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/autocodec",
    filename="rgb_f16c48.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
config = checkpoint['config']
codec = AutoCodecND(
    dim=2,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = config.encoder_depth,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
).to(device).to(torch.bfloat16)
codec.load_state_dict(checkpoint['state_dict'])
codec.eval();

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)

rgb_f16c48.pth:   0%|          | 0.00/306M [00:00<?, ?B/s]



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.bfloat16) / 127.5 - 1.0
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = codec.encode(x_orig)
        latent = codec.quantize.compand(z).round()
    webp = latent_to_pil(latent.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    size_bytes = len(buff.getbuffer())
    latent_decoded = pil_to_latent(webp, N=config.latent_dim, n_bits=8, C=3).to(device).to(torch.bfloat16)
    with torch.no_grad():
        x_hat = codec.decode(latent_decoded).clamp(-1,1)
    x_orig_01 = x_orig / 2 + 0.5
    x_hat_01 = x_hat / 2 + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 0.6455654568142362
PSNR: 30.846354166666668
LPIPS_dB: 6.750272797771676
DISTS_dB: 13.422796385256314
SSIM: 0.82958984375


---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from autocodec.codec import AutoCodecND, latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/autocodec",
    filename="rgb_f16c12_ft.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
config = checkpoint['config']
codec = AutoCodecND(
    dim=2,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = config.encoder_depth,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
).to(device).to(torch.bfloat16)
codec.load_state_dict(checkpoint['state_dict'])
codec.eval();

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)

rgb_f16c12_ft.pth:  75%|#######5  | 231M/306M [00:00<?, ?B/s]



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.bfloat16) / 127.5 - 1.0
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = codec.encode(x_orig)
        latent = codec.quantize.compand(z).round()
    webp = latent_to_pil(latent.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    size_bytes = len(buff.getbuffer())
    latent_decoded = pil_to_latent(webp, N=config.latent_dim, n_bits=8, C=3).to(device).to(torch.bfloat16)
    with torch.no_grad():
        x_hat = codec.decode(latent_decoded).clamp(-1,1)
    x_orig_01 = x_orig / 2 + 0.5
    x_hat_01 = x_hat / 2 + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 0.18493143717447916
PSNR: 26.97265625
LPIPS_dB: 4.7562505625050795
DISTS_dB: 9.379736408196386
SSIM: 0.7394205729166666


---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from autocodec.codec import AutoCodecND, latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/autocodec",
    filename="rgb_f16c12.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
config = checkpoint['config']
codec = AutoCodecND(
    dim=2,
    input_channels=config.input_channels,
    J = int(np.log2(config.F)),
    latent_dim=config.latent_dim,
    encoder_depth = config.encoder_depth,
    encoder_kernel_size = config.encoder_kernel_size,
    decoder_depth = config.decoder_depth,
    lightweight_encode = config.lightweight_encode,
    lightweight_decode = config.lightweight_decode,
).to(device).to(torch.bfloat16)
codec.load_state_dict(checkpoint['state_dict'])
codec.eval();

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)

rgb_f16c12.pth:   0%|          | 0.00/306M [00:00<?, ?B/s]



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.bfloat16) / 127.5 - 1.0
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = codec.encode(x_orig)
        latent = codec.quantize.compand(z).round()
    webp = latent_to_pil(latent.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    size_bytes = len(buff.getbuffer())
    latent_decoded = pil_to_latent(webp, N=config.latent_dim, n_bits=8, C=3).to(device).to(torch.bfloat16)
    with torch.no_grad():
        x_hat = codec.decode(latent_decoded).clamp(-1,1)
    x_orig_01 = x_orig / 2 + 0.5
    x_hat_01 = x_hat / 2 + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 0.15073140462239584
PSNR: 26.770833333333332
LPIPS_dB: 4.522872237559315
DISTS_dB: 8.996013821472557
SSIM: 0.7294921875


---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np, json
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from walloc import walloc
from walloc.walloc import latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
config_file = hf_hub_download(
    repo_id="danjacobellis/walloc",
    filename="RGB_16x.json"
)
codec_config = SimpleNamespace(**json.load(open(config_file)))
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/walloc",
    filename="RGB_16x.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
codec = walloc.Codec2D(
    channels = codec_config.channels,
    J = codec_config.J,
    Ne = codec_config.Ne,
    Nd = codec_config.Nd,
    latent_dim = codec_config.latent_dim,
    latent_bits = codec_config.latent_bits,
    lightweight_encode = codec_config.lightweight_encode
)
codec.load_state_dict(checkpoint['model_state_dict'])
codec = codec.to(device)
codec.eval();

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.float) / 255 - 0.5
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = codec.encoder(codec.wavelet_analysis(x_orig,J=codec.J))
    webp = latent_to_pil(z.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    size_bytes = len(buff.getbuffer())
    latent_decoded = pil_to_latent(webp, N=codec_config.latent_dim, n_bits=8, C=3).to(device).to(torch.float16)
    with torch.no_grad():
        x_hat = codec.wavelet_synthesis(codec.decoder(z),J=codec.J).clamp(-0.5,0.5)
    x_orig_01 = x_orig + 0.5
    x_hat_01 = x_hat + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 0.6171129014756944
PSNR: 30.557596782843273
LPIPS_dB: 6.513779126102302
DISTS_dB: 13.243720676934606
SSIM: 0.950053483247757


---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np, json
from huggingface_hub import hf_hub_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from walloc import walloc
from walloc.walloc import latent_to_pil, pil_to_latent
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
config_file = hf_hub_download(
    repo_id="danjacobellis/walloc",
    filename="RGB_4x.json"
)
codec_config = SimpleNamespace(**json.load(open(config_file)))
checkpoint_file = hf_hub_download(
    repo_id="danjacobellis/walloc",
    filename="RGB_4x.pth"
)
checkpoint = torch.load(checkpoint_file, map_location="cpu",weights_only=False)
codec = walloc.Codec2D(
    channels = codec_config.channels,
    J = codec_config.J,
    Ne = codec_config.Ne,
    Nd = codec_config.Nd,
    latent_dim = codec_config.latent_dim,
    latent_bits = codec_config.latent_bits,
    lightweight_encode = codec_config.lightweight_encode
)
codec.load_state_dict(checkpoint['model_state_dict'])
codec = codec.to(device)
codec.eval();

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)

RGB_4x.json:   0%|          | 0.00/388 [00:00<?, ?B/s]

RGB_4x.pth:   0%|          | 0.00/229M [00:00<?, ?B/s]



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.float) / 255 - 0.5
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = codec.encoder(codec.wavelet_analysis(x_orig,J=codec.J))
    webp = latent_to_pil(z.cpu(), n_bits=8, C=3)
    buff = io.BytesIO()
    webp[0].save(buff, format='WEBP', lossless=True)
    size_bytes = len(buff.getbuffer())
    latent_decoded = pil_to_latent(webp, N=codec_config.latent_dim, n_bits=8, C=3).to(device).to(torch.float16)
    with torch.no_grad():
        x_hat = codec.wavelet_synthesis(codec.decoder(z),J=codec.J).clamp(-0.5,0.5)
    x_orig_01 = x_orig + 0.5
    x_hat_01 = x_hat + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 2.5435909695095487
PSNR: 37.337041000525154
LPIPS_dB: 11.673939380783379
DISTS_dB: 18.29417419215874
SSIM: 0.9873491401473681


---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np
from huggingface_hub import snapshot_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor
from cosmos_tokenizer.image_lib import ImageTokenizer

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
model_path = snapshot_download(repo_id='nvidia/Cosmos-Tokenizer-DI8x8')
encoder = ImageTokenizer(checkpoint_enc=f'{model_path}/encoder.jit').to(device)
decoder = ImageTokenizer(checkpoint_dec=f'{model_path}/decoder.jit').to(device)

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

autoencoder.jit:   0%|          | 0.00/159M [00:00<?, ?B/s]

decoder.jit:   0%|          | 0.00/94.5M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

.gitattributes: 0.00B [00:00, ?B/s]

encoder.jit:   0%|          | 0.00/65.5M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

model_config.yaml:   0%|          | 0.00/92.0 [00:00<?, ?B/s]



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.float) / 127.5 - 1.0
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = encoder.encode(x_orig)[0]
    size_bytes = 2*z.numel()
    with torch.no_grad():
        x_hat = decoder.decode(z).to(torch.float).clamp(-1,1)
    x_orig_01 = x_orig / 2 + 0.5
    x_hat_01 = x_hat / 2 + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)



Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 0.25
PSNR: 25.919317603111267
LPIPS_dB: 7.711161962844201
DISTS_dB: 13.264738029047363
SSIM: 0.8557981674869856


---

In [1]:
import torch, io, datasets, PIL.Image,  numpy as np
from huggingface_hub import snapshot_download
from types import SimpleNamespace
from piq import LPIPS, DISTS, SSIMLoss
from torchvision.transforms.v2.functional import to_pil_image, pil_to_tensor
from cosmos_tokenizer.image_lib import ImageTokenizer

In [2]:
device = "cuda"
dataset = datasets.load_dataset("danjacobellis/kodak")
model_path = snapshot_download(repo_id='nvidia/Cosmos-Tokenizer-DI16x16')
encoder = ImageTokenizer(checkpoint_enc=f'{model_path}/encoder.jit').to(device)
decoder = ImageTokenizer(checkpoint_dec=f'{model_path}/decoder.jit').to(device)

lpips_loss = LPIPS().to(device)
dists_loss = DISTS().to(device)
ssim_loss = SSIMLoss().to(device)

Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]



In [3]:
def evaluate_quality(sample):
    img = sample['image'].convert("RGB")
    x_orig = pil_to_tensor(img).to(device).unsqueeze(0).to(torch.float) / 127.5 - 1.0
    orig_dim = x_orig.numel() 
    with torch.no_grad():
        z = encoder.encode(x_orig)[0]
    size_bytes = 2*z.numel()
    with torch.no_grad():
        x_hat = decoder.decode(z).to(torch.float).clamp(-1,1)
    x_orig_01 = x_orig / 2 + 0.5
    x_hat_01 = x_hat / 2 + 0.5
    pixels = img.width * img.height
    bpp = 8 * size_bytes / pixels
    mse = torch.nn.functional.mse_loss(x_orig_01[0], x_hat_01[0])
    PSNR = -10 * mse.log10().item()
    LPIPS_dB = -10 * np.log10(lpips_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    DISTS_dB = -10 * np.log10(dists_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item())
    SSIM = 1 - ssim_loss(x_orig_01.to("cuda"), x_hat_01.to("cuda")).item()

    return {
        'pixels': pixels,
        'bpp': bpp,
        'PSNR': PSNR,
        'LPIPS_dB': LPIPS_dB,
        'DISTS_dB': DISTS_dB,
        'SSIM': SSIM,
    }

In [4]:
results_dataset = dataset['validation'].map(evaluate_quality)



Map:   0%|          | 0/24 [00:00<?, ? examples/s]

In [5]:
print("mean\n---")
for metric in [
    'pixels',
    'bpp',
    'PSNR',
    'LPIPS_dB',
    'DISTS_dB',
    'SSIM',
]:
    μ = np.mean(results_dataset[metric])
    print(f"{metric}: {μ}")

mean
---
pixels: 393216.0
bpp: 0.0625
PSNR: 21.77425762017568
LPIPS_dB: 5.378436234862551
DISTS_dB: 10.618903597362483
SSIM: 0.6448745379845301
