In [1]:
import json
import os

# Locations of the local COCO copy and of the generation run being evaluated.
coco_dir = '../coco'
generated_dir = '../results/2023-01-25_13.35.59.974447'

# Parse the run's generated.json once; later cells index into the result.
generated_json_path = os.path.join(generated_dir, 'generated.json')
with open(generated_json_path) as pointer_file:
    generated_pointers = json.loads(pointer_file.read())

In [2]:
# Display the parsed pointer list to eyeball its structure.
generated_pointers

[{'image_id': 203564,
  'caption': 'A bicycle replica with a clock as the front wheel.',
  'generated_image': '203564_a-bicycle-replica-with-a-clock-as-the-front-wheel.jpg'},
 {'image_id': 179765,
  'caption': 'A black Honda motorcycle parked in front of a garage.',
  'generated_image': '179765_a-black-honda-motorcycle-parked-in-front-of-a-garage.jpg'},
 {'image_id': 322141,
  'caption': 'A room with blue walls and a white sink and door.',
  'generated_image': '322141_a-room-with-blue-walls-and-a-white-sink-and-door.jpg'},
 {'image_id': 16977,
  'caption': 'A car that seems to be parked illegally behind a legally parked car',
  'generated_image': '16977_a-car-that-seems-to-be-parked-illegally-behind-a-legally-parked-car.jpg'},
 {'image_id': 106140,
  'caption': 'A large passenger airplane flying through the air.',
  'generated_image': '106140_a-large-passenger-airplane-flying-through-the-air.jpg'},
 {'image_id': 106140,
  'caption': 'There is a GOL plane taking off in a partly cloudy s

In [3]:
"""
CLIP-score doesn't actually need the real image; I am just testing that we are able to locate it.
We will need this for FID-score.
"""

def get_real_image_path(coco_dir, image_id, split='val2014'):
    """Build the path of an original COCO 2014 image from its numeric id.

    Args:
        coco_dir: Root directory of the local COCO copy; images are looked up
            under ``<coco_dir>/<split>/``.
        image_id: COCO image id (anything ``str()`` can render, typically an int).
        split: Name of the split directory, which is also used as the filename
            infix (``COCO_<split>_<id>.jpg``). Defaults to ``'val2014'``, the
            previously hard-coded value.

    Returns:
        The path as a string. The file is not checked for existence.
    """
    image_id_str = str(image_id).zfill(12)  # ids are zero-padded to 12 characters
    image_base_fname = f"COCO_{split}_{image_id_str}.jpg"
    return os.path.join(coco_dir, split, image_base_fname)

# Smoke test: resolve the real image belonging to the first generated entry.
first_image_id = generated_pointers[0]['image_id']
real_image_path = get_real_image_path(coco_dir, first_image_id)

real_image_path

'../coco/val2014/COCO_val2014_000000203564.jpg'

In [4]:
# CLIP score demo

import torch
from torchvision.io import read_image
import torchvision.transforms as T
from torchmetrics.multimodal import CLIPScore




# TODO: which CLIP model do others use?
metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")

# Baseline: score a random-noise image against an arbitrary caption.
noise_image = torch.randint(255, (3, 224, 224))
score_noise = metric(noise_image, "a photo of a cat")
print(f"score_noise: {score_noise}")

# Score each generated image against the caption stored alongside it.
for i, pointer in enumerate(generated_pointers):
    generated_image_path = os.path.join(generated_dir, pointer['generated_image'])
    generated_image = read_image(generated_image_path)

    # To preview the image: T.ToPILImage()(generated_image).show()

    score_actual = metric(generated_image, pointer['caption'])
    print(f"score {i}: {score_actual}")


  from .autonotebook import tqdm as notebook_tqdm


score_noise: 24.244796752929688
score 0: 32.35782241821289
score 1: 29.201053619384766
score 2: 36.21590805053711
score 3: 26.181865692138672
score 4: 28.211578369140625
score 5: 26.74888038635254
score 6: 35.66602325439453
score 7: 32.951229095458984
score 8: 37.02812194824219
score 9: 28.038082122802734


In [5]:
# prep for FID score demo

# Imported in this cell because it is the first one that needs it.
from torchvision.io import ImageReadMode

generated_images = []
real_images = []

# Decode everything as 3-channel RGB. COCO is known to contain some grayscale
# JPEGs, which read_image would otherwise return with a single channel; the
# Inception network behind the FID metric expects 3-channel input.
for pointer in generated_pointers:
    image_path = get_real_image_path(coco_dir, pointer['image_id'])
    print(image_path)
    real_images.append(read_image(image_path, mode=ImageReadMode.RGB))

for pointer in generated_pointers:
    image_path = os.path.join(generated_dir, pointer['generated_image'])
    generated_images.append(read_image(image_path, mode=ImageReadMode.RGB))


    

../coco/val2014/COCO_val2014_000000203564.jpg
../coco/val2014/COCO_val2014_000000179765.jpg
../coco/val2014/COCO_val2014_000000322141.jpg
../coco/val2014/COCO_val2014_000000016977.jpg
../coco/val2014/COCO_val2014_000000106140.jpg
../coco/val2014/COCO_val2014_000000106140.jpg
../coco/val2014/COCO_val2014_000000322141.jpg
../coco/val2014/COCO_val2014_000000322141.jpg
../coco/val2014/COCO_val2014_000000322141.jpg
../coco/val2014/COCO_val2014_000000203564.jpg


In [10]:
# Shape check: prepend a batch axis to a single real image.
img = real_images[0]
img.shape  # not the cell's last expression, so this value is not displayed
img2 = torch.unsqueeze(img, 0)
img2.shape

torch.Size([1, 3, 400, 400])

In [11]:
# FID score demo

from torchmetrics.image.fid import FrechetInceptionDistance

fid = FrechetInceptionDistance(feature=2048)

# The updates below use batch size 1. A larger batch size would likely be
# faster, but the real images have various resolutions and aspect ratios, so
# they cannot simply be stacked. It is convenient to let torchmetrics take
# care of resizing the images.
for generated, real in zip(generated_images, real_images):
    # Generated image first, then the real image at the same position.
    for image, is_real in ((generated, False), (real, True)):
        fid.update(image.unsqueeze(0), real=is_real)

fid_score = fid.compute()
print(f"fid score: {fid_score}")

fid score: 181.83921813964844
