In [1]:
import json
import os

# Where the generation run wrote its output, and where the local COCO copy lives.
generated_dir = '../results/2023-01-25_14.40.38.728069'
coco_dir = '../coco'

# generated.json is the index of the run; it is parsed once and kept in memory.
generated_json_path = os.path.join(generated_dir, 'generated.json')
with open(generated_json_path) as pointer_file:
    generated_pointers = json.load(pointer_file)

In [2]:
# Preview only the first 20 entries rather than dumping the whole index.
generated_pointers[:20]

[{'image_id': 203564,
  'caption': 'A bicycle replica with a clock as the front wheel.',
  'generated_image': '203564_a-bicycle-replica-with-a-clock-as-the-front-wheel.jpg'},
 {'image_id': 179765,
  'caption': 'A black Honda motorcycle parked in front of a garage.',
  'generated_image': '179765_a-black-honda-motorcycle-parked-in-front-of-a-garage.jpg'},
 {'image_id': 322141,
  'caption': 'A room with blue walls and a white sink and door.',
  'generated_image': '322141_a-room-with-blue-walls-and-a-white-sink-and-door.jpg'},
 {'image_id': 16977,
  'caption': 'A car that seems to be parked illegally behind a legally parked car',
  'generated_image': '16977_a-car-that-seems-to-be-parked-illegally-behind-a-legally-parked-car.jpg'},
 {'image_id': 106140,
  'caption': 'A large passenger airplane flying through the air.',
  'generated_image': '106140_a-large-passenger-airplane-flying-through-the-air.jpg'},
 {'image_id': 106140,
  'caption': 'There is a GOL plane taking off in a partly cloudy s

In [3]:
"""
CLIP-score doesn't actually need the real image; I am just testing that we are able to locate it.
We will need this for FID-score.
"""

def get_real_image_path(coco_dir, image_id):
    """Build the path of the COCO val2014 image that has the given id.

    The id is converted to text and left-padded with zeros to 12 characters,
    then wrapped as ``COCO_val2014_<padded id>.jpg`` inside the ``val2014``
    sub-folder of ``coco_dir``. The path is only constructed; nothing is
    checked on disk.

    Args:
        coco_dir: Root directory of the COCO dataset copy.
        image_id: COCO image id (anything ``str()`` can render).

    Returns:
        The joined path as a string.
    """
    padded_id = str(image_id).zfill(12)
    return os.path.join(coco_dir, 'val2014', 'COCO_val2014_' + padded_id + '.jpg')

# Sanity check: resolve the real-image path for the first generated entry and show it.
real_image_path = get_real_image_path(
    coco_dir=coco_dir,
    image_id=generated_pointers[0]['image_id'],
)

real_image_path

'../coco/val2014/COCO_val2014_000000203564.jpg'

In [None]:
# CLIP score demo

import torch
from torchvision.io import read_image
import torchvision.transforms as T
from torchmetrics.multimodal import CLIPScore


metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16") # TODO: which CLIP model do others use?

# Reference point: score of a random-noise image against an unrelated caption.
score_noise = metric(torch.randint(255, (3, 224, 224)), "a photo of a cat")
print(f"score_noise: {score_noise}")

# Running total of the per-image scores, kept as a plain Python float.
sum_clip_score = 0.0

for i, pointer in enumerate(generated_pointers):
    generated_image_path = os.path.join(generated_dir, pointer['generated_image'])
    generated_image = read_image(generated_image_path)

    # Score the generated image against the caption it was generated from.
    score_actual = metric(generated_image, pointer['caption'])
    # Accumulate the score; without this the mean below is always 0.
    sum_clip_score += score_actual.item()
    print(f"score {i}: {score_actual}")

# An empty index yields NaN instead of raising ZeroDivisionError.
mean_clip_score = (
    sum_clip_score / len(generated_pointers) if generated_pointers else float('nan')
)
print(f"mean CLIP score: {mean_clip_score}")

  from .autonotebook import tqdm as notebook_tqdm


score_noise: 24.652740478515625
score 0: 29.513614654541016
score 1: 28.45810890197754
score 2: 34.60138702392578
score 3: 24.628847122192383
score 4: 30.42526626586914
score 5: 24.69168472290039
score 6: 37.36733627319336
score 7: 34.12033462524414
score 8: 33.41608810424805
score 9: 27.333444595336914
score 10: 30.239782333374023
score 11: 29.865205764770508
score 12: 26.857927322387695
score 13: 34.22323989868164
score 14: 30.690690994262695
score 15: 32.1330680847168
score 16: 32.03816223144531
score 17: 29.20905303955078
score 18: 31.937095642089844
score 19: 32.5599365234375
score 20: 30.953853607177734
score 21: 35.94513702392578
score 22: 35.440895080566406
score 23: 28.53565788269043
score 24: 29.20604705810547
score 25: 31.601783752441406
score 26: 29.59362030029297
score 27: 28.22239112854004
score 28: 36.76041793823242
score 29: 26.873912811279297
score 30: 31.808015823364258
score 31: 35.722328186035156
score 32: 36.527652740478516
score 33: 33.458377838134766
score 34: 33

In [None]:
# prep for FID score demo

# Decode every real COCO image referenced by the index, in index order.
real_images = [
    read_image(get_real_image_path(coco_dir, pointer['image_id']))
    for pointer in generated_pointers
]

# Then decode the matching generated images, in the same order.
generated_images = [
    read_image(os.path.join(generated_dir, pointer['generated_image']))
    for pointer in generated_pointers
]


    

In [None]:
# Inspect one real image before and after adding a leading axis of size 1.
img = real_images[0]
img2 = img.unsqueeze(0)
# Show both shapes together: a bare `img.shape` mid-cell is evaluated and discarded.
img.shape, img2.shape

In [None]:
# FID score demo

from torchmetrics.image.fid import FrechetInceptionDistance

fid = FrechetInceptionDistance(feature=2048)

# We are running the following with batch size 1. It would likely be faster to use
# a larger batch size. But the real images have various resolutions and aspect
# ratios, and it's convenient to let torchmetrics take care of resizing the images.


def _as_rgb_batch(image):
    """Return ``image`` as a batch of one with three channels.

    ``image`` is expected to be channel-first (C, H, W). `read_image` is called
    without an explicit mode elsewhere in this notebook, so a greyscale file
    would arrive with a single channel; that channel is replicated three times.
    NOTE(review): assumes inputs have either 1 or 3 channels - confirm for the
    dataset in use.
    """
    if image.shape[0] == 1:
        image = image.repeat(3, 1, 1)
    return image.unsqueeze(0)


for i, (generated, real) in enumerate(zip(generated_images, real_images)):
    fid.update(_as_rgb_batch(generated), real=False)
    fid.update(_as_rgb_batch(real), real=True)

    # Light progress logging, once every 100 image pairs.
    if i % 100 == 0:
        print(f"fid score iteration {i}")

fid_score = fid.compute()
print(f"fid score: {fid_score}")