In [None]:
# Results dataset from prompts after Inversion
import pandas as pd
# Load the per-prompt results table ('XXX' is presumably a placeholder path — replace with the real parquet file).
# Cells in this notebook read the columns 'rl_generation', 'reference_prompt' and 'img_path' from it.
compare_promts = pd.read_parquet('XXX')
# Bare expression so the notebook renders the frame as this cell's output.
compare_promts

## BERTScore

In [3]:
from bert_score import score

def bert_sim_eval(candidate_sentence, target_sentence, verbose=True):
    """Compute BERTScore precision, recall and F1 for a single sentence pair.

    The English scorer is built on first use and cached on the function object,
    so repeated calls (e.g. once per dataframe row) reuse the loaded model
    instead of loading it again on every call.

    Args:
        candidate_sentence: Generated text to evaluate.
        target_sentence: Reference text to compare against.
        verbose: Forwarded to the scorer. Defaults to True to keep the previous
            logging behavior; pass False when calling inside a progress-bar loop.

    Returns:
        dict with keys "P", "R" and "F1", each a Python float.
    """
    # Local import: the cell's existing import only brings `score` into scope.
    from bert_score import BERTScorer

    scorer = getattr(bert_sim_eval, "_scorer", None)
    if scorer is None:
        # Same language setting as the previous `score(..., lang='en')` call.
        scorer = BERTScorer(lang='en')
        bert_sim_eval._scorer = scorer

    # The scorer expects lists of candidates and references.
    P, R, F1 = scorer.score([candidate_sentence], [target_sentence], verbose=verbose)
    return {"P": P.item(), "R": R.item(), "F1": F1.item()}

In [None]:
from tqdm import tqdm
# Per-row BERTScore precision / recall / F1 of the RL-generated prompt against the reference prompt.
P_ours, R_ours, F1_ours = [], [], []
row_iter = tqdm(compare_promts.iterrows(), total=len(compare_promts))
for row_idx, row in row_iter:
    scores = bert_sim_eval(row['rl_generation'], row['reference_prompt'])
    for bucket, key in ((P_ours, 'P'), (R_ours, 'R'), (F1_ours, 'F1')):
        bucket.append(scores[key])

In [None]:
# Mean precision, recall and F1 over all rows, printed in that order.
for metric_values in (P_ours, R_ours, F1_ours):
    print(sum(metric_values) / len(metric_values))

## CLIP: Calculate similarity between target image and images generated from prompts using two different methods

In [6]:
import open_clip
import torch
from PIL import Image

# Run on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# CLIP ViT-L/14 with the OpenAI weights; the middle return value is not used in this notebook.
clip_model, _, clip_preprocess = open_clip.create_model_and_transforms(
    "ViT-L-14",
    pretrained="openai",
    device=device,
)

def measure_clip_imgtxt_similarity(image_path, txt_1):
    """Cosine similarity between the CLIP embeddings of an image file and a text.

    Relies on the notebook-level `clip_model`, `clip_preprocess` and `device`.

    Args:
        image_path: Path of the image, opened with PIL.
        txt_1: Text to compare with the image.

    Returns:
        A Python list with one similarity value per encoded pair (a single
        entry here, since one image and one text are encoded).
    """
    token_ids = open_clip.tokenize([txt_1]).to(device)
    pil_image = Image.open(image_path)
    image_batch = clip_preprocess(pil_image).unsqueeze(0).to(device)

    # Embeddings are only used for scoring, so no gradients are tracked.
    with torch.no_grad():
        image_emb = clip_model.encode_image(image_batch)
        text_emb = clip_model.encode_text(token_ids)

    similarity = torch.nn.functional.cosine_similarity(image_emb, text_emb, dim=1)
    return similarity.cpu().numpy().tolist()

    

def measure_clip_imgs_similarity(orig_images_t, pred_imgs_t, clip_model):
    """Mean pairwise cosine similarity between two batches of CLIP image embeddings.

    Args:
        orig_images_t: Batch of preprocessed reference images.
        pred_imgs_t: Batch of preprocessed generated images.
        clip_model: Model exposing `encode_image(batch)`.

    Returns:
        Python float: the mean over every (reference, generated) pair of the
        dot product of their L2-normalized embeddings.
    """
    def _unit_embeddings(batch):
        # Encode, then scale each row to unit length so dot products are cosines.
        emb = clip_model.encode_image(batch)
        return emb / emb.norm(dim=1, keepdim=True)

    with torch.no_grad():
        reference = _unit_embeddings(orig_images_t)
        generated = _unit_embeddings(pred_imgs_t)
        pairwise = torch.matmul(reference, generated.t())
    return pairwise.mean().item()

In [None]:
from diffusers import StableDiffusionPipeline
from diffusers import PNDMScheduler
from PIL import Image
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 is meant for GPU inference; on CPU half precision is poorly supported
# or very slow, so the CPU fallback loads the weights in float32 instead.
pipe_dtype = torch.float16 if device == "cuda" else torch.float32

# Single source of truth for the checkpoint: scheduler and pipeline must match.
model_id = "runwayml/stable-diffusion-v1-5"
scheduler = PNDMScheduler.from_pretrained(model_id, subfolder="scheduler")

pipe = StableDiffusionPipeline.from_pretrained(
    model_id,
    scheduler=scheduler,
    torch_dtype=pipe_dtype,
    ).to(device)

# Side length in pixels, used for both height and width of generated images.
image_length = 512

In [None]:
from tqdm import tqdm

# CLIP image-image similarity between each target image and the image
# generated from that row's RL prompt.
eval_sim_ours = []

for row_idx, row in tqdm(compare_promts.iterrows(), total=len(compare_promts)):
    target_image = Image.open(row['img_path']).convert('RGB')
    with torch.no_grad():
        generated = pipe(
            row['rl_generation'],
            num_images_per_prompt=1,
            guidance_scale=7.5,
            num_inference_steps=50,
            height=image_length,
            width=image_length,
        ).images
        # Stack the preprocessed images into batches for the CLIP encoder.
        target_batch = torch.stack([clip_preprocess(target_image)]).to(device)
        generated_batch = torch.stack([clip_preprocess(img) for img in generated]).to(device)
        clip_sim = measure_clip_imgs_similarity(target_batch, generated_batch, clip_model)
        eval_sim_ours.append(clip_sim)

In [None]:
# Mean CLIP image-image similarity over all rows, shown as the cell output.
mean_clip_similarity = sum(eval_sim_ours) / len(eval_sim_ours)
mean_clip_similarity

## LPIPS: Perceptual Distance Between Target and Generated Images

In [None]:
import torch
import lpips
import itertools
from PIL import Image
from torchvision import transforms

# LPIPS network with AlexNet features, placed on the same device as the other models
lpips_model = lpips.LPIPS(net='alex').to(device)

# Image preprocessing: resize to 256x256, then convert the PIL image to a tensor
transform = transforms.Compose(
    [
        transforms.Resize((256, 256)),
        transforms.ToTensor(),
    ]
)

def load_image(image_path):
    """Read an image file as RGB, apply `transform`, and add a leading batch dimension."""
    rgb_image = Image.open(image_path).convert('RGB')
    tensor = transform(rgb_image)
    return tensor.unsqueeze(0)

def measure_lpips_imgs_similarity(img1, img2, normalize=True):
    """Return the LPIPS distance between two single-image batches.

    Args:
        img1: First image tensor (batch of one).
        img2: Second image tensor (batch of one).
        normalize: Set True (default) when the inputs are in [0, 1], as
            produced by `transforms.ToTensor()`, so LPIPS rescales them to the
            [-1, 1] range it expects. Pass False for inputs already in [-1, 1].

    Returns:
        Python float: the LPIPS distance computed by the notebook-level `lpips_model`.
    """
    # Pure evaluation: no gradients are needed for the metric.
    with torch.no_grad():
        distance = lpips_model(img1, img2, normalize=normalize)
    # `.item()` requires a single-element result, i.e. one image pair per call.
    return distance.item()

In [None]:
# LPIPS distance between each target image and the image generated from that
# row's RL prompt.
lpips_sim_ours = []

for index, item in tqdm(compare_promts.iterrows(),total=len(compare_promts)):
    orig_image_t = load_image(item['img_path']).to(device)
    with torch.no_grad():
        pred_imgs = pipe(
                    item['rl_generation'],
                    num_images_per_prompt=1,
                    guidance_scale=7.5,
                    num_inference_steps=50,
                    height=image_length,
                    width=image_length,
                    )
        pred_img = pred_imgs.images[0]
        # Same resize + ToTensor preprocessing as the target image.
        pred_img_t = transform(pred_img).unsqueeze(0).to(device)

        # ToTensor() yields values in [0, 1]; normalize=True makes LPIPS rescale
        # them to the [-1, 1] range it expects.
        lpips_sim_ours.append(lpips_model(orig_image_t, pred_img_t, normalize=True).item())

In [None]:
# Mean LPIPS distance over all rows.
mean_lpips = sum(lpips_sim_ours) / len(lpips_sim_ours)
print(mean_lpips)