In [1]:
import PIL
from PIL import Image
import torch
import numpy as np
from diffusers import StableDiffusionImg2ImgPipeline, DPMSolverMultistepScheduler

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Source image for the img2img edit, converted to 3-channel RGB.
input_image = Image.open("../media/input_images/1041.png").convert("RGB")
# Text prompt that steers the edit.
input_prompt = "a pattern of orange and blue flowers with leaves on a white background, luxury brand, Tom Lowell style, dense brushstrokes, drawing technique Tempera,  high quality, smooth, colorful, vibrant colors, high contrast, beautiful, dynamic lighting, 16k, high resolution, painting, digital illustration, white background, trending, bloom, wallpaper, colourful,  texture"

# Load the weights directly in half precision instead of loading them in
# fp32 and casting afterwards: this avoids holding a full-precision copy in
# host memory and does not depend on the `dtype=` keyword of `pipe.to`.
pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
    "/home/sd_models/deliberate_v2/",
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")

Loading pipeline components...:  14%|█▍        | 1/7 [00:00<00:01,  4.28it/s]Some weights of StableDiffusionSafetyChecker were not initialized from the model checkpoint at /home/sd_models/deliberate_v2/safety_checker and are newly initialized: ['vision_model.vision_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Loading pipeline components...: 100%|██████████| 7/7 [00:05<00:00,  1.21it/s]


In [3]:
# Fixed seed so the generated edit is reproducible across runs.
seed = 0
generator = torch.manual_seed(seed)

# Parallel accumulators: index i of each list describes one evaluated example.
# They are (re)initialised here, so re-running this cell starts from scratch.
input_images = []
original_captions = []
modified_captions = []
edited_images = []

# Run the img2img pipeline on the input image, guided by the prompt, and keep
# the first (only) returned PIL image.
edit_instruction = input_prompt
edited_image = pipe(
    prompt=edit_instruction,
    image=input_image,
    output_type="pil",
    generator=generator,
).images[0]
edited_image.save("output.png")


100%|██████████| 40/40 [00:08<00:00,  4.73it/s]


In [4]:
  # Register one evaluation example: the source image (as a NumPy array), the
  # caption describing it, the caption describing the edit, and the edited image.
  input_images += [np.array(input_image)]
  original_captions += [
      "a pattern of blue and brown flowers on a white background, ffffound, pixiv, peonies, beige, high quality material bssrdf, motif, uncompressed png, aesthetic / a riot in mars, rococo fashion, brown red blue, sad motif"
  ]
  modified_captions += [
      "a pattern of orange and blue flowers on a white background, ffffound, kitsch fashion, bushes, inspired by Alfred Jensen, 中 元 节, 2 0 1 4. modern attire, blue - print, motif, cinnamon #b57e59 skin color, honeysuckle, azimov"
  ]
  edited_images += [edited_image]

In [5]:
from transformers import (
    CLIPTokenizer,
    CLIPTextModelWithProjection,
    CLIPVisionModelWithProjection,
    CLIPImageProcessor,
)

# All four CLIP components are loaded from the same checkpoint identifier.
clip_id = "openai/clip-vit-large-patch14"

# Text branch: tokenizer plus projection-headed text encoder on the GPU.
tokenizer = CLIPTokenizer.from_pretrained(clip_id)
text_encoder = CLIPTextModelWithProjection.from_pretrained(clip_id)
text_encoder = text_encoder.to("cuda")

# Image branch: preprocessor plus projection-headed vision encoder on the GPU.
image_processor = CLIPImageProcessor.from_pretrained(clip_id)
image_encoder = CLIPVisionModelWithProjection.from_pretrained(clip_id)
image_encoder = image_encoder.to("cuda")

Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModelWithProjection: ['vision_model.encoder.layers.14.layer_norm1.weight', 'vision_model.encoder.layers.9.layer_norm2.bias', 'vision_model.encoder.layers.2.mlp.fc1.weight', 'vision_model.encoder.layers.6.self_attn.q_proj.weight', 'vision_model.encoder.layers.20.mlp.fc2.bias', 'vision_model.encoder.layers.15.layer_norm1.bias', 'vision_model.encoder.layers.8.mlp.fc1.weight', 'vision_model.encoder.layers.0.mlp.fc1.weight', 'vision_model.encoder.layers.14.mlp.fc2.weight', 'vision_model.encoder.layers.20.self_attn.v_proj.bias', 'vision_model.encoder.layers.4.layer_norm1.weight', 'vision_model.encoder.layers.18.layer_norm2.weight', 'vision_model.encoder.layers.12.layer_norm1.weight', 'vision_model.encoder.layers.1.self_attn.k_proj.bias', 'vision_model.encoder.layers.8.self_attn.q_proj.weight', 'vision_model.encoder.layers.6.self_attn.k_proj.bias', 'vision_model.encoder.layers.4.layer

In [6]:
import torch.nn as nn
import torch.nn.functional as F


class DirectionalSimilarity(nn.Module):
    """Directional similarity between an image edit and a caption edit.

    Both images and both captions are embedded with the supplied encoders,
    each embedding is L2-normalised, and the score is the cosine similarity
    between the image-embedding difference and the text-embedding difference.

    Inputs are moved to the device the corresponding encoder lives on, so the
    metric works wherever the encoders were placed (CPU, any GPU index).

    Args:
        tokenizer: Callable turning text into an object exposing ``input_ids``.
        text_encoder: Callable whose output exposes ``text_embeds``.
        image_processor: Callable returning a mapping with ``"pixel_values"``.
        image_encoder: Callable whose output exposes ``image_embeds``.
    """

    def __init__(self, tokenizer, text_encoder, image_processor, image_encoder):
        super().__init__()
        self.tokenizer = tokenizer
        self.text_encoder = text_encoder
        self.image_processor = image_processor
        self.image_encoder = image_encoder

    @staticmethod
    def _device_of(encoder):
        """Return the device of ``encoder``'s first parameter.

        Falls back to ``"cuda"`` (the previously hard-coded target) when the
        encoder exposes no parameters to inspect.
        """
        try:
            return next(encoder.parameters()).device
        except (AttributeError, StopIteration):
            return torch.device("cuda")

    def preprocess_image(self, image):
        """Run the image processor and place the pixels on the image encoder's device.

        Returns:
            dict with a single ``"pixel_values"`` tensor, ready to be splatted
            into the image encoder.
        """
        image = self.image_processor(image, return_tensors="pt")["pixel_values"]
        return {"pixel_values": image.to(self._device_of(self.image_encoder))}

    def tokenize_text(self, text):
        """Tokenize ``text``, padded/truncated to the tokenizer's maximum length.

        Returns:
            dict with a single ``"input_ids"`` tensor on the text encoder's device.
        """
        inputs = self.tokenizer(
            text,
            max_length=self.tokenizer.model_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {"input_ids": inputs.input_ids.to(self._device_of(self.text_encoder))}

    def encode_image(self, image):
        """Embed ``image`` and L2-normalise the embedding along dim 1."""
        preprocessed_image = self.preprocess_image(image)
        image_features = self.image_encoder(**preprocessed_image).image_embeds
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        return image_features

    def encode_text(self, text):
        """Embed ``text`` and L2-normalise the embedding along dim 1."""
        tokenized_text = self.tokenize_text(text)
        text_features = self.text_encoder(**tokenized_text).text_embeds
        text_features = text_features / text_features.norm(dim=1, keepdim=True)
        return text_features

    def compute_directional_similarity(self, img_feat_one, img_feat_two, text_feat_one, text_feat_two):
        """Cosine similarity between the image-space and text-space edit directions."""
        sim_direction = F.cosine_similarity(img_feat_two - img_feat_one, text_feat_two - text_feat_one)
        return sim_direction

    def forward(self, image_one, image_two, caption_one, caption_two):
        """Score how well the change from ``image_one`` to ``image_two`` follows
        the change from ``caption_one`` to ``caption_two``.

        Returns:
            Tensor of cosine similarities between the two edit directions
            (one value per row of the embeddings).
        """
        img_feat_one = self.encode_image(image_one)
        img_feat_two = self.encode_image(image_two)
        text_feat_one = self.encode_text(caption_one)
        text_feat_two = self.encode_text(caption_two)
        directional_similarity = self.compute_directional_similarity(
            img_feat_one, img_feat_two, text_feat_one, text_feat_two
        )
        return directional_similarity

In [9]:
dir_similarity = DirectionalSimilarity(tokenizer, text_encoder, image_processor, image_encoder)
scores = []

# The four lists are parallel; a length mismatch means an example was only
# partially recorded, so fail loudly rather than score misaligned pairs.
assert (
    len(input_images) == len(original_captions) == len(edited_images) == len(modified_captions)
), "input_images, original_captions, edited_images and modified_captions must have equal length"

# Pure inference: skip autograd bookkeeping for the four CLIP forward passes
# each example needs (the scores were detached immediately anyway).
with torch.no_grad():
    for original_image, original_caption, edited_image, modified_caption in zip(
        input_images, original_captions, edited_images, modified_captions
    ):
        similarity_score = dir_similarity(original_image, edited_image, original_caption, modified_caption)
        scores.append(similarity_score.item())

print(f"CLIP directional similarity: {np.mean(scores)}")

CLIP directional similarity: 0.43593454360961914
