In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image

from datasets import load_dataset

# Alternative dataset, currently disabled:
# dataset = load_dataset("JotDe/mscoco_20k_unique_imgs")
# Active dataset. Later cells read the "image" and "description" fields of its "train" split.
dataset = load_dataset("google/docci")

In [None]:
model_name = 'BAAI/Bunny-v1_1-Llama-3-8B-V' # or 'BAAI/Bunny-Llama-3-8B-V' or 'BAAI/Bunny-v1_1-4B' or 'BAAI/Bunny-v1_0-4B' or 'BAAI/Bunny-v1_0-3B' or 'BAAI/Bunny-v1_0-3B-zh' or 'BAAI/Bunny-v1_0-2B-zh'
offset_bos = 1 # for Bunny-v1_1-Llama-3-8B-V, Bunny-Llama-3-8B-V, Bunny-v1_1-4B, Bunny-v1_0-4B and Bunny-v1_0-3B-zh
# offset_bos = 0 for Bunny-v1_0-3B and Bunny-v1_0-2B-zh

# Half precision is only selected when a CUDA device is present; on a
# CPU-only machine the weights are loaded as float32 instead.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# create model
# NOTE(security): trust_remote_code=True executes Python code shipped inside the
# model repository. Only use it with repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_dtype,
    device_map='auto',
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True)

In [None]:
def get_concat_h(im1, im2):
    """Return a new RGB image with `im1` on the left and `im2` on the right.

    The canvas is as tall as the taller input, so neither image is cropped
    when the heights differ. Both images are pasted aligned to the top edge;
    the area below the shorter one keeps the canvas' initial fill.

    Args:
        im1: Left image (needs `width`/`height` and must be paste-able).
        im2: Right image.

    Returns:
        A new image of size (im1.width + im2.width, max(im1.height, im2.height)).
    """
    canvas_size = (im1.width + im2.width, max(im1.height, im2.height))
    dst = Image.new('RGB', canvas_size)
    dst.paste(im1, (0, 0))
    dst.paste(im2, (im1.width, 0))
    return dst

def get_concat_v(im1, im2):
    """Return a new RGB image with `im1` on top and `im2` underneath.

    The canvas is as wide as the wider input, so neither image is cropped
    when the widths differ. Both images are pasted aligned to the left edge;
    the area beside the narrower one keeps the canvas' initial fill.

    Args:
        im1: Top image (needs `width`/`height` and must be paste-able).
        im2: Bottom image.

    Returns:
        A new image of size (max(im1.width, im2.width), im1.height + im2.height).
    """
    canvas_size = (max(im1.width, im2.width), im1.height + im2.height)
    dst = Image.new('RGB', canvas_size)
    dst.paste(im1, (0, 0))
    dst.paste(im2, (0, im1.height))
    return dst

In [None]:
import spacy

# Load the "en_core_web_lg" spaCy pipeline. A later cell calls `nlp(...)` on
# dataset descriptions and iterates `doc.sents` to split them into sentences.
nlp = spacy.load("en_core_web_lg")

In [None]:
import random


def get_new_word(word: str) -> str:
    """Ask the language model for a replacement word for `word`.

    A one-shot text prompt (horse -> elephant) is sent to the global `model`
    through the global `tokenizer`, and the model's reply for `word` is returned.

    Args:
        word: The word to be replaced.

    Returns:
        The reply for `word`, stripped of surrounding whitespace and of any
        further "USER:" turn the model may have appended.
    """
    text = f"Only reply with a single word. Replace the word to another random class in COCO. USER: horse ASSISTANT: elephant USER: {word} ASSISTANT:"
    encoded_text = tokenizer(text, return_tensors="pt")
    encoded_text = {k: v.to(model.device) for k, v in encoded_text.items()}
    # generate
    output_ids = model.generate(
        **encoded_text,
        max_new_tokens=32,
        use_cache=True,
        do_sample=False,
    )[0]
    # decode
    decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
    # The prompt itself contains two "ASSISTANT:" markers, so everything after
    # the second one is the reply. Splitting without a trailing space keeps this
    # working when the model does not start its reply with a space.
    reply = decoded.split("ASSISTANT:", 2)[-1]
    # Up to 32 new tokens are generated, so the model may run on into another
    # turn; keep only the text before it.
    return reply.split("USER:", 1)[0].strip()


def evaluate_image(image, caption):
    """Ask the model whether `caption` describes `image` and return its reply.

    Uses the global `model`, `tokenizer` and `offset_bos`.

    Args:
        image: Image passed to `model.process_images`.
        caption: Description to be checked against the image.

    Returns:
        The text after the last "ASSISTANT:" marker in the decoded output,
        stripped of surrounding whitespace.
    """
    prompt = f"Does the description '{caption}' match the given image? Reply with 'yes' if it matches else 'no'."
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
    # Split only at the template's own <image> tag (the first occurrence) so a
    # caption that happens to contain "<image>" is not cut apart and dropped.
    before_image, after_image = (
        tokenizer(chunk).input_ids for chunk in text.split("<image>", 1)
    )
    # -200 is inserted where the <image> tag stood. The first `offset_bos`
    # tokens of the second chunk are dropped (presumably the BOS token the
    # tokenizer prepends -- see the note next to `offset_bos`).
    input_ids = (
        torch.tensor(
            before_image + [-200] + after_image[offset_bos:], dtype=torch.long
        )
        .unsqueeze(0)
        # Follow the model's device instead of assuming CUDA, matching the
        # image tensor below.
        .to(model.device)
    )
    image_tensor = model.process_images([image], model.config).to(
        dtype=model.dtype, device=model.device
    )
    # generate
    output_ids = model.generate(
        input_ids,
        images=[image_tensor],
        max_new_tokens=16,
        use_cache=True,
        do_sample=False,
        repetition_penalty=1.0,  # increase this to avoid chattering
    )[0]
    # decode
    # NOTE(review): the upstream Bunny quickstart decodes only
    # output_ids[input_ids.shape[1]:]; confirm whether decoding the full
    # sequence (including the -200 placeholder) is safe for this tokenizer.
    decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
    # Split on the marker without a trailing space so a reply that does not
    # start with a space is still separated from the prompt.
    return decoded.split("ASSISTANT:")[-1].strip()


# Number of (A, B) example pairs to probe and the per-sentence chance of
# replacing a sentence of A's description with one from B's description.
NUM_TRIALS = 100
SWAP_PROBABILITY = 0.8

train_split = dataset["train"]

# For each trial: build a "modified" description for image A by mixing in
# sentences from a different example B, then ask the model about both the
# original and the modified description.
# (An earlier variant of this experiment replaced random nouns via
# get_new_word() instead of swapping whole sentences.)
for _ in range(NUM_TRIALS):
    # Draw two distinct rows so B is never the same example as A.
    idx_a, idx_b = random.sample(range(len(train_split)), 2)
    example_A = train_split[idx_a]
    example_B = train_split[idx_b]

    sentences_A = list(nlp(example_A["description"]).sents)
    sentences_B = list(nlp(example_B["description"]).sents)
    random.shuffle(sentences_B)
    random.shuffle(sentences_A)

    sentences = []
    swapped = 0
    for s in sentences_A:
        if sentences_B and random.random() < SWAP_PROBABILITY:
            sentences.append(sentences_B.pop(0))
            swapped += 1
        else:
            sentences.append(s)
    # Guarantee the modified description really differs from A's: if chance
    # left every sentence untouched, force a single replacement.
    if swapped == 0 and sentences and sentences_B:
        sentences[random.randrange(len(sentences))] = sentences_B.pop(0)
    text = " ".join(str(s) for s in sentences)

    true_pred = evaluate_image(example_A["image"], example_A["description"])
    pred = evaluate_image(example_A["image"], text)
    print(f"Is the original caption identified as matching: {true_pred}")
    print(f"Is the modified caption identified as matching: {pred}")

In [None]:
import glob
import json
import re
from pathlib import Path


def classify_answer(answer):
    """Map a free-form model reply to "yes", "no" or "unknown".

    The first stand-alone word "yes" or "no" (case-insensitive) decides the
    label. Matching whole words avoids false hits such as "eyes" -> yes or
    "know" -> no.
    """
    match = re.search(r"\b(yes|no)\b", answer.lower())
    return match.group(1) if match else "unknown"


def build_input_ids(prompt):
    """Tokenize the chat template around `prompt` into a (1, n) id tensor on the model's device."""
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
    # Split only at the template's own <image> tag (the first occurrence).
    before_image, after_image = (
        tokenizer(chunk).input_ids for chunk in text.split("<image>", 1)
    )
    # -200 is inserted where the <image> tag stood. The first `offset_bos`
    # tokens of the second chunk are dropped (presumably the BOS token).
    token_ids = before_image + [-200] + after_image[offset_bos:]
    return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(model.device)


# Sorted so the row order of `results` is the same on every run.
example_paths = sorted(glob.glob("data/*.json"))

# One tuple per (example, caption variant):
# (example_idx, caption, caption_origin, result, output_text, score)
results = []
for example_path in example_paths:
    with open(example_path, encoding="utf-8") as f:
        example = json.load(f)
    example_idx = Path(example_path).stem
    # Both images are copied into the concatenated canvas, so the source
    # files can be closed right away.
    with Image.open("data/imgs/" + example_idx + "_a.png") as a_img, Image.open(
        "data/imgs/" + example_idx + "_b.png"
    ) as b_img:
        image = get_concat_h(a_img, b_img)
    image_tensor = model.process_images([image], model.config).to(
        dtype=model.dtype, device=model.device
    )
    # text prompt
    for caption_origin in ["A", "B", "A_B", "B_A"]:
        caption = example[caption_origin]
        score = example["plausibility_scores"][caption_origin]
        print(caption_origin, caption)

        prompt = f"Description: {caption}\n Does the description match any of the given images? Reply with yes or no."
        input_ids = build_input_ids(prompt)
        # generate
        output_ids = model.generate(
            input_ids,
            images=image_tensor,
            max_new_tokens=32,
            use_cache=True,
            do_sample=False,
            repetition_penalty=1.0,  # increase this to avoid chattering
        )[0]

        # decode
        # Split on the marker without a trailing space: if the split missed,
        # the prompt's own "yes or no" would be scanned by classify_answer.
        output_text = (
            tokenizer.decode(output_ids, skip_special_tokens=True)
            .split("ASSISTANT:")[-1]
            .strip()
        )
        results.append(
            (
                example_idx,
                caption,
                caption_origin,
                classify_answer(output_text),
                output_text,
                score,
            )
        )

In [None]:
import pandas as pd

# Tabulate the collected result tuples, one row per (example, caption variant).
df = pd.DataFrame(
    data=results,
    columns=[
        "example_idx",
        "caption",
        "caption_origin",
        "result",
        "output_text",
        "score",
    ],
)
# Answer distribution for the unmodified captions (origins A and B).
df.query("caption_origin in ['A', 'B']")["result"].value_counts()

In [None]:
# All rows belonging to the unmodified captions (origins A and B).
df.query("caption_origin in ['A', 'B']")

In [None]:
# Answer distribution for the mixed captions (origins A_B and B_A).
df.query("caption_origin in ['A_B', 'B_A']")["result"].value_counts()

In [None]:
# Mixed captions (origins A_B and B_A) that the model nevertheless answered "yes" to.
df.query("caption_origin in ['A_B', 'B_A'] and result == 'yes'")

In [None]:
import seaborn as sns

# A row counts as correct (1) when an unmodified caption (A, B) got "yes" or a
# mixed caption (A_B, B_A) got "no"; every other combination scores 0.
df["correct"] = (
    (df["caption_origin"].isin(["A", "B"]) & df["result"].eq("yes"))
    | (df["caption_origin"].isin(["A_B", "B_A"]) & df["result"].eq("no"))
).astype("int64")

# Flag the mixed captions so the plot can compare them with the unmodified ones.
df["fake_caption"] = df["caption_origin"].isin(["A_B", "B_A"])

sns.barplot(data=df, x="fake_caption", y="correct", hue="score")

In [None]:
# Show the model's reply for the first unmodified-caption row. `.iloc[0]` selects
# by position; a plain `[0]` on this Series would be a lookup of index label 0.
df.loc[df["caption_origin"].isin(["A", "B"]), "output_text"].iloc[0]

In [None]:
# Disabled visual check: rebuild and show the side-by-side image for example 9.
# a_img = Image.open("data/imgs/" + str(9) + "_a.png")
# b_img = Image.open("data/imgs/" + str(9) + "_b.png")
# image = get_concat_h(a_img, b_img)
# image
# Display the results table, including the columns added in earlier cells.
df