In [None]:
# %pip install -r requirements.txt

Note:<br>
This demo is adapted from the LXMERT demo available at https://github.com/huggingface/transformers/tree/main/examples/research_projects/lxmert
<br>and the VisualBERT VQA demo available at https://github.com/huggingface/transformers/tree/main/examples/research_projects/visual_bert

## Import

In [1]:
import PIL
import io
import torch
import numpy as np
import pandas as pd
import utils
import json

In [2]:
from utils import Config
from IPython.display import Image, display
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN

from evaluate import load
from datasets import load_dataset
from tokenizers import Tokenizer
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorWithPadding,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    VisualBertForPreTraining,
)

In [3]:
# Report the PyTorch release and CUDA build so a run can be reproduced.
TORCH_VERSION = ".".join(torch.__version__.split(".")[:2])
# The local version tag after "+" (e.g. "cu117") names the build. Wheels without
# such a tag would otherwise report the whole version string here, so fall back
# to torch.version.cuda, and to "none" when that is unset as well.
CUDA_VERSION = torch.__version__.partition("+")[2] or torch.version.cuda or "none"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device, "; cuda:", CUDA_VERSION)
print("torch:", TORCH_VERSION)

device: cuda:0 ; cuda: cu117
torch: 1.13


## Setup

In [4]:
# Load the pretrained models and helper components used throughout the notebook.
FRCNN_CHECKPOINT = "unc-nlp/frcnn-vg-finetuned"
VISUALBERT_CHECKPOINT = "uclanlp/visualbert-vqa-coco-pre"

# Faster R-CNN detector; the same config object also configures image preprocessing.
frcnn_cfg = Config.from_pretrained(FRCNN_CHECKPOINT)
frcnn = GeneralizedRCNN.from_pretrained(FRCNN_CHECKPOINT, config=frcnn_cfg)

image_preprocess = Preprocess(frcnn_cfg)

# VisualBERT pre-training checkpoint; this is the model handed to the Trainer later on.
visualbert_pre = VisualBertForPreTraining.from_pretrained(VISUALBERT_CHECKPOINT)

# BLEU metric loaded through `evaluate`.
bleu = load("bleu")

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /home/dhiya/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [5]:
# Special tokens registered on the fast tokenizer loaded from model/tokenizer.json.
SPECIAL_TOKENS = {
    "unk_token": "[UNK]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "sep_token": "[SEP]",
    "mask_token": "[MASK]",
}

wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="model/tokenizer.json",
    **SPECIAL_TOKENS,
)

# Locally stored seq2seq model, used by greedy_search further down.
model_text = AutoModelForSeq2SeqLM.from_pretrained("model/stone-seq2seq")

In [6]:
# for visualizing frcnn output
def showarray(a, fmt="jpeg"):
    """Render an array as an inline image in the notebook.

    Args:
        a: Pixel data; values are clipped to [0, 255] and converted to uint8.
        fmt: Image format name handed to PIL when encoding (default "jpeg").
    """
    pixels = np.uint8(np.clip(a, 0, 255))
    # Encode into an in-memory buffer so nothing is written to disk.
    with io.BytesIO() as buffer:
        PIL.Image.fromarray(pixels).save(buffer, fmt)
        encoded = buffer.getvalue()
    display(Image(data=encoded))

In [7]:
# run frcnn
def run_frcnn(URL):
    """Preprocess an image and run it through the Faster R-CNN detector.

    Args:
        URL: Image input, forwarded unchanged to ``image_preprocess``.

    Returns:
        The object returned by ``frcnn``; callers in this notebook read its
        ``"roi_features"`` entry.
    """
    images, sizes, scales_yx = image_preprocess(URL)
    detector_options = dict(
        scales_yx=scales_yx,
        padding="max_detections",
        max_detections=frcnn_cfg.max_detections,
        return_tensors="pt",
    )
    return frcnn(images, sizes, **detector_options)

In [8]:
# get unique word from text
def format_unique_word(text):
    """Drop repeated words from ``text``, keeping first-occurrence order.

    Args:
        text: Whitespace-separated string.

    Returns:
        The distinct words of ``text`` joined by single spaces, in the order
        each word first appears. An empty or all-whitespace input gives "".
    """
    # dict.fromkeys de-duplicates while preserving insertion order in one pass,
    # avoiding the quadratic list.index lookup per word.
    return " ".join(dict.fromkeys(text.split()))

# formatting caption
def formating_text(captions):
    """Merge several captions into one string of unique words.

    Args:
        captions: Iterable of caption strings.

    Returns:
        The captions joined with spaces and passed through
        ``format_unique_word``.
    """
    return format_unique_word(" ".join(captions))

## Load captions

In [9]:
# The first column of caption.txt is an unnamed row index saved with the file;
# read it as the index so it does not show up as an "Unnamed: 0" data column.
captions = pd.read_csv('dataset/caption.txt', sep=';', index_col=0)

print(captions.shape[0])
captions.head(10)

660


Unnamed: 0,image,caption
0,1.jpg,Singkapan batuan sedimen klastik dengan bidang...
1,1.jpg,Singkapan batuan sedimen klastik dengan bidang...
2,1.jpg,Singkapan batuan sedimen klastik dan batulumpu...
3,1.jpg,batulumpur karbonatan dan Singkapan batuan sed...
4,1.jpg,Singkapan batuan sedimen klastik dengan bidang...
5,2.jpg,Pecahan koral
6,2.jpg,Pecahan koral
7,2.jpg,Pecahan koral
8,2.jpg,Pecahan koral
9,2.jpg,Pecahan koral


<b>-- Skip the cells below if you already have the image metadata files --
<br>The cells below generate the image metadata files</b>

In [None]:
# Join all captions of an image into one comma-separated string. The join is
# restricted to the "caption" column so non-text columns (such as a numeric
# index column) are never fed to str.join.
caption_group = captions.groupby('image')[['caption']].agg(', '.join)

# One record per image, in the {"file_name", "text"} layout written to metadata.jsonl.
metadata_item = [
    {"file_name": file_name, "text": text}
    for file_name, text in caption_group['caption'].items()
]

# Preview a few records rather than dumping the whole list.
metadata_item[:5]

In [None]:
# Serialise the records as JSON Lines: one JSON object per line.
with open("metadata.jsonl", 'w') as f:
    f.writelines(json.dumps(item) + "\n" for item in metadata_item)

## Train model

<b>-- Skip the cells below if you already have the trained model files --
<br>The cells below train the VisualBERT model</b>

In [14]:
# Load the train/test image folders with the "imagefolder" builder.
dataset = load_dataset(
    path="imagefolder",
    data_files=dict(
        train="dataset/mini_data/train/**",
        test="dataset/mini_data/test/**",
    ),
)

Resolving data files:   0%|          | 0/113 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/21 [00:00<?, ?it/s]

Using custom data configuration default-b042578067b5d1d0


Downloading and preparing dataset imagefolder/default to /home/dhiya/.cache/huggingface/datasets/imagefolder/default-b042578067b5d1d0/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f...
                

Downloading data files #0:   0%|          | 0/8 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #12:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/7 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

                

Downloading data files #12:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #8:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #14:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #1:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #5:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #0:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #6:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #11:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #13:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #7:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #4:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #2:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #10:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #3:   0%|          | 0/2 [00:00<?, ?obj/s]

Downloading data files #15:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files #9:   0%|          | 0/1 [00:00<?, ?obj/s]

Downloading data files: 0it [00:00, ?it/s]

Extracting data files: 0it [00:00, ?it/s]

Generating train split:   0%|          | 0/112 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/20 [00:00<?, ? examples/s]

Dataset imagefolder downloaded and prepared to /home/dhiya/.cache/huggingface/datasets/imagefolder/default-b042578067b5d1d0/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def preprocess_function(examples):
    """Turn one dataset example into VisualBERT pre-training inputs.

    Args:
        examples: Mapping with a "text" entry (caption string) and an "image"
            entry (forwarded to ``run_frcnn``).

    Returns:
        The tokenizer encoding of the de-duplicated caption, extended with
        "visual_embeds", "visual_attention_mask", "labels" and
        "sentence_image_labels".
    """
    text_caption = format_unique_word(examples["text"])

    output_dict = run_frcnn(examples["image"])
    # NOTE(review): presumably shaped (1, num_boxes, feature_dim) - confirm against frcnn.
    features = output_dict.get("roi_features")

    inputs = wrapped_tokenizer(
        text_caption,
        padding="max_length",
        max_length=20,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
    )

    # Drop leading singleton dimensions so each example is stored unbatched.
    visual_embeds = features.squeeze(0).squeeze(0)
    visual_attention_mask = torch.ones(features.shape[:-1]).squeeze(0)

    inputs.update(
        {
            "visual_embeds": visual_embeds,
            "visual_attention_mask": visual_attention_mask,
        }
    )

    # Labels span the text tokens plus features.shape[-2] visual positions.
    max_length = len(inputs["input_ids"]) + features.shape[-2]
    # truncation=True keeps the labels at exactly max_length; without it a long
    # caption yields labels longer than the combined text + visual sequence.
    labels = wrapped_tokenizer(
        text_caption, padding="max_length", max_length=max_length, truncation=True
    )["input_ids"]
    sentence_image_labels = torch.tensor(1).unsqueeze(0)  # Batch_size

    feats_info = {"labels": labels, "sentence_image_labels": sentence_image_labels}
    inputs.update(feats_info)
    return inputs

# Encode both splits with the same preprocessing function (train first, then test).
datasets_encoded_train, datasets_encoded_test = (
    dataset[split].map(preprocess_function) for split in ("train", "test")
)

In [None]:
def compute_metrics(eval_preds):
    """Compute BLEU and mean prediction length for a Trainer evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair. Either element may be a
            tuple, in which case its first item is used. Predictions may be
            token ids or per-token vocabulary scores (3-D), which are reduced
            with argmax over the last axis.

    Returns:
        dict with "bleu" (corpus BLEU from the ``evaluate`` metric) and
        "gen_len" (mean count of non-padding predicted tokens), both rounded
        to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    if isinstance(labels, tuple):
        labels = labels[0]

    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if preds.ndim == 3:
        # Scores over the vocabulary: pick the most likely token per position.
        preds = preds.argmax(-1)

    pad_token_id = wrapped_tokenizer.pad_token_id
    # -100 marks ignored label positions and cannot be decoded; map it to padding.
    labels = np.where(labels != -100, labels, pad_token_id)

    decoded_preds = wrapped_tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = wrapped_tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: strip whitespace, one reference list per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    # The `evaluate` BLEU metric reports its score under the "bleu" key.
    result = {"bleu": result["bleu"]}

    prediction_lens = [np.count_nonzero(pred != pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Pads tokenizer-produced fields to a common length when batching.
data_collator = DataCollatorWithPadding(tokenizer=wrapped_tokenizer)

# Log and evaluate once per epoch; checkpoints and logs go under ./trainer/.
training_args = TrainingArguments(
    num_train_epochs=100,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    output_dir="./trainer/results",
    logging_dir="./trainer/logs",
)

trainer = Trainer(
    model=visualbert_pre,
    tokenizer=wrapped_tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=datasets_encoded_train,
    eval_dataset=datasets_encoded_test,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Evaluate on the eval dataset given to the Trainer; metrics come from compute_metrics.
trainer.evaluate()

In [None]:
predicts = trainer.predict(datasets_encoded_test)
# Printed explicitly: a bare expression in the middle of a cell is never displayed.
print("Number of predictions:", len(predicts.predictions[0]))

# The first prediction output holds per-token scores; take the best token per position.
test_predictions = predicts.predictions[0].argmax(-1)

for i, item in enumerate(test_predictions, start=1):
    print(f"\nPrediction {i}:\n {wrapped_tokenizer.decode(item)}\n")

In [None]:
# Save the trainer's model to model/stone-visualbert; the Predictions section reloads it from there.
trainer.save_model("model/stone-visualbert")

## Predictions

In [None]:
# Load the fine-tuned VisualBERT weights from the local model/stone-visualbert directory.
trained_model = VisualBertForPreTraining.from_pretrained("model/stone-visualbert")

In [None]:
def test_visualbert_model(image_feature, caption):
    """Run the fine-tuned VisualBERT model on one image/caption pair.

    Args:
        image_feature: Visual feature tensor, passed to the model as
            ``visual_embeds``. Its second-to-last dimension is added to the
            text length to size the labels.
        caption: Caption string; tokenized and padded/truncated to 20 tokens
            for the model input.

    Returns:
        The output object of ``trained_model``; the caller in this notebook
        reads ``prediction_logits`` from it.
    """
    inputs = wrapped_tokenizer(
        caption,
        padding="max_length",
        max_length=20,
        truncation=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors="pt",
    )

    inputs.update(
        {
            "visual_embeds": image_feature,
            "visual_attention_mask": torch.ones(image_feature.shape[:-1]),
            "output_attentions": False,
        }
    )

    # Labels cover the text positions plus the visual positions.
    max_length = inputs["input_ids"].shape[-1] + image_feature.shape[-2]
    # truncation=True keeps the labels at exactly max_length; without it a long
    # caption yields labels longer than the combined text + visual sequence.
    labels = wrapped_tokenizer(
        caption,
        return_tensors="pt",
        padding="max_length",
        max_length=max_length,
        truncation=True,
    )["input_ids"]
    sentence_image_labels = torch.tensor(1).unsqueeze(0)  # Batch_size

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = trained_model(
            **inputs,
            labels=labels,
            sentence_image_labels=sentence_image_labels,
        )

    return outputs

In [None]:
def greedy_search(text):
    """Rewrite ``text`` with the seq2seq model, generating at most 20 tokens.

    Args:
        text: Input string, encoded with ``wrapped_tokenizer``.

    Returns:
        The first generated sequence decoded to a string, with special
        tokens removed.
    """
    encoded_text = wrapped_tokenizer.encode(text, return_tensors='pt')
    # generation stops once the output length reaches 20
    generated = model_text.generate(encoded_text, max_length=20)
    return wrapped_tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
data_test_path = 'dataset/Flickr8K_Text/Flickr_8k.testImages.txt'

# One image file name per line. The context manager closes the file even if
# reading fails, and blank lines (e.g. from a trailing newline) are dropped so
# they are not treated as image names later.
with open(data_test_path, "r") as my_file:
    test_list = [line.strip() for line in my_file if line.strip()]

In [None]:
# Caption every test image, score it with BLEU against its reference captions,
# and report the mean score over the images that were actually evaluated.
bleu_total = 0.0
evaluated_count = 0

for file_name in test_list:
    file_name = file_name.strip()
    if not file_name:
        # Blank entry (e.g. from a trailing newline in the list file): not an image.
        continue

    print("File : ", file_name)

    image_caption = captions.loc[captions["image"] == file_name, "caption"]
    list_image_caption = [caption.strip() for caption in image_caption]
    if not list_image_caption:
        # BLEU needs at least one reference; skip images without captions.
        print("No reference captions found, skipping.\n")
        continue
    formatted_image_caption = formating_text(list_image_caption)

    file_image_path = f'dataset/Flicker8k_Dataset/{file_name}'

    # The context manager releases the image file handle after each iteration.
    with PIL.Image.open(file_image_path) as img:
        output_dict = run_frcnn(img)
        image_feature = output_dict.get("roi_features")

        outputs_model = test_visualbert_model(image_feature, formatted_image_caption)

        prediction_logits = outputs_model.prediction_logits.argmax(-1)
        predict_caption = wrapped_tokenizer.decode(prediction_logits[0], skip_special_tokens=True)
        predict_caption_greedy = greedy_search(predict_caption)

        predictions = [predict_caption_greedy]
        # One list of reference strings per prediction.
        references = [list_image_caption]
        bleu_result = bleu.compute(predictions=predictions, references=references)
        bleu_total += bleu_result["bleu"]
        evaluated_count += 1

        display(img)

    print("Caption:")
    for i, caption in enumerate(list_image_caption, start=1):
        print(f"{i}. {caption}")

    print("\nFormatted caption:\n", formatted_image_caption)

    print(f"\nPrediction:\n {predict_caption}")

    print(f"\nGreedy search:\n {predict_caption_greedy}\n")

    print(f"BLEU Score:\n {bleu_result}\n")
    print(100 * '-' + "\n")

# Average over evaluated images only; guard against an empty evaluation set.
bleu_avg = bleu_total / evaluated_count if evaluated_count else 0.0
print(f"Avg. BLEU Score: {bleu_avg}")