# Evaluation Usage Example

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell runs so that edits to the project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
import numpy as np
from vtt.data.caption_preprocessing import load_tokenizer, load_and_clean_captions
from vtt.data.data_loader import load_split_datasets
from vtt.models.decoder import build_decoder_model
from vtt.evaluation.evaluate import evaluate_model
from vtt.evaluation.evaluate import evaluate_captions

## Sample

In [3]:
# Toy example: one candidate caption per image id ...
generated = {
    "image1.jpg": "a man riding a bicycle",
    "image2.jpg": "a cat is lying on a sofa",
}

# ... and the list of reference captions each candidate is compared against.
ground_truths = {
    "image1.jpg": [
        "a man riding a bike",
        "a person on a bicycle in motion",
    ],
    "image2.jpg": [
        "a cat sitting on a couch",
        "a feline on furniture",
    ],
}

# Score the candidates, then print every returned metric to four decimals.
scores = evaluate_captions(ground_truths, generated)
for metric in scores:
    value = scores[metric]
    print(f"{metric}: {value:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BLEU-1: 0.7857
BLEU-2: 0.7182
BLEU-3: 0.5169
BLEU-4: 0.4293
METEOR: 0.7936
BERTScore_P: 0.9723
BERTScore_R: 0.9735
BERTScore_F1: 0.9729


## Caption Prediction on Test Set

### Fetch the Test Dataset for Evaluation

In [14]:
# Locations of the preprocessed artifacts for the chosen dataset.
dataset_name = "flickr8k"
features_path = f"../data/processed/{dataset_name}_features.npz"
captions_path = f"../data/processed/{dataset_name}_padded_caption_sequences.npz"
tokenizer_path = f"../data/processed/{dataset_name}_tokenizer.json"

tokenizer = load_tokenizer(tokenizer_path)

# Kept as the object returned by np.load; it is handed to evaluate_model
# later in the notebook.
features = np.load(features_path)

# Options forwarded to load_split_datasets alongside the two input paths.
split_options = dict(
    batch_size=64,
    val_split=0.15,
    test_split=0.10,
    shuffle=True,
    buffer_size=1000,
    seed=42,
    cache=True,
    return_numpy=False,
)

train_ds, val_ds, test_ds = load_split_datasets(
    features_path=features_path,
    captions_path=captions_path,
    **split_options,
)

[INFO] Tokenizer loaded from JSON file: ../data/processed/flickr8k_tokenizer.json

--- Dataset Split Sizes (number of individual samples) ---
Total samples loaded: 38008
Train samples: 28507
Validation samples: 5701
Test samples: 3800
----------------------------------------------------------



In [15]:

# Infer the caption length from the second axis of the input-caption tensor
# in a single training batch.
# NOTE: reading only one batch from a cached pipeline makes TensorFlow log a
# "did not fully read the dataset being cached" warning; it is harmless here.
first_batch = next(iter(train_ds.take(1)), None)
if first_batch is None:
    # Fail fast: a bare for-loop over an empty dataset would be skipped
    # silently, leaving max_caption_len undefined (or stale from an earlier
    # run of this cell in the same kernel).
    raise ValueError("train_ds yielded no batches; cannot infer max_caption_len")
(image_tensor, input_caption, _), _ = first_batch
max_caption_len = input_caption.shape[1]

# Vocabulary size as recorded on the tokenizer.
# NOTE(review): presumably a Keras-style tokenizer, where num_words may be
# None when no cap was configured - confirm it is set before building the model.
vocab_size = tokenizer.num_words

2025-07-18 14:14:59.103519: W tensorflow/core/kernels/data/cache_dataset_ops.cc:916] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


### Load the Saved Model Weights

In [12]:
# Location of the saved decoder weights.
checkpoint_path = "../models/flickr8k_decoder_weights.weights.h5"

# Build the decoder from the vocabulary size and caption length derived
# earlier in the notebook, then restore its weights from disk.
model = build_decoder_model(
    vocab_size=vocab_size,
    max_caption_len=max_caption_len,
)
model.load_weights(checkpoint_path)

print("Loaded pretrained weights")


Loaded pretrained weights


### Evaluate Scores for Test Dataset

In [9]:
# Reference captions are read from the raw captions CSV for this dataset.
raw_captions_csv = f"../data/raw/{dataset_name}_captions.csv"
references_dict = load_and_clean_captions(raw_captions_csv)

# Run the evaluation over the test split. This is the slow step of the
# notebook (roughly 24 minutes in the recorded run).
evaluation_inputs = {
    "model": model,
    "tokenizer": tokenizer,
    "features": features,
    "test_dataset": test_ds,
    "references_dict": references_dict,
    "max_len": max_caption_len,
}
scores = evaluate_model(**evaluation_inputs)

Generating Captions from Dataset: 100%|██████████| 60/60 [24:19<00:00, 19.78s/it]2025-07-18 13:32:50.523074: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
Generating Captions from Dataset: 100%|██████████| 60/60 [24:19<00:00, 24.32s/it]
W0718 13:32:57.799000 12095 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Report each evaluation metric on its own line, to four decimal places.
print("Evaluation Scores:")
for metric in scores:
    score = scores[metric]
    print(f"{metric}: {score:.4f}")

Evaluation Scores:
BLEU-1: 0.4705
BLEU-2: 0.2836
BLEU-3: 0.1710
BLEU-4: 0.1126
METEOR: 0.2661
BERTScore_P: 0.8854
BERTScore_R: 0.8552
BERTScore_F1: 0.8699
