# Caption Preprocessing Usage Example

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from vtt.data.caption_preprocessing import *

2025-07-14 19:44:15.687263: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-14 19:44:15.696831: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752536655.710890  129159 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752536655.714321  129159 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752536655.722246  129159 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [3]:
# Dataset selector: the name is interpolated into the raw captions path and
# into both processed output paths below.
# Alternative value kept from the original cell: "flickr8k"
dataset_name = "flickr30k"

# Output locations for the processed artifacts (step 7).
padded_sequences_path = (
    f"../data/processed/{dataset_name}_padded_caption_sequences.npz"
)
tokenizer_path = f"../data/processed/{dataset_name}_tokenizer.json"

# 1) Read the raw captions CSV and clean it.
captions_path = f"../data/raw/{dataset_name}_captions.csv"
captions_dict = load_and_clean_captions(captions_path)

# 2) Frequency-based filtering (threshold min_word_freq=5); the call also
#    returns the resulting vocabulary.
filtered_captions, vocab = filter_captions_by_frequency(
    captions_dict,
    min_word_freq=5,
)

# 3) Fit the tokenizer on the filtered captions (num_words=10000).
tokenizer = fit_tokenizer(filtered_captions, num_words=10000)

# 4) Turn the filtered captions into sequences of token IDs.
seqs = captions_to_sequences(filtered_captions, tokenizer)

# 5) Choose the padding length from the 0.95 quantile of the sequences.
max_length = compute_max_caption_length(seqs, quantile=0.95)

# 6) Pad every sequence to that common length.
padded_seqs = pad_caption_sequences(seqs, max_length=max_length)

# 7) Persist the padded sequences and the tokenizer.
# NOTE(review): in the recorded run both helpers reported
# "File already exists and overwrite=False", so nothing was rewritten and the
# files on disk may predate the current settings. Presumably the helpers accept
# an `overwrite` argument -- confirm against their signatures.
save_padded_sequences(padded_seqs, padded_sequences_path)
save_tokenizer(tokenizer, tokenizer_path)

[INFO] File already exists and overwrite=False: ../data/processed/flickr30k_padded_caption_sequences.npz
[INFO] File already exists and overwrite=False: ../data/processed/flickr30k_tokenizer.json


In [4]:
# Load the saved data and inspect a few examples
loaded_sequences = load_padded_sequences(
    f"../data/processed/{dataset_name}_padded_caption_sequences.npz"
)
tokenizer_loaded = load_tokenizer(f"../data/processed/{dataset_name}_tokenizer.json")

# Inspect up to five caption sequences associated with the reference image.
# The ID is hard-coded, so it must be a key of the sequences loaded for the
# currently selected dataset_name.
image_id = "1000092795.jpg"  # example image ID
image_sequences = loaded_sequences.get(image_id)
if image_sequences is None:
    # Report a missing ID explicitly; falling back to an empty list would make
    # the cell print nothing and hide the mismatch.
    print(
        f"[WARN] No caption sequences found for image ID {image_id!r} "
        f"in dataset {dataset_name!r}"
    )
else:
    for i, sequence in enumerate(image_sequences[:5]):
        print(f"Caption {i}:", sequence)

[INFO] Tokenizer loaded from JSON file: ../data/processed/flickr30k_tokenizer.json
Caption 0: [3, 14, 22, 310, 12, 2182, 116, 195, 19, 63, 165, 27, 325, 73, 5, 6, 471, 4, 0, 0, 0, 0]
Caption 1: [3, 14, 22, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Caption 2: [3, 14, 30, 5, 51, 262, 16, 35, 5, 2, 471, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Caption 3: [3, 2, 8, 5, 2, 28, 23, 35, 5, 2, 686, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Caption 4: [3, 14, 457, 786, 586, 15, 134, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
