# Load Processed Data

Loads the precomputed image features and the padded caption sequences, then combines them into a batched TensorFlow dataset (`tf.data.Dataset`) of (image feature, caption sequence) pairs.

## Setup and Imports

In [1]:
# Confirm environment: print the active conda environment and Python version
# so the run can be tied to a specific setup.
# NOTE(review): the saved output of this cell contains absolute local paths;
# consider clearing it before sharing the notebook.
!conda info


     active environment : northeastern
    active env location : /home/curtis/anaconda3/envs/northeastern
            shell level : 2
       user config file : /home/curtis/.condarc
 populated config files : /home/curtis/anaconda3/.condarc
          conda version : 24.9.2
    conda-build version : 24.9.0
         python version : 3.12.7.final.0
                 solver : libmamba (default)
       virtual packages : __archspec=1=skylake
                          __conda=24.9.2=0
                          __glibc=2.39=0
                          __linux=6.6.87.2=0
                          __unix=0=0
       base environment : /home/curtis/anaconda3  (writable)
      conda av data dir : /home/curtis/anaconda3/etc/conda
  conda av metadata url : None
           channel URLs : https://repo.anaconda.com/pkgs/main/linux-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/linux-64
                          https://r

In [2]:
# Reload imported modules automatically before each cell executes, so edits to
# local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from vtt.utils import detect_and_set_device

2025-07-10 18:24:35.206735: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-10 18:24:35.218185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752186275.233925  911290 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752186275.242826  911290 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1752186275.264297  911290 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [4]:
# Detect and set up GPU or use CPU.
# detect_and_set_device is imported from the project-local vtt.utils module;
# whatever it returns is reported below as the device TensorFlow will use.
device_used = detect_and_set_device()
print(f"TensorFlow is configured to use: {device_used}")

No GPU devices found despite TensorFlow being built with CUDA. Using CPU.
TensorFlow is configured to use: CPU


2025-07-10 18:24:38.580987: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


## Combine Features and Sequences

In [5]:
import tensorflow as tf
import numpy as np


def load_features_and_sequences(
    features_path: str,
    captions_path: str,
    batch_size: int = 32,
    shuffle: bool = True,
    buffer_size: int = 1000,
) -> tf.data.Dataset:
    """
    Load image features and padded caption sequences and return a batched tf.data.Dataset.

    Every caption of an image is paired with that image's feature vector, so an
    image with k captions contributes k samples. Image IDs that appear in the
    captions file but not in the features file are skipped.

    Args:
        features_path (str): Path to .npz file with image features, keyed by image ID.
        captions_path (str): Path to .npz file with padded caption sequences,
            keyed by image ID. All captions must have the same length.
        batch_size (int): Number of samples per batch.
        shuffle (bool): Whether to shuffle the dataset.
        buffer_size (int): Buffer size for shuffling. Samples are stored grouped
            by image, and a buffer smaller than the number of samples only
            shuffles locally; pass a larger value for a more uniform shuffle.

    Returns:
        tf.data.Dataset: Batched dataset of (image_features, caption_sequences)
        pairs, with float32 features and int32 token IDs.

    Raises:
        ValueError: If no image ID is present in both files, which would
            otherwise yield a silently empty dataset.
    """
    feature_rows = []
    caption_rows = []

    # Context managers close the underlying .npz archives once the arrays have
    # been read; indexing an NpzFile loads each array eagerly.
    with np.load(features_path) as features_npz:
        # NOTE(security): allow_pickle=True can execute arbitrary code while
        # loading; only use it with caption files from a trusted source.
        with np.load(captions_path, allow_pickle=True) as captions_npz:
            available_ids = set(features_npz.files)

            for img_id in captions_npz.files:
                if img_id not in available_ids:
                    continue  # Skip if image feature is missing

                feature = features_npz[img_id]

                # Repeat the image feature once per caption of that image.
                for caption in captions_npz[img_id]:
                    feature_rows.append(feature)
                    caption_rows.append(caption)

    if not caption_rows:
        raise ValueError(
            f"No image IDs are shared between {features_path!r} and {captions_path!r}; "
            "cannot build a dataset."
        )

    # Stack in NumPy first: handing TensorFlow one contiguous array is much
    # faster than converting a long Python list of small arrays.
    image_features = tf.convert_to_tensor(
        np.stack(feature_rows).astype(np.float32), dtype=tf.float32
    )
    caption_sequences = tf.convert_to_tensor(
        np.stack(caption_rows).astype(np.int32), dtype=tf.int32
    )

    # Build dataset
    dataset = tf.data.Dataset.from_tensor_slices((image_features, caption_sequences))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=buffer_size)

    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [6]:
# NOTE(review): despite their names, these two variables hold file paths
# (strings), not loaded arrays.
image_features = "../data/processed/flickr8k_features.npz"
padded_sequences = "../data/processed/flickr8k_padded_caption_sequences.npz"
# Build the (image feature, caption sequence) dataset using the loader's
# default batch size and shuffling.
dataset = load_features_and_sequences(image_features, padded_sequences)

In [7]:
# Preview five individual samples. The loader returns batches, so unbatch
# first; otherwise every iteration would print a whole batch rather than a
# single (feature vector, caption sequence) pair.
for img_feat, seq in dataset.unbatch().take(5):
    print("Image features shape:", img_feat.shape)
    print("Padded caption sequence:", seq.numpy())
    print("---")

Image features shape: (2048,)
Padded caption sequence: [   3    2   43    5    2   92  172    8  115   52    2  397   13  378
    5   29 5005  690    4]
---
Image features shape: (2048,)
Padded caption sequence: [  3   2  19 304  63   2 189 116   4   0   0   0   0   0   0   0   0   0
   0]
---
Image features shape: (2048,)
Padded caption sequence: [   3    2   38   19  115   63    2  189 2330    4    0    0    0    0
    0    0    0    0    0]
---
Image features shape: (2048,)
Padded caption sequence: [   3    2   38   19  115    6  378   21   62 2330    4    0    0    0
    0    0    0    0    0]
---
Image features shape: (2048,)
Padded caption sequence: [   3    2   38   19    5    2   92  172  304   63    2  189 3267    4
    0    0    0    0    0]
---


2025-07-10 18:28:45.341219: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


## Sandbox

In [None]:
def build_caption_model(
    vocab_size: int,
    max_caption_len: int,
    embedding_dim: int = 256,
    lstm_units: int = 512,
    dropout_rate: float = 0.5,
) -> Model:
    """
    Build an image captioning model using LSTM and pretrained image features.

    The projected image feature is prepended to the embedded caption tokens as
    an extra first time step, and the combined sequence is fed to an LSTM. The
    LSTM returns only its final output, so the model emits a single softmax
    distribution over the vocabulary per sample (i.e. one token ID per sample
    is expected as the training target).

    Args:
        vocab_size (int): Size of the tokenizer vocabulary.
        max_caption_len (int): Length of the caption sequences fed to the model.
        embedding_dim (int): Size of word embeddings (and of the image projection).
        lstm_units (int): Number of LSTM units.
        dropout_rate (float): Dropout rate applied to both the image projection
            and the caption embeddings.

    Returns:
        Model: Compiled Keras model ready for training, with inputs
        [image_features, caption_sequence] and an output of shape
        (batch, vocab_size).
    """
    # Image feature input (2048-dim), projected to the embedding size
    img_input = Input(shape=(2048,), name="image_features")
    img_dense = Dense(embedding_dim, activation="relu")(img_input)
    img_dropout = Dropout(dropout_rate)(img_dense)
    # Turn (batch, embedding_dim) into a one-step sequence (batch, 1, embedding_dim).
    # A Reshape layer is used instead of tf.expand_dims because raw TensorFlow
    # ops cannot be applied to symbolic Keras tensors in Keras 3.
    img_step = tf.keras.layers.Reshape((1, embedding_dim))(img_dropout)

    # Caption sequence input; token ID 0 is treated as padding and masked
    caption_input = Input(shape=(max_caption_len,), name="caption_sequence")
    caption_embed = Embedding(vocab_size, embedding_dim, mask_zero=True)(caption_input)
    caption_dropout = Dropout(dropout_rate)(caption_embed)

    # Combine image + text along the time axis. The default axis (-1) would
    # require matching sequence lengths (1 vs. max_caption_len) and fail.
    merged = Concatenate(axis=1)([img_step, caption_dropout])
    lstm_out = LSTM(lstm_units)(merged)
    output = Dense(vocab_size, activation="softmax")(lstm_out)

    model = Model(inputs=[img_input, caption_input], outputs=output)
    model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
    return model

In [None]:
def prepare_training_dataset(dataset: tf.data.Dataset) -> tf.data.Dataset:
    """
    Shift caption tokens by one position for teacher forcing.

    Each caption [start, w1, w2, ..., wn] is split along its last (token) axis:
        input_seq  = [start, w1, ..., wn-1]
        target_seq = [w1, w2, ..., wn]

    Slicing the last axis (rather than the first) keeps the function correct
    for both unbatched elements of shape (seq_len,) and batched elements of
    shape (batch, seq_len); slicing the first axis of a batch would drop
    samples and misalign inputs with targets.

    Args:
        dataset (tf.data.Dataset): Dataset yielding (image_features, caption) pairs.

    Returns:
        tf.data.Dataset: ((img, input_seq), target_seq) pairs
    """

    def map_fn(img, caption):
        # Ellipsis leaves any leading (batch) axes untouched.
        input_seq = caption[..., :-1]
        target_seq = caption[..., 1:]
        return (img, input_seq), target_seq

    return dataset.map(map_fn).prefetch(tf.data.AUTOTUNE)

In [None]:
# Build and summarize the model
# NOTE(review): `tokenizer` is not defined in any cell visible in this notebook,
# and `padded_sequences` was assigned a file-path string earlier, so
# `padded_sequences.shape[1]` will fail on a fresh kernel. Load the tokenizer
# and the actual caption array (or pass the sizes explicitly) before running.
model = build_caption_model(tokenizer.num_words, padded_sequences.shape[1])
model.summary()

# Prepare dataset
# NOTE(review): the ".../" paths look like placeholders and differ from the
# files loaded earlier in this notebook -- confirm the intended files.
train_dataset = load_features_and_sequences(
    features_path=".../flickr8k_features_combined.npz",
    captions_path=".../flickr8k_sequences.npz",
    batch_size=64,
    shuffle=True,
)
# NOTE(review): prepare_training_dataset yields a shifted target *sequence* and
# an input one token shorter than the caption, while build_caption_model ends
# in a single softmax per sample -- confirm the model output and the
# max_caption_len passed above match these shapes before training.
train_dataset = prepare_training_dataset(train_dataset)

# Train the model
history = model.fit(train_dataset, epochs=20)