In [None]:
import json
import os
import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np

# Paths to your dataset
image_dir = "../../../advance/images"
annotations_path = "../../../advance/vlm.jsonl"

# Load annotations from a JSON Lines file: one JSON object per line.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# captions with non-ASCII characters decode identically on every platform
# instead of depending on the locale default.
annotations = []
with open(annotations_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank separator / trailing lines are not records; json.loads("")
            # would raise, so skip them.
            continue
        annotations.append(json.loads(line))

print(f"Loaded {len(annotations)} annotations.")

# Load images and corresponding annotations
def load_data(annotations, image_dir, target_size=(224, 224)):
    """Load one aligned (image, caption, bbox) sample per annotated region.

    Args:
        annotations: Iterable of records of the form
            ``{"image": <file name>, "annotations": [{"caption": ..., "bbox": [x, y, w, h]}, ...]}``.
            Boxes are assumed to be in pixels of the image file as stored on
            disk -- TODO confirm against the dataset description.
        image_dir: Directory that the ``image`` file names are relative to.
        target_size: ``(height, width)`` that every image is resized to.

    Returns:
        ``(images, captions, bboxes)`` where all three have the same length:
        ``images`` is a float array of resized pixels scaled to [0, 1],
        ``captions`` is a list of strings, and ``bboxes`` is a list of
        ``[x, y, w, h]`` expressed in the pixel frame of the *resized* image.
    """
    target_height, target_width = target_size
    images = []
    captions = []
    bboxes = []
    num_files = 0

    for annotation in annotations:
        image_path = os.path.join(image_dir, annotation['image'])

        # Load at native resolution first: the original size is needed to map
        # the boxes into the resized frame.
        original = load_img(image_path)
        orig_width, orig_height = original.size  # PIL reports (width, height)

        # resample=0 is PIL's NEAREST filter, the same filter load_img applies
        # by default when it is given a target_size.
        resized = original.resize((target_width, target_height), resample=0)
        image = img_to_array(resized) / 255.0
        num_files += 1

        scale_x = target_width / orig_width
        scale_y = target_height / orig_height

        for ann in annotation['annotations']:
            x, y, w, h = ann['bbox']
            captions.append(ann['caption'])
            # Resizing stretches the picture, so the box must be stretched by
            # the same factors to keep covering the same object.
            bboxes.append([x * scale_x, y * scale_y, w * scale_x, h * scale_y])
            # One image entry per region keeps the three outputs index-aligned.
            # The same array object is reused here; np.array() below copies it
            # once per region, so memory grows with the number of regions.
            images.append(image)

    print(f"Loaded {num_files} images.")
    return np.array(images), captions, bboxes

# Build the image array, caption list and bbox list consumed by the later cells.
images, captions, bboxes = load_data(annotations=annotations, image_dir=image_dir)


In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit a word-level vocabulary on the captions. Only the 10000 most frequent
# words keep their own index; everything else maps to the "<UNK>" token.
tokenizer = Tokenizer(oov_token="<UNK>", num_words=10000)
tokenizer.fit_on_texts(captions)

# Integer-encode each caption, then right-pad to the longest one.
sequences = tokenizer.texts_to_sequences(captions)
padded_sequences = pad_sequences(sequences, padding='post')

print(f"Tokenized {len(captions)} captions into sequences.")

# Normalize bounding boxes
def normalize_bbox(bbox, image_shape):
    """Express an ``[x, y, w, h]`` box as fractions of the image size.

    Args:
        bbox: Four numbers ``x, y, w, h``.
        image_shape: Three-element shape ``(height, width, channels)``.

    Returns:
        ``[x / width, y / height, w / width, h / height]``.
    """
    img_h, img_w, _channels = image_shape
    left, top, box_w, box_h = bbox
    # Horizontal quantities are divided by the width, vertical ones by the height.
    divisors = (img_w, img_h, img_w, img_h)
    return [value / divisor for value, divisor in zip((left, top, box_w, box_h), divisors)]

# Every box is divided by the shape of the first loaded image.
# NOTE(review): this only yields values in [0, 1] if `bboxes` are expressed in
# the same pixel frame as the arrays in `images` -- confirm against the loader.
normalized_bboxes = [
    normalize_bbox(raw_bbox, images[0].shape)
    for raw_bbox in bboxes
]
print(f"Normalized {len(normalized_bboxes)} bounding boxes.")


In [18]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Concatenate, Flatten

# Image feature extractor: an ImageNet-initialised ResNet50 without its
# classification head, flattened and projected to a 256-d vector.
# NOTE(review): no layer of `base_model` is frozen in this cell, so the whole
# backbone is trained together with the new layers -- confirm this is intended.
# NOTE(review): the ImageNet weights are normally paired with
# `tensorflow.keras.applications.resnet50.preprocess_input`; confirm that the
# input scaling used by the loading cell is compatible.
image_input = Input(shape=(224, 224, 3))
base_model = ResNet50(include_top=False, weights='imagenet', input_tensor=image_input)
x = Flatten()(base_model.output)
image_features = Dense(256, activation='relu')(x)

# Caption processor: variable-length token ids -> 256-d embeddings -> the final
# LSTM state. `input_dim=10000` has to cover every token id the tokenizer can
# emit, and `mask_zero=True` makes id 0 (the padding value) be ignored by the LSTM.
caption_input = Input(shape=(None,))
embedding = Embedding(input_dim=10000, output_dim=256, mask_zero=True)(caption_input)
lstm_out = LSTM(256)(embedding)

# Bounding box predictor: image and caption vectors are concatenated and mapped
# to four values. The sigmoid keeps each output inside (0, 1), so the regression
# targets must be normalised to that range as well.
combined = Concatenate()([image_features, lstm_out])
bbox_output = Dense(4, activation='sigmoid')(combined)

# Define the model: two inputs (image, caption tokens), one 4-value output,
# trained with mean squared error.
model = Model(inputs=[image_input, caption_input], outputs=bbox_output)
model.compile(optimizer='adam', loss='mse')

print("Model Summary:")
model.summary()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 0us/step


In [None]:
# Prepare the inputs
def prepare_inputs(images, sequences, normalized_bboxes, max_seq_length):
    """Convert the three sample collections into model-ready arrays.

    Args:
        images: Sequence of image arrays.
        sequences: Token-id lists, one per caption.
        normalized_bboxes: Sequence of bounding-box targets.
        max_seq_length: Length every token sequence is padded (at the end) to.

    Returns:
        ``(images_input, sequences_input, bboxes_output)``.
    """
    return (
        np.array(images),
        pad_sequences(sequences, padding='post', maxlen=max_seq_length),
        np.array(normalized_bboxes),
    )

# The longest tokenised caption determines the padded sequence width.
max_seq_length = max(map(len, sequences))
images_input, sequences_input, bboxes_output = prepare_inputs(
    images, sequences, normalized_bboxes, max_seq_length
)

print(f"Prepared inputs: images_input shape = {images_input.shape}, sequences_input shape = {sequences_input.shape}, bboxes_output shape = {bboxes_output.shape}")

# First training pass: 10 epochs, with 20% of the samples held out for validation.
model.fit(
    x=[images_input, sequences_input],
    y=bboxes_output,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)


In [None]:
# Further fine-tuning can be done by adjusting learning rates, using callbacks, etc.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Both callbacks use their default monitor (`val_loss`): stop after 3 epochs
# without improvement and restore the best weights; divide the learning rate by
# 10 after 2 stagnant epochs.
callbacks = [
    EarlyStopping(patience=3, restore_best_weights=True),
    ReduceLROnPlateau(factor=0.1, patience=2)
]

# Same fraction as the earlier training cell, so the same samples stay held out.
validation_split = 0.2

history = model.fit([images_input, sequences_input], bboxes_output, epochs=50, batch_size=32, validation_split=validation_split, callbacks=callbacks)

print(f"Training completed. History: {history.history}")

# Evaluate the model on the held-out samples only. Keras takes the validation
# set from the *last* `validation_split` fraction of the arrays (before
# shuffling), so the same tail slice is rebuilt here; evaluating on the full
# arrays would mix in training samples and under-report the loss.
val_start = int(len(bboxes_output) * (1.0 - validation_split))
val_loss = model.evaluate(
    [images_input[val_start:], sequences_input[val_start:]],
    bboxes_output[val_start:],
)
print(f"Validation Loss: {val_loss}")
