# Dataloader Usage Examples

## Flickr30k

In [None]:
from datasets import Flickr30k

# Build the Flickr30k dataset from the captions CSV and the image directory.
flickr_dataset = Flickr30k(
    csv_file='./data/flickr30k/results.csv',
    img_dir='./data/flickr30k/images/',
)

In [None]:
# Fetch the sample at index 1; as the last expression in the cell, its value
# is displayed as the cell output.
flickr_dataset[1]

## COCO

In [None]:
from datasets import COCODataset

In [None]:
# Build the COCO captions dataset from the val2017 annotation file and images.
coco_dataset = COCODataset(
    ann_file='./data/coco/annotations/captions_val2017.json',
    img_dir='./data/coco/val2017',
)

In [None]:
# Fetch the sample at index 1; as the last expression in the cell, its value
# is displayed as the cell output.
coco_dataset[1]

In [None]:
# NOTE(review): `caption` is not assigned in any cell above this one. In the
# visible notebook it is first assigned in the BLIP-2 captioning cell further
# down, so running the cells top to bottom raises NameError here.
# Presumably a leftover from inspecting a COCO sample — confirm or remove.
caption

# Dataloader

In [None]:
import torch
from torch.utils.data import DataLoader
from datasets import Flickr30k
from torchvision import transforms
import multiprocessing

# Preprocessing pipeline handed to the dataset: resize to 224x224, convert to
# a tensor, then normalize each channel with fixed mean/std constants
# (presumably the usual ImageNet statistics — confirm).
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# Dataset backed by the captions CSV and the image directory, with the
# transform above attached.
flickr_dataset = Flickr30k(
    csv_file='./data/flickr30k/results.csv',
    img_dir='./data/flickr30k/images/',
    transform=transform,
)

# Loader settings: fixed batch size, and one worker per CPU core reported by
# the operating system.
batch_size = 32
num_workers = multiprocessing.cpu_count()

# Shuffled loader over the dataset; pinned memory can speed up data transfer
# to the GPU.
data_loader = DataLoader(
    flickr_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
)

# Demonstrate loading data
def process_batch(batch):
    """Print a short summary of one batch.

    Args:
        batch: A two-element iterable unpacked as ``(images, captions)``.
            ``images`` must expose a ``.shape`` attribute; ``captions`` must
            support ``len()`` and indexing.

    Returns:
        None. The summary is written to standard output, followed by a
        50-character dashed separator line.
    """
    imgs, caps = batch
    dims = imgs.shape
    print(f"Batch size: {dims[0]}")
    print(f"Image shape: {dims}")
    print(f"Number of captions: {len(caps)}")
    print(f"First caption: {caps[0]}")
    print("-" * 50)

# Iterate through a few batches: summarize each one with process_batch and
# stop once three batches (indices 0, 1 and 2) have been shown.
for i, batch in enumerate(data_loader):
    process_batch(batch)
    if i == 2:  # Stop after 3 batches
        break

# Report how many worker processes the DataLoader was configured with.
print(f"DataLoader is using {num_workers} workers")

In [None]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
# Load the BLIP-2 processor from the "Salesforce/blip2-opt-2.7b" checkpoint.
# Only the processor is created in this cell; Blip2ForConditionalGeneration is
# imported above but not instantiated here.
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

In [None]:
# Display the processor object as the cell output to inspect it.
processor

In [None]:
# Display element 0 of field 1 of the first sample (presumably the first
# caption of the first image — confirm against the Flickr30k dataset class).
flickr_dataset[0][1][0]

In [None]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image

# Initialize the processor and model from the same pretrained checkpoint
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# Load an example image: field 0 of the first dataset sample.
# NOTE(review): if `flickr_dataset` is still the instance built with
# `transform=` (Resize/ToTensor/Normalize) in the DataLoader cell, this is
# presumably an already-normalized tensor, and the processor would likely
# preprocess it a second time — confirm, or use a dataset built without the
# transform.
image = flickr_dataset[0][0]

# Example text: element 0 of field 1 of the same sample (presumably its first
# caption — confirm against the Flickr30k dataset class).
text = flickr_dataset[0][1][0]

# 1. Encoding a single image
def encode_image(image):
    """Run the module-level ``processor`` on a single image.

    Args:
        image: The image to preprocess; forwarded as ``images=``.

    Returns:
        The processor output, requested as PyTorch tensors
        (``return_tensors="pt"``).
    """
    return processor(images=image, return_tensors="pt")

# Encode the example image and list the keys of the returned processor output.
image_inputs = encode_image(image)
print("Image inputs:", image_inputs.keys())

# 2. Encoding a single text
def encode_text(text):
    """Run the module-level ``processor`` on a single piece of text.

    Args:
        text: The text to tokenize; forwarded as ``text=``.

    Returns:
        The processor output, requested as PyTorch tensors
        (``return_tensors="pt"``).
    """
    return processor(text=text, return_tensors="pt")

# Encode the example text and list the keys of the returned processor output.
text_inputs = encode_text(text)
print("Text inputs:", text_inputs.keys())

# 3. Encoding a single image + text
def encode_image_and_text(image, text):
    """Run the module-level ``processor`` on an image and text together.

    Args:
        image: The image to preprocess; forwarded as ``images=``.
        text: The accompanying text; forwarded as ``text=``.

    Returns:
        The combined processor output, requested as PyTorch tensors
        (``return_tensors="pt"``).
    """
    return processor(images=image, text=text, return_tensors="pt")

# Encode the example image and text in one call and list the returned keys.
image_text_inputs = encode_image_and_text(image, text)
print("Image + Text inputs:", image_text_inputs.keys())

# Decoding examples

# For image captioning (decoding generated ids)
def generate_and_decode_caption(image_inputs):
    """Generate a caption from encoded image inputs and decode it to text.

    Args:
        image_inputs: Mapping of keyword arguments unpacked into
            ``model.generate`` (for example the output of ``encode_image``).

    Returns:
        The first decoded sequence, with special tokens skipped and
        surrounding whitespace stripped.
    """
    token_ids = model.generate(**image_inputs)
    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()

# Caption the example image from its encoded inputs and print the result.
caption = generate_and_decode_caption(image_inputs)
print("Generated caption:", caption)

# Decoding input ids (if you want to see the tokenized text)
def decode_input_ids(input_ids):
    """Decode the first sequence of token ids back into text.

    Args:
        input_ids: Indexable collection of token-id sequences; only element 0
            is decoded.

    Returns:
        The decoded text with special tokens skipped.
    """
    first_sequence = input_ids[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

# Only decode when the processor output actually contains an 'input_ids' entry.
if 'input_ids' in text_inputs:
    decoded_input = decode_input_ids(text_inputs['input_ids'])
    print("Decoded input:", decoded_input)