# Dataloader Usage Examples

## Flickr30k

In [1]:
from datasets import Flickr30k

flickr_dataset = Flickr30k(csv_file='./data/flickr30k/results.csv', 
                          img_dir='./data/flickr30k/images/')

In [2]:
flickr_dataset[1]

(<PIL.Image.Image image mode=RGB size=500x374>,
 ['Several men in hard hats are operating a giant pulley system .',
  'Workers look down from up above on a piece of equipment .',
  'Two men working on a machine wearing hard hats .',
  'Four men on top of a tall structure .',
  'Three men on a large rig .'])

## COCO

In [1]:
from datasets import COCODataset

In [2]:
coco_dataset = COCODataset(ann_file='./data/coco/annotations/captions_val2017.json',
                           img_dir='./data/coco/val2017')

loading annotations into memory...
Done (t=0.05s)
creating index...
index created!


In [3]:
coco_dataset[1]

(<PIL.Image.Image image mode=RGB size=352x230>,
 ['The dining table near the kitchen has a bowl of fruit on it.',
  'A small kitchen has various appliances and a table.',
  'The kitchen is clean and ready for us to see.',
  'A kitchen and dining area decorated in white.',
  'A kitchen that has a bowl of fruit on the table.'])

In [None]:
caption

# Dataloader

In [1]:
import torch
from torch.utils.data import DataLoader
from datasets import Flickr30k
from torchvision import transforms
import multiprocessing

# Define your transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create the dataset
flickr_dataset = Flickr30k(csv_file='./data/flickr30k/results.csv', 
                           img_dir='./data/flickr30k/images/',
                           transform=transform)

# Set up DataLoader parameters
batch_size = 32
num_workers = multiprocessing.cpu_count()  # Use all available CPU cores

# Create the DataLoader
data_loader = DataLoader(
    flickr_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True  # This can speed up data transfer to GPU
)

# Demonstrate loading data
def process_batch(batch):
    images, captions = batch
    print(f"Batch size: {images.shape[0]}")
    print(f"Image shape: {images.shape}")
    print(f"Number of captions: {len(captions)}")
    print(f"First caption: {captions[0]}")
    print("-" * 50)

# Iterate through a few batches
for i, batch in enumerate(data_loader):
    process_batch(batch)
    if i == 2:  # Stop after 3 batches
        break
Steve Keen
print(f"DataLoader is using {num_workers} workers")

Batch size: 32
Image shape: torch.Size([32, 3, 224, 224])
Number of captions: 5
First caption: ['Adorable blond-hair little girl posing for her Daddy while he takes her picture .', 'Three dogs pulling a man in a brown jumpsuit and a baby in a blue snowsuit , on a sled in a snowy forest .', "Several young children dressed in winter clothes play in the street outside of Stampen 's restaurant with the streetlights on while several adults are standing around .", 'Male surfer stands on a white surfboard in white shorts and a blue shirt turns to come in as the brown waves splash around him', 'A barefoot boy in shorts and a t-shirt jumps in muddy grass .', 'The child is wearing a blue hat and green jacket while walking through snow .', 'A small group of soccer players are on a soccer field standing around while a player is looked at for injuries .', 'The man in a peach-colored shirt is carrying items on a huge cart .', 'Man in a blue number 78 sports jersey walking down city street past a whi

In [2]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
# Load BLIP-2 model and processor
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")



In [3]:
processor

Blip2Processor:
- image_processor: BlipImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "BlipImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Blip2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

- tokenizer: GPT2TokenizerFast(name_or_path='Salesforce/blip2-opt-2.7b', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=Fa

In [13]:
flickr_dataset[0][1][0]

'Two young guys with shaggy hair look at their hands while hanging out in the yard .'

In [5]:
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image

# Initialize the processor and model
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# Load an example image
image = flickr_dataset[0][0]

# Example text
text = flickr_dataset[0][1][0]

# 1. Encoding a single image
def encode_image(image):
    inputs = processor(images=image, return_tensors="pt")
    return inputs

image_inputs = encode_image(image)
print("Image inputs:", image_inputs.keys())

# 2. Encoding a single text
def encode_text(text):
    inputs = processor(text=text, return_tensors="pt")
    return inputs

text_inputs = encode_text(text)
print("Text inputs:", text_inputs.keys())

# 3. Encoding a single image + text
def encode_image_and_text(image, text):
    inputs = processor(images=image, text=text, return_tensors="pt")
    return inputs

image_text_inputs = encode_image_and_text(image, text)
print("Image + Text inputs:", image_text_inputs.keys())

# Decoding examples

# For image captioning (decoding generated ids)
def generate_and_decode_caption(image_inputs):
    generated_ids = model.generate(**image_inputs)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    return generated_text

caption = generate_and_decode_caption(image_inputs)
print("Generated caption:", caption)

# Decoding input ids (if you want to see the tokenized text)
def decode_input_ids(input_ids):
    decoded_text = processor.decode(input_ids[0], skip_special_tokens=True)
    return decoded_text

if 'input_ids' in text_inputs:
    decoded_input = decode_input_ids(text_inputs['input_ids'])
    print("Decoded input:", decoded_input)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Image inputs: dict_keys(['pixel_values'])
Text inputs: dict_keys(['input_ids', 'attention_mask'])
Image + Text inputs: dict_keys(['pixel_values', 'input_ids', 'attention_mask'])




Generated caption: two men skateboarding in a garden
Decoded input: Two young guys with shaggy hair look at their hands while hanging out in the yard.
