In [1]:
import json

import torch
import webdataset as wds
from torchvision import transforms

In [None]:
def identity(x):
    """Return ``x`` unchanged.

    Used as the no-op stage for the text component in ``map_tuple`` so that
    only the image component is transformed.
    """
    return x

def collect_all_texts(sample):
    """Extract the image and its four caption slots from one sample.

    Args:
        sample: Mapping for a single sample. Must contain ``"jpg"``;
            ``"txt"`` and ``"json"`` are optional. ``"txt"`` may be ``str``
            or UTF-8 ``bytes``. ``"json"`` may be an already-decoded ``dict``
            or a raw JSON document as ``str``/``bytes``.

    Returns:
        A tuple ``(img, texts)``. ``img`` is ``sample["jpg"]`` untouched.
        ``texts`` is always a 4-element list in the fixed order
        ``[text1, text2, text3, syn_text]``; any missing (or JSON ``null``)
        entry is the empty string, so positions stay stable across samples.

    Raises:
        KeyError: If the sample has no ``"jpg"`` entry.
        json.JSONDecodeError: If a raw ``"json"`` payload is not valid JSON.
    """
    img = sample["jpg"]

    # 1. text1 comes from the plain-text entry ("" keeps the slot when absent).
    if "txt" in sample:
        txt_content = sample["txt"]
        if isinstance(txt_content, bytes):
            txt_content = txt_content.decode("utf-8")
        texts = [str(txt_content)]
    else:
        texts = [""]

    # 2. text2, text3 and syn_text come from the JSON metadata entry.
    json_content = sample["json"] if "json" in sample else {}
    if isinstance(json_content, (bytes, str)):
        # A raw payload has to be parsed: merely decoding bytes to str leaves
        # an object without ``.get`` and would raise AttributeError below.
        json_content = json.loads(json_content)

    # Always emit all three slots so every sample yields exactly four texts,
    # even when the JSON entry (or one of its keys) is missing.
    for key in ("text2", "text3", "syn_text"):
        value = json_content.get(key, "")
        texts.append("" if value is None else value)

    return img, texts

# Shard pattern in brace notation: shards 00000 through 00389.
url = "RealSyn100M-webdataset/{00000..00389}.tar"

batch_size = 512

# Image pipeline: convert to a tensor first, then resize and center-crop
# so every image ends up 224x224.
preproc = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize(224),
        transforms.CenterCrop(224),
    ]
)

# Sample-level pipeline: shuffle, decode, pull out (image, texts), preprocess
# only the image component, then batch inside the dataset itself.
# partial=False is passed so an incomplete final batch is not emitted
# (per the webdataset `batched` option -- confirm against the installed version).
dataset = (
    wds.WebDataset(url)
    .shuffle(1000)
    .decode("rgb")
    .map(collect_all_texts)
    .map_tuple(preproc, identity)
    .batched(batch_size, partial=False)
)

# batch_size=None: the dataset already yields full batches, so the loader
# must pass them through without batching again.
loader = wds.WebLoader(dataset, num_workers=2, batch_size=None)

In [3]:
# Peek at the first batch only: report sizes and dump every text of sample 0.
for step, (img, texts) in enumerate(loader):
    first_texts = texts[0] if texts else []
    print(f"Batch {step}:")
    print(f"  Image count: {len(img)}")
    print(f"  Text list count: {len(texts)}")
    print(f"  First sample text count: {len(first_texts)}")
    if texts:
        print("  All texts of the first sample:")
        for idx, caption in enumerate(first_texts):
            print(f"    text[{idx}]: {caption}...")
    break

Batch 0:
  Image count: 512
  Text list count: 512
  First sample text count: 4
  All texts of the first sample:
    text[0]: and while many things have changed over the years — like the name ( it 's just epcot now ) and the attractions ( good - bye world of motion , hello test track )— it still remains the coolest " edu - tainment " theme park you 'll ever find ....
    text[1]: future world celebrates technology and innovation with a combination of rides , shows , and interactive displays ....
    text[2]: while epcot center attractions such as horizons , world of motion , and communicore have been lost over the years , there are references to them throughout the parks ....
    text[3]: although the name has changed to epcot and some attractions have been replaced, such as world of motion and the introduction of test track, it still remains the coolest "edu-tainment" theme park you'll ever find....


In [4]:
banner = "=" * 60
print(banner)
print("Dataset information")
print(banner)

# Positional labels for the entries of each per-sample text list.
text_names = ["text1", "text2", "text3", "syn_text"]

# Summarise the first three batches (steps 0, 1 and 2), then stop.
for step, (img, texts) in enumerate(loader):
    print(f"\nBatch {step}:")
    print(f"  image shape: {img.shape}")  # [batch_size, 3, 224, 224]
    print(f"  text lists count: {len(texts)}")
    print("  first sample:")
    print(f"    - image shape: {img[0].shape}")
    print(f"    - number of texts: {len(texts[0])}")
    for idx, caption in enumerate(texts[0]):
        # Fall back to a generic label if there are more texts than names.
        if idx < len(text_names):
            name = text_names[idx]
        else:
            name = f"text[{idx}]"
        # Keep the printout compact: show at most 100 characters per text.
        if len(caption) > 100:
            preview = caption[:100] + "..."
        else:
            preview = caption
        print(f"    - {name}: {preview}")

    if step >= 2:
        break

print("\n" + banner)

Dataset information

Batch 0:
  image shape: torch.Size([512, 3, 224, 224])
  text lists count: 512
  first sample:
    - image shape: torch.Size([3, 224, 224])
    - number of texts: 4
    - text1: there is also a figure of james the apostle , a work created by the villinger master , metzger .
    - text2: the arch and decoration reflect the romanesque character of the church exterior , while the model fo...
    - text3: several of these figures have reached our days after living an interesting journey that caused the s...
    - syn_text: there is also a blue statue of james the apostle, a work created by the villinger master, metzger, w...

Batch 1:
  image shape: torch.Size([512, 3, 224, 224])
  text lists count: 512
  first sample:
    - image shape: torch.Size([3, 224, 224])
    - number of texts: 4
    - text1: wertheim draws a parallel between the emergence of outsider artists in the early twentieth century a...
    - text2: sign up for one of four arrival times to be a part of 