In [None]:
from datasets import load_dataset
from PIL import Image
import os
from tqdm import tqdm
from itertools import islice

In [None]:
# === Config ===
output_dir = "pickapic_pair_subset"  # root folder that receives the exported images
num_samples = 100  # how many records (image pairs) to take from the stream

# One sub-folder per side of the pair: <output_dir>/image_0 and <output_dir>/image_1.
for side in (0, 1):
    side_dir = os.path.join(output_dir, f"image_{side}")
    os.makedirs(side_dir, exist_ok=True)  # no error if the folder already exists

# Open the train split with streaming=True and cap it at the first num_samples records.
print("⏬ Streaming the dataset...")
dataset = load_dataset(
    "yuvalkirstain/pickapic_v2",
    split="train",
    streaming=True,
)
# NOTE: islice returns a one-shot iterator; once consumed it yields nothing,
# so re-run this cell before iterating over `subset` a second time.
subset = islice(dataset, num_samples)

In [None]:
import io  # BytesIO gives PIL a file-like view over the in-memory image bytes

# Export both images of every sample, read from the 'jpg_0' and 'jpg_1' fields,
# into <output_dir>/image_0 and <output_dir>/image_1 respectively.
print(f"💾 Saving {num_samples} samples with both jpg images per sample...")
progress = tqdm(enumerate(subset), total=num_samples)
for idx, sample in progress:
    for image_idx in (0, 1):
        raw_bytes = sample[f"jpg_{image_idx}"]
        # Decode the bytes and convert to RGB before writing the file.
        image = Image.open(io.BytesIO(raw_bytes)).convert("RGB")

        # File name carries both the sample index and the side of the pair.
        file_name = f"sample_{idx}_{image_idx}.jpg"
        target_path = os.path.join(output_dir, f"image_{image_idx}", file_name)
        image.save(target_path)

print("Done!")
