In [None]:
import io
from pathlib import Path
import webdataset as wds
from PIL import Image
from datasets import load_dataset
import pandas as pd
from collections import Counter
import random

## DALL-E 3

In [18]:
# Download a sample of DALL-E 3 images from the WebDataset shards and split
# them into train/val folders by download order.
N_TOTAL = 1000       # images to save in total
N_TRAIN = 700        # the first N_TRAIN saved images go to train, the rest to val
MAX_SKIP_LOGS = 5    # cap on per-sample failure messages so the output stays readable

# Create directories
Path("data/train/dalle3").mkdir(parents=True, exist_ok=True)
Path("data/val/dalle3").mkdir(parents=True, exist_ok=True)

# https://huggingface.co/datasets/CaptionEmporium/dalle3-llama3.2-11b
WDS_URLS = "https://huggingface.co/datasets/CaptionEmporium/dalle3-llama3.2-11b/resolve/main/data/wds/{000000..000137}.tar"
ds = wds.WebDataset(WDS_URLS)

print("Downloading images...")
count = 0
skipped = 0

for row in ds:
    if count >= N_TOTAL:
        break

    try:
        # Each sample stores its image bytes under the "jxl" key.
        # NOTE(review): presumably JPEG XL, which needs a Pillow JXL plugin to
        # decode -- if it is missing, every sample will fail here. Confirm.
        image_pil = Image.open(io.BytesIO(row["jxl"]))

        # Determine folder from the number of images saved so far
        folder = "train" if count < N_TRAIN else "val"

        # Save as JPG
        image_pil.convert('RGB').save(f"data/{folder}/dalle3/img_{count:04d}.jpg")
    except Exception as e:
        # Best-effort: a bad sample must not abort the download, but surface the
        # first few errors so a systemic failure is not silently hidden.
        skipped += 1
        if skipped <= MAX_SKIP_LOGS:
            print(f"Skipping sample ({skipped} skipped so far): {e!r}")
        continue

    count += 1
    if count % 100 == 0:
        print(f"Downloaded {count}/{N_TOTAL}")

# Report the split that was actually written rather than the intended one;
# the two differ whenever the stream ends before N_TOTAL images were saved.
train_count = min(count, N_TRAIN)
val_count = count - train_count
print(f"Done! Downloaded {count} images ({train_count} train, {val_count} val, skipped {skipped})")



Downloading images...
Downloaded 100/1000
Downloaded 200/1000
Downloaded 300/1000
Downloaded 400/1000
Downloaded 500/1000
Downloaded 600/1000
Downloaded 700/1000
Downloaded 800/1000
Downloaded 900/1000
Downloaded 1000/1000
Done! Downloaded 1000 images (700 train, 300 val, skipped 0)


## Midjourney

In [None]:
# Extract a sample of Midjourney images from a local parquet shard and split
# them into train/val folders by row order.
N_TOTAL = 1000       # images to save in total
N_TRAIN = 700        # the first N_TRAIN saved images go to train, the rest to val
MAX_SKIP_LOGS = 5    # cap on per-row failure messages so the output stays readable

# Create directories
Path("data/train/midjourney").mkdir(parents=True, exist_ok=True)
Path("data/val/midjourney").mkdir(parents=True, exist_ok=True)

# Load parquet directly (faster, more stable than streaming)
# https://huggingface.co/datasets/ava-space/MidJourney
# The shard is read from the current working directory; it must already be there.
df = pd.read_parquet("train-00000-of-00002.parquet")

print("Downloading images...")
count = 0
skipped = 0

# Only the "image" column is needed, so iterate it directly instead of
# materialising a Series per row with iterrows().
for image_record in df["image"]:
    if count >= N_TOTAL:
        break

    try:
        # Extract bytes and convert to PIL Image
        img = Image.open(io.BytesIO(image_record["bytes"]))

        folder = "train" if count < N_TRAIN else "val"
        img.convert('RGB').save(f"data/{folder}/midjourney/img_{count:04d}.jpg")
    except Exception as e:
        # Best-effort: skip rows that cannot be decoded or saved, but count them
        # and show the first few errors instead of discarding them silently.
        skipped += 1
        if skipped <= MAX_SKIP_LOGS:
            print(f"Skipping row ({skipped} skipped so far): {e!r}")
        continue

    count += 1
    if count % 100 == 0:
        print(f"Downloaded {count}/{N_TOTAL}")

print(f"Done! {count} images (skipped {skipped})")

Downloading images...
Downloaded 100/1000
Downloaded 200/1000
Downloaded 300/1000
Downloaded 400/1000
Downloaded 500/1000
Downloaded 600/1000
Downloaded 700/1000
Downloaded 800/1000
Downloaded 900/1000
Downloaded 1000/1000
Done! 1000 images


## Authentic

In [None]:
# Build a genre-balanced sample of WikiArt paintings ("authentic" class) and
# split it into train/val folders.
#
# NOTE: load_dataset() without streaming prepares the whole dataset locally
# before any image can be read; a previous run of this cell failed with
# "Not enough disk space. Needed: 66.20 GiB". Make sure the cache location has
# that much free space before running.
N_TOTAL = 1000   # images to save in total
N_TRAIN = 700    # size of the train split; the remainder goes to val
SEED = 42        # fixed seed so the sampled subset is reproducible
rng = random.Random(SEED)  # private generator: does not disturb the global random state

# Create directories
Path("data/train/authentic").mkdir(parents=True, exist_ok=True)
Path("data/val/authentic").mkdir(parents=True, exist_ok=True)

# Load dataset
ds = load_dataset("huggan/wikiart", split="train")

# Get genres directly from the dataset
all_genres = ds['genre']
genre_counts = Counter(all_genres)
unique_genres = list(genre_counts.keys())

print(f"Found {len(unique_genres)} genres")

# Group row indices by genre in a single pass instead of rescanning the whole
# genre column once per genre.
indices_by_genre = {genre: [] for genre in unique_genres}
for i, g in enumerate(all_genres):
    indices_by_genre[g].append(i)

# Sample evenly
images_per_genre = N_TOTAL // len(unique_genres)
sampled_indices = []

for genre in unique_genres:
    genre_indices = indices_by_genre[genre]

    # A genre with fewer images than the quota contributes everything it has
    n_samples = min(images_per_genre, len(genre_indices))
    sampled_indices.extend(rng.sample(genre_indices, n_samples))

# The even quota leaves a shortfall whenever N_TOTAL is not divisible by the
# number of genres or a genre is smaller than its quota. Top up from the rows
# not yet chosen so the val split really has N_TOTAL - N_TRAIN images.
shortfall = N_TOTAL - len(sampled_indices)
if shortfall > 0:
    already_chosen = set(sampled_indices)
    leftovers = [
        i
        for genre_indices in indices_by_genre.values()
        for i in genre_indices
        if i not in already_chosen
    ]
    sampled_indices.extend(rng.sample(leftovers, min(shortfall, len(leftovers))))

# Shuffle and limit to N_TOTAL
rng.shuffle(sampled_indices)
sampled_indices = sampled_indices[:N_TOTAL]

# Split 70/30 train/val. The split is a random cut of the shuffled sample, not
# a per-genre stratified split, so genre balance across the splits is approximate.
train_indices = sampled_indices[:N_TRAIN]
val_indices = sampled_indices[N_TRAIN:N_TOTAL]

print("Saving images...")

# Save train images
for i, idx in enumerate(train_indices):
    img = ds[idx]['image']
    img.convert('RGB').save(f"data/train/authentic/img_{i:04d}.jpg")
    if (i + 1) % 100 == 0:
        print(f"Train: {i + 1}/{len(train_indices)}")

# Save val images
for i, idx in enumerate(val_indices):
    img = ds[idx]['image']
    img.convert('RGB').save(f"data/val/authentic/img_{i:04d}.jpg")
    if (i + 1) % 50 == 0:
        print(f"Val: {i + 1}/{len(val_indices)}")

print("Done! Genre distribution maintained across train/val splits")

OSError: Not enough disk space. Needed: 66.20 GiB (download: 31.42 GiB, generated: 34.79 GiB, post-processed: Unknown size)