In [21]:
import json
import shutil
from pathlib import Path
from datasets import load_dataset

# Names passed, one at a time, as the second argument to
# load_dataset('MMMU/MMMU', ...) by the export loop. Kept in alphabetical
# order; that order is also the order in which rows land in the JSONL file.
subjects = [
    'Accounting', 'Agriculture', 'Architecture_and_Engineering',
    'Art', 'Art_Theory',
    'Basic_Medical_Science', 'Biology',
    'Chemistry', 'Clinical_Medicine', 'Computer_Science',
    'Design', 'Diagnostics_and_Laboratory_Medicine',
    'Economics', 'Electronics', 'Energy_and_Power',
    'Finance',
    'Geography',
    'History',
    'Literature',
    'Manage', 'Marketing', 'Materials', 'Math', 'Mechanical_Engineering',
    'Music',
    'Pharmacy', 'Physics', 'Psychology', 'Public_Health',
    'Sociology',
]

# Export layout: <out_dir>/mmmu_validation_all.jsonl holds the rows and
# <out_dir>/images/ holds the image files. Both directories are created up
# front (idempotently) so later writes never hit a missing parent.
out_dir = Path('exports', 'mmmu_validation_all')
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir.joinpath('mmmu_validation_all.jsonl')

images_dir = out_dir.joinpath('images')
images_dir.mkdir(parents=True, exist_ok=True)

def sanitize_id(value: str) -> str:
    """Return *value* made safe to embed in a single file name.

    Both path separators are replaced with underscores: ``/`` and also
    ``\\``, which separates path components on Windows. Without this, an id
    containing either character would point the image file into a nested
    directory that does not exist. All other characters are left untouched.

    Args:
        value: Raw record identifier.

    Returns:
        The identifier with every ``/`` and ``\\`` replaced by ``_``.
    """
    return value.replace('/', '_').replace('\\', '_')

# Export every subject's validation split into one JSONL file. Each row is
# copied to a plain dict, tagged with its subject, and every populated
# image_1..image_7 field is written to images_dir as a PNG and replaced in
# the row by that file's path relative to out_dir.
with out_path.open('w', encoding='utf-8') as f:
    for subject in subjects:
        ds = load_dataset('MMMU/MMMU', subject, split='validation')
        for row in ds:
            row = dict(row)
            row['subject'] = subject
            safe_id = sanitize_id(row['id'])
            for i in range(1, 8):
                key = f'image_{i}'
                img = row.get(key)
                if img is None:
                    # Field absent or slot unused: leave the row untouched
                    # (an existing None value stays None in the output).
                    continue
                if not hasattr(img, 'save'):
                    # Anything without a .save() method cannot be written to
                    # disk here; fail loudly instead of emitting a bad row.
                    raise ValueError(f"Unsupported image type for {row['id']} {key}")
                img_path = images_dir / f'{safe_id}_{key}.png'
                img.save(img_path)
                # as_posix() keeps forward slashes even on Windows, so the
                # JSONL stays readable on any platform.
                row[key] = img_path.relative_to(out_dir).as_posix()

            f.write(json.dumps(row, ensure_ascii=True) + '\n')

print(f'Wrote {out_path}')


Wrote exports/mmmu_validation_all/mmmu_validation_all.jsonl
