In [None]:
from pathlib import Path
import pandas as pd
from PIL import Image
import torch
from torch.utils.data import DataLoader, random_split, Dataset
from torchvision import datasets, transforms

# Filesystem layout: inputs live under data/, generated files under processed/.
ROOT = Path("data")
TRAIN_DIR = ROOT / "train"
val_dir = "data/HoldoutSet01"
OUT = Path("processed")
OUT.mkdir(exist_ok=True)  # fine to re-run: an existing directory is accepted

# ImageNet-style preprocessing constants (per-channel mean / std, output size).
IMG_SIZE = 224
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

# Training pipeline: random resized crop + random horizontal flip for
# augmentation, then tensor conversion and per-channel normalization.
train_tfms = transforms.Compose(
    [
        transforms.RandomResizedCrop(size=IMG_SIZE, scale=(0.8, 1.0)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
)

# Evaluation pipeline: no randomness -- resize to 115% of IMG_SIZE (truncated
# to an int), take the IMG_SIZE center crop, then convert and normalize.
eval_tfms = transforms.Compose(
    [
        transforms.Resize(size=int(IMG_SIZE * 1.15)),
        transforms.CenterCrop(size=IMG_SIZE),
        transforms.ToTensor(),
        transforms.Normalize(mean=mean, std=std),
    ]
)

# 1) ImageFolder dataset rooted at TRAIN_DIR (class folders Alex/ and Kelly/),
#    with the training transforms applied to every image it yields.
full = datasets.ImageFolder(root=TRAIN_DIR, transform=train_tfms)
print("Classes:", full.classes, "| class_to_idx:", full.class_to_idx)


# 2) Dataloaders
train_dl = DataLoader(
    dataset=full,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)

# 4) Show a batch: pull the first batch and print its shape and leading labels.
xb, yb = next(iter(train_dl))
labels_preview = yb[:10].tolist()
print("Batch images:", xb.shape, "Batch labels (first 10):", labels_preview)

# 5) Save a manifest CSV
def subset_to_rows(subset, split):
    """Build manifest rows for the samples selected by ``subset``.

    ``subset`` may be an index-restricted view exposing ``.dataset`` and
    ``.indices``, several such views nested inside each other, or a plain
    dataset exposing ``.samples`` and ``.classes`` directly. A plain dataset
    has no ``.dataset`` attribute, so it is treated as "all samples" instead
    of raising ``AttributeError``.

    Args:
        subset: Dataset, or subset view of a dataset, to describe.
        split: Value stored under the ``"split"`` key of every row.

    Returns:
        A list with one dict per selected sample, in index order, with the
        keys ``"path"``, ``"label"``, ``"label_id"`` and ``"split"``.
    """
    base = subset
    indices = None
    # Peel off subset views until we reach the object that owns ``samples``,
    # translating the selected indices through each layer on the way down.
    while hasattr(base, "dataset") and hasattr(base, "indices"):
        layer = base.indices
        if indices is None:
            indices = list(layer)
        else:
            indices = [layer[i] for i in indices]
        base = base.dataset

    if indices is None:
        # Plain dataset: nothing restricts the selection, so list everything.
        indices = range(len(base.samples))

    samples = base.samples
    classes = base.classes
    return [
        {
            "path": path,
            "label": classes[label_id],
            "label_id": label_id,
            "split": split,
        }
        for path, label_id in (samples[i] for i in indices)
    ]

import itertools

# Flatten the per-split row lists into one list (only "train" exists so far).
rows = list(itertools.chain.from_iterable([subset_to_rows(full, "train")]))

# One manifest row per image, ordered by file path, with a fresh 0..n-1 index.
df = pd.DataFrame(rows)
df = df.sort_values("path")
df = df.reset_index(drop=True)

manifest_path = OUT / "manifest.csv"
df.to_csv(manifest_path, index=False)
print("Saved", manifest_path)

# Quick sanity check: number of rows per split and per class label.
print(df["split"].value_counts(), "\n", df["label"].value_counts())


Classes: ['Alex', 'Kelly'] | class_to_idx: {'Alex': 0, 'Kelly': 1}
Batch images: torch.Size([32, 3, 224, 224]) Batch labels (first 10): [0, 1, 0, 1, 0, 0, 0, 0, 1, 1]
Batch images: torch.Size([32, 3, 224, 224]) Batch labels (first 10): [0, 1, 0, 1, 0, 0, 0, 0, 1, 1]


AttributeError: 'ImageFolder' object has no attribute 'dataset'