In [19]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import PIL
import os

# Load and process subset

In [9]:
# Index the processed celebrity images; ImageFolder treats each sub-directory as one class.
data = datasets.ImageFolder(root="CelebDataProcessed")

In [16]:
labels = data.classes
# Tally how many samples belong to each class index
label_counts = {}
for _, class_idx in data.samples:
    if class_idx in label_counts:
        label_counts[class_idx] += 1
    else:
        label_counts[class_idx] = 1
# Rank class indices from most to least frequent, then keep the names of the top 10
ranked_indices = sorted(label_counts, key=lambda idx: label_counts[idx], reverse=True)
subsetLabels = [labels[idx] for idx in ranked_indices[:10]]

In [20]:
# Create a directory containing only the top 10 most common labels.
# Images are hard-linked (os.link) into the new tree rather than copied.
# NOTE(review): hard links generally cannot span filesystems - confirm both
# directories live on the same volume.
os.makedirs("CelebDataProcessedSubset", exist_ok=True)
for label in subsetLabels:
    subset_dir = os.path.join("CelebDataProcessedSubset", label)
    os.makedirs(subset_dir, exist_ok=True)
    # os.walk also descends into nested folders; every file it finds is placed
    # directly (flattened) into this label's subset directory.
    for root, _, files in os.walk(os.path.join("CelebDataProcessed", label)):
        for file in files:
            target = os.path.join(subset_dir, file)
            # os.link raises FileExistsError when the target already exists,
            # so skip files linked by a previous run to keep this cell re-runnable.
            if not os.path.lexists(target):
                os.link(os.path.join(root, file), target)

# Split subset

In [24]:
# Split the subset into two training sets, a validation set and a test set
# (40% / 40% / 10% / 10% of the samples).
subset = datasets.ImageFolder("CelebDataProcessedSubset")
# Seed the split so that re-running the notebook reproduces the same partition.
SPLIT_SEED = 42
train1, train2, val, test = torch.utils.data.random_split(
    subset,
    [0.4, 0.4, 0.1, 0.1],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Save the datasets to disk


In [40]:
def _class_names(dataset):
    """Return the list of class names that ``dataset``'s integer labels index into.

    Looks for a ``classes`` attribute on the dataset itself, then follows
    ``.dataset`` links (the attribute ``torch.utils.data.Subset`` uses to
    reference the dataset it wraps). Falls back to the notebook-level
    ``subset`` when nothing in that chain exposes ``classes``.
    """
    node = dataset
    while node is not None:
        names = getattr(node, "classes", None)
        if names is not None:
            return names
        node = getattr(node, "dataset", None)
    return subset.classes


def saveSubset(dataset, path):
    """Write every image of ``dataset`` to ``path/<class name>/<i>.jpg``.

    Args:
        dataset: iterable of ``(img, label)`` pairs, where ``img`` provides a
            ``save(filename)`` method and ``label`` is an integer index into
            the dataset's class-name list.
        path: root output directory; one sub-directory per class is created
            inside it as needed.

    ``i`` is the position of the sample within ``dataset``. Files left in
    ``path`` by earlier calls are not removed.
    """
    class_names = _class_names(dataset)
    made_dirs = set()  # class directories already created during this call
    for i, (img, label) in enumerate(dataset):
        class_dir = os.path.join(path, class_names[label])
        if class_dir not in made_dirs:
            os.makedirs(class_dir, exist_ok=True)
            made_dirs.add(class_dir)
        img.save(os.path.join(class_dir, f"{i}.jpg"))

In [42]:
# Persist each split under its own output directory, in the same order as before.
for split_data, out_dir in (
    (train1, "CelebTrainA"),
    (train2, "CelebTrainB"),
    (val, "CelebVal"),
    (test, "CelebTest"),
):
    saveSubset(split_data, out_dir)