# DeepFashion2 Dataset Subset Creation

This notebook creates stratified train and validation subsets of the DeepFashion2 dataset for our experiments.


In [1]:
import os
import json
import random
import shutil
import csv
from collections import defaultdict

## Set Up Project Structure


In [2]:
# Source locations of the original DeepFashion2 images and annotations
TRAIN_IMG_DIR = "../data/original/train/image"
TRAIN_ANNO_DIR = "../data/original/train/annos"
VAL_IMG_DIR = "../data/original/validation/image"
VAL_ANNO_DIR = "../data/original/validation/annos"

# Destination locations for the sampled train/val subsets
SUBSET_TRAIN_IMG_DIR = "../data/subset/train/images"
SUBSET_TRAIN_ANNO_DIR = "../data/subset/train/annotations"
SUBSET_VAL_IMG_DIR = "../data/subset/val/images"
SUBSET_VAL_ANNO_DIR = "../data/subset/val/annotations"

# Upper bound on the number of images drawn per category for each split
DESIRED_TRAIN_PER_CAT = 500
DESIRED_VAL_PER_CAT = 100

# Seed Python's global RNG, which the sampling step's random.shuffle draws from
random.seed(42)

# Category ID -> category name; names are listed in ID order and IDs start at 1
category_map = dict(enumerate(
    (
        "short_sleeve_top",
        "long_sleeve_top",
        "short_sleeve_outwear",
        "long_sleeve_outwear",
        "vest",
        "sling",
        "shorts",
        "trousers",
        "skirt",
        "short_sleeve_dress",
        "long_sleeve_dress",
        "vest_dress",
        "sling_dress",
    ),
    start=1,
))

# Make sure every output folder exists before any file is copied into it
for subset_dir in (
    SUBSET_TRAIN_IMG_DIR,
    SUBSET_TRAIN_ANNO_DIR,
    SUBSET_VAL_IMG_DIR,
    SUBSET_VAL_ANNO_DIR,
):
    os.makedirs(subset_dir, exist_ok=True)

## Create Stratified Subset

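Create a balanced subset covering all 13 clothing categories. Each image is assigned to the category of its largest bounding box, and up to a fixed number of images is sampled per category; a category with fewer images than requested contributes all of the images it has.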


In [3]:
def find_dominant_item(annotation_data):
    """Return the category ID of the item with the largest bounding-box area.

    Items are read from the keys ``item1``, ``item2``, ... in order, and the
    scan stops at the first index that is missing. Every item must provide
    ``category_id`` and a four-element ``bounding_box`` that is unpacked as
    ``x1, y1, x2, y2``.

    Args:
        annotation_data: Parsed annotation mapping for a single image.

    Returns:
        The ``category_id`` of the item whose box has the largest positive
        area (the earliest item wins ties), or ``None`` when no item has a
        box with positive area.

    Raises:
        KeyError: If an item lacks ``category_id`` or ``bounding_box``.
        ValueError: If a ``bounding_box`` does not hold exactly four values.
    """
    largest_area = 0
    dominant_cat_id = None

    index = 1
    while f"item{index}" in annotation_data:
        item = annotation_data[f"item{index}"]
        cat_id = item["category_id"]
        x1, y1, x2, y2 = item["bounding_box"]
        # Clamp each side at zero. Without this, a box inverted on both axes
        # (x2 < x1 and y2 < y1) multiplies two negatives into a positive area
        # and could be picked as dominant, while a box inverted on one axis
        # is already rejected. Clamping treats both cases as empty boxes.
        area = max(0, x2 - x1) * max(0, y2 - y1)
        if area > largest_area:
            largest_area = area
            dominant_cat_id = cat_id
        index += 1

    return dominant_cat_id


def parse_dataset(img_dir, anno_dir):
    """Index the images of one split by their dominant clothing category.

    Every ``.json`` file in ``anno_dir`` is paired with the image in
    ``img_dir`` that has the same base name and a ``.jpg`` extension.
    Annotations without such an image are skipped, as are annotations for
    which ``find_dominant_item`` returns ``None``.

    Args:
        img_dir: Directory containing the ``.jpg`` images.
        anno_dir: Directory containing the ``.json`` annotation files.

    Returns:
        A ``defaultdict(list)`` mapping each dominant category ID to a list of
        ``(image_filename, annotation_filename)`` tuples, ordered by
        annotation file name.
    """
    category_index = defaultdict(list)

    # os.listdir returns entries in arbitrary, filesystem-dependent order.
    # Sort them so the per-category lists (and therefore any seeded shuffle
    # applied to them later) come out the same on every machine and run.
    anno_files = sorted(
        name for name in os.listdir(anno_dir) if name.endswith(".json")
    )

    for anno_file in anno_files:
        image_id = os.path.splitext(anno_file)[0]
        img_filename = image_id + ".jpg"

        # Skip annotations whose image is not present on disk.
        if not os.path.isfile(os.path.join(img_dir, img_filename)):
            continue

        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open(os.path.join(anno_dir, anno_file), 'r', encoding="utf-8") as f:
            data = json.load(f)

        dominant_cat_id = find_dominant_item(data)
        if dominant_cat_id is not None:
            category_index[dominant_cat_id].append((img_filename, anno_file))

    return category_index


def sample_category_items(category_index, desired_count):
    """Randomly pick up to ``desired_count`` entries from every category.

    Args:
        category_index: Mapping of category ID to a list of entries.
        desired_count: Maximum number of entries to keep per category.

    Returns:
        A new dict with the same keys as ``category_index``. Each value is a
        new list holding at most ``desired_count`` entries in shuffled order;
        categories with fewer entries keep all of them.

    The input lists are left untouched. Shuffling is done with the global
    ``random`` module, so results depend on its current state.
    """
    sampled = {}
    for cat_id, items in category_index.items():
        # Shuffle a copy: random.shuffle works in place and would otherwise
        # reorder the caller's lists, so re-running the sampling on the same
        # index would start from a different order each time.
        shuffled = list(items)
        random.shuffle(shuffled)
        sampled[cat_id] = shuffled[:desired_count]
    return sampled

def copy_subset_to_folder(sample_dict, src_img_dir, src_anno_dir, dst_img_dir, dst_anno_dir, csv_path):
    """Copy the sampled files into the subset folders and record them in a CSV.

    Args:
        sample_dict: Mapping of category ID to a list of
            ``(image_filename, annotation_filename)`` pairs.
        src_img_dir: Directory the images are copied from.
        src_anno_dir: Directory the annotations are copied from.
        dst_img_dir: Directory the images are copied to.
        dst_anno_dir: Directory the annotations are copied to.
        csv_path: Path of the CSV file to write (overwritten if it exists).

    The CSV starts with a header row and then holds one row per copied pair:
    image_filename, annotation_filename, category_id, category_name. The name
    comes from ``category_map``; an ID missing from it is written as
    ``cat_<id>``.
    """
    header = ["image_filename", "annotation_filename", "category_id", "category_name"]

    with open(csv_path, 'w', newline='') as csv_file:
        rows = csv.writer(csv_file)
        rows.writerow(header)

        for category_id, file_pairs in sample_dict.items():
            label = category_map.get(category_id, f"cat_{category_id}")

            for image_name, annotation_name in file_pairs:
                # Image first, then its annotation, then the metadata row.
                shutil.copyfile(
                    os.path.join(src_img_dir, image_name),
                    os.path.join(dst_img_dir, image_name),
                )
                shutil.copyfile(
                    os.path.join(src_anno_dir, annotation_name),
                    os.path.join(dst_anno_dir, annotation_name),
                )
                rows.writerow([image_name, annotation_name, category_id, label])

In [4]:
# Build the category -> files index for the training split and report its size
print("Parsing TRAIN annotations...")
train_index = parse_dataset(TRAIN_IMG_DIR, TRAIN_ANNO_DIR)
n_train_images = sum(map(len, train_index.values()))
print(f"Found dominant items in {n_train_images} train images total.")

# Same for the validation split
print("Parsing VAL annotations...")
val_index = parse_dataset(VAL_IMG_DIR, VAL_ANNO_DIR)
n_val_images = sum(map(len, val_index.values()))
print(f"Found dominant items in {n_val_images} val images total.")

Parsing TRAIN annotations...
Found dominant items in 191961 train images total.
Parsing VAL annotations...
Found dominant items in 32153 val images total.


In [5]:
# Draw the per-category samples; train is sampled before val, so the two
# splits consume the shared RNG in a fixed order
train_samples = sample_category_items(train_index, DESIRED_TRAIN_PER_CAT)
val_samples = sample_category_items(val_index, DESIRED_VAL_PER_CAT)

# Report how many images were actually selected for every known category
for cat_id in sorted(category_map):
    train_count = len(train_samples.get(cat_id, ()))
    val_count = len(val_samples.get(cat_id, ()))
    print(f"Category {cat_id} (train): {train_count} / {DESIRED_TRAIN_PER_CAT}")
    print(f"Category {cat_id} (val):   {val_count} / {DESIRED_VAL_PER_CAT}")

Category 1 (train): 500 / 500
Category 1 (val):   100 / 100
Category 2 (train): 500 / 500
Category 2 (val):   100 / 100
Category 3 (train): 461 / 500
Category 3 (val):   100 / 100
Category 4 (train): 500 / 500
Category 4 (val):   100 / 100
Category 5 (train): 500 / 500
Category 5 (val):   100 / 100
Category 6 (train): 500 / 500
Category 6 (val):   100 / 100
Category 7 (train): 500 / 500
Category 7 (val):   100 / 100
Category 8 (train): 500 / 500
Category 8 (val):   100 / 100
Category 9 (train): 500 / 500
Category 9 (val):   100 / 100
Category 10 (train): 500 / 500
Category 10 (val):   100 / 100
Category 11 (train): 500 / 500
Category 11 (val):   100 / 100
Category 12 (train): 500 / 500
Category 12 (val):   100 / 100
Category 13 (train): 500 / 500
Category 13 (val):   100 / 100


In [6]:
# Each split's metadata CSV sits next to its images/ folder
train_csv_path = os.path.join(os.path.dirname(SUBSET_TRAIN_IMG_DIR), "classification_metadata.csv")
val_csv_path = os.path.join(os.path.dirname(SUBSET_VAL_IMG_DIR), "classification_metadata.csv")

# Materialise the training subset
copy_subset_to_folder(
    train_samples,
    src_img_dir=TRAIN_IMG_DIR, src_anno_dir=TRAIN_ANNO_DIR,
    dst_img_dir=SUBSET_TRAIN_IMG_DIR, dst_anno_dir=SUBSET_TRAIN_ANNO_DIR,
    csv_path=train_csv_path,
)

# Materialise the validation subset
copy_subset_to_folder(
    val_samples,
    src_img_dir=VAL_IMG_DIR, src_anno_dir=VAL_ANNO_DIR,
    dst_img_dir=SUBSET_VAL_IMG_DIR, dst_anno_dir=SUBSET_VAL_ANNO_DIR,
    csv_path=val_csv_path,
)

print("Done creating train and val subsets")

Done creating train and val subsets
