Organizar el dataset

In [5]:
import os
from pathlib import Path
import numpy as np
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import uuid


# ============================================================
# CONFIGURATION
# ============================================================

# Folder scanned (non-recursively) for the raw, age-prefixed image files.
INPUT_DIR = "dataset"
# Root folder that receives the <split>/<category>/ output tree.
OUTPUT_DIR = "dataset_by_age2"
# Size tuple handed to PIL's resize(); every exported image gets this size.
IMG_SIZE = (128, 128)


# ============================================================
# LABEL EXTRACTION
# ============================================================

def extract_age_from_filename(filename):
    """
    Extract the age encoded at the start of a UTKFace-style filename.

    Examples of accepted names:
    - '1_0_0_20161219140623097.jpg'
    - '1_0_0_20161219140623097.jpg.chip.jpg'
    - '12_1_3_201701022015.png'

    The age is the text before the first underscore, parsed with int().

    Args:
        filename: File name (not a full path) as a string.

    Returns:
        The age as an int, or None when the leading token is not an integer
        (e.g. '.DS_Store', 'README.txt') or `filename` is not a string.
    """
    try:
        return int(filename.split("_")[0])
    except (AttributeError, TypeError, ValueError):
        # Only parsing/type problems mean "this is not a labelled image".
        # A bare `except:` would also swallow KeyboardInterrupt and
        # SystemExit, making a long scan impossible to interrupt cleanly.
        return None


def age_to_category(age):
    """Map a numeric age to one of the three category labels.

    Ages below 40 give "joven", ages below 65 give "medio", and everything
    else gives "anciano".
    """
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, category in ((40, "joven"), (65, "medio")):
        if age < upper_bound:
            return category
    return "anciano"


# ============================================================
# IMAGE PROCESSING
# ============================================================

def process_image(path, size=None):
    """Load an image, convert it to RGB and resize it.

    Args:
        path: Location of the image file; anything PIL's Image.open accepts.
        size: Optional (width, height) target. Defaults to the module-level
            IMG_SIZE, looked up at call time.

    Returns:
        A new PIL image in RGB mode with the requested size.
    """
    if size is None:
        size = IMG_SIZE
    # Image.open is lazy and keeps the file open. The context manager closes
    # it deterministically once convert()/resize() have produced an
    # independent in-memory copy, instead of relying on garbage collection
    # while thousands of files are processed in a loop.
    with Image.open(path) as source:
        return source.convert("RGB").resize(size)


# ============================================================
# DIRECTORY CREATION
# ============================================================

def recreate_structure(base_dir, classes, clean=False):
    """Create the <base_dir>/<split>/<class> folder tree.

    The splits are fixed to "train", "validation" and "test".

    Args:
        base_dir: Root output folder.
        classes: Iterable of class-folder names. It is materialized once, so
            one-shot iterables (generators) work for all three splits.
        clean: When True, an existing <base_dir>/<split>/<class> folder is
            deleted together with its contents before being created again.
            The default (False) keeps whatever is already there, which is
            the original behaviour. Note that with the default, re-running
            an export that writes randomly named files adds to the old
            files instead of replacing them.
    """
    import shutil  # local import: this cell does not import shutil itself

    class_names = list(classes)
    for split in ("train", "validation", "test"):
        for cls in class_names:
            target = Path(base_dir, split, cls)
            if clean and target.exists():
                shutil.rmtree(target)
            target.mkdir(parents=True, exist_ok=True)


# ============================================================
# SCAN DATASET
# ============================================================

print("Scanning UTKFace image files...")

input_dir = Path(INPUT_DIR)
# iterdir() yields entries in arbitrary, filesystem-dependent order. Sorting
# fixes the input order so the seeded stratified split is reproducible.
files = sorted(f for f in input_dir.iterdir() if f.is_file())

records = []
for f in tqdm(files):
    age = extract_age_from_filename(f.name)
    if age is None:
        # Not an age-prefixed file (e.g. hidden/system files): skip it.
        continue
    category = age_to_category(age)
    records.append((f, category))

print(f"Valid images found: {len(records)}")

if not records:
    # Fail here with a clear message rather than later inside the split.
    raise ValueError(f"No age-prefixed image files found in {INPUT_DIR!r}")

paths = np.array([r[0] for r in records])
labels = np.array([r[1] for r in records])

# Build the class list from the plain-str categories so it holds ordinary
# str objects rather than numpy.str_ scalars taken from the label array.
classes = sorted({label for _, label in records})
print("Classes:", classes)


# ============================================================
# STRATIFIED SPLIT
# ============================================================

# First cut: keep 70% for training and hold out 30%, preserving the class
# proportions of `labels`.
train_paths, temp_paths, train_labels, temp_labels = train_test_split(
    paths,
    labels,
    test_size=0.30,
    stratify=labels,
    random_state=42,
)

# Second cut: halve the held-out part into validation and test, again
# stratified on the labels.
val_paths, test_paths, val_labels, test_labels = train_test_split(
    temp_paths,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=42,
)

# Pair every path with its label, grouped under the split's folder name.
splits = {
    split_name: list(zip(split_paths, split_labels))
    for split_name, split_paths, split_labels in (
        ("train", train_paths, train_labels),
        ("validation", val_paths, val_labels),
        ("test", test_paths, test_labels),
    )
}


# ============================================================
# ENABLE DIRECTORY STRUCTURE
# ============================================================

# Make sure every <split>/<class> folder exists under OUTPUT_DIR before any
# image is written. Existing folders and their files are left as they are.
recreate_structure(OUTPUT_DIR, classes)


# ============================================================
# SAVE IMAGES STREAMING (UUID + PNG)
# ============================================================

def save_split(name, split_data, output_base):
    """Resize and write every image of one split as a PNG file.

    Args:
        name: Split folder name; also shown in the progress message.
        split_data: Iterable of (source_path, label) pairs.
        output_base: Root output folder.

    Each image is written to <output_base>/<name>/<label>/ under a random
    UUID4 hex name. This function does not create folders itself.
    """
    print(f"\nSaving {name}...")
    for src_path, category in tqdm(split_data):
        resized = process_image(src_path)

        # A random file name, so the output name is unrelated to the source name.
        target = Path(output_base, name, category, f"{uuid.uuid4().hex}.png")

        resized.save(target, format="PNG")


# Export the three splits in a fixed order: train, validation, test.
for split_name in ("train", "validation", "test"):
    save_split(split_name, splits[split_name], OUTPUT_DIR)

print("\nDataset ready!")
print("Output directory:", OUTPUT_DIR)


Scanning UTKFace image files...


100%|██████████| 10137/10137 [00:00<00:00, 1460987.55it/s]


Valid images found: 10137
Classes: [np.str_('anciano'), np.str_('joven'), np.str_('medio')]

Saving train...


100%|██████████| 7095/7095 [01:07<00:00, 105.55it/s]



Saving validation...


100%|██████████| 1521/1521 [00:14<00:00, 107.15it/s]



Saving test...


100%|██████████| 1521/1521 [00:15<00:00, 99.80it/s] 


Dataset ready!
Output directory: dataset_by_age2





Crear una versión normalizada (balanceada: igual cantidad de muestras por clase)

In [6]:
import os
import shutil
import random
from pathlib import Path

# ==============================
# Configuración
# ==============================
# NOTE(review): INPUT_DIR and OUTPUT_DIR rebind the names set by the earlier
# export cell; the values below stay in effect until that cell is run again.
# Source tree to read from, laid out as <split>/<class>/*.png.
INPUT_DIR = "dataset_by_age2"
# Destination tree for the class-balanced copy.
OUTPUT_DIR = "dataset_by_age_normalized2"

# Folder names used when creating the output tree; CLASSES also keys the
# per-class file lists.
SPLITS = ['train', 'validation', 'test']
CLASSES = ['joven', 'medio', 'anciano']
# Fractions for re-splitting. Train and validation sizes are derived from
# these; test receives whatever remains.
SPLIT_RATIOS = {'train': 0.7, 'validation': 0.15, 'test': 0.15}

# Seed Python's `random` module so sampling and shuffling repeat for an
# identical input file listing.
random.seed(42)

# ==============================
# Funciones
# ==============================
def make_dirs(base_dir):
    """Create <base_dir>/<split>/<class> for every SPLITS x CLASSES pair.

    Missing parent folders are created; folders that already exist are
    left untouched.
    """
    for split_name in SPLITS:
        for class_name in CLASSES:
            target = Path(base_dir) / split_name / class_name
            target.mkdir(parents=True, exist_ok=True)

def get_all_files(input_dir, classes=None, splits=None):
    """Collect the PNG files of every class, pooled across all splits.

    Args:
        input_dir: Root folder laid out as <split>/<class>/*.png.
        classes: Class folder names to collect. Defaults to the module-level
            CLASSES.
        splits: Split folder names to pool. Defaults to the module-level
            SPLITS.

    Returns:
        A dict mapping class name -> list of Path objects in sorted order.
    """
    if classes is None:
        classes = CLASSES
    if splits is None:
        splits = SPLITS

    class_files = {}
    for cls in classes:
        pooled = []
        for split in splits:
            pooled.extend(Path(input_dir, split, cls).glob("*.png"))
        # glob() returns entries in filesystem-dependent order. Sorting makes
        # seeded sampling by the caller pick the same files on every run.
        class_files[cls] = sorted(pooled)
    return class_files

def split_files(files, ratios):
    """Shuffle `files` and cut them into train / validation / test lists.

    Args:
        files: Sequence of items to split. It is not modified.
        ratios: Mapping with 'train' and 'validation' fractions. The test
            part is whatever remains, so a 'test' entry is not read.

    Returns:
        A (train_files, val_files, test_files) tuple of lists. Train and
        validation sizes are int(ratio * n), i.e. rounded down.
    """
    # Shuffle a copy so the caller's list keeps its order. The random
    # numbers consumed are the same as when shuffling in place.
    shuffled = list(files)
    random.shuffle(shuffled)

    total = len(shuffled)
    n_train = int(ratios['train'] * total)
    n_val = int(ratios['validation'] * total)
    val_end = n_train + n_val

    return shuffled[:n_train], shuffled[n_train:val_end], shuffled[val_end:]

# ==============================
# Pipeline principal
# ==============================

# Create the output folder tree.
make_dirs(OUTPUT_DIR)

# Pool the files of every class across the existing splits.
class_files = get_all_files(INPUT_DIR)

# The smallest class dictates how many images every class keeps.
min_count = min(len(files) for files in class_files.values())
if min_count == 0:
    # An empty class would silently produce an empty "balanced" dataset.
    empty_classes = [cls for cls, files in class_files.items() if not files]
    raise ValueError(
        f"No PNG files found under {INPUT_DIR!r} for class(es): {empty_classes}"
    )
print("Cantidad de imágenes por clase tras normalizar:", min_count)

# For every class: undersample to min_count, re-split, and copy.
# NOTE: files already present in OUTPUT_DIR are not removed. A copy only
# overwrites a file that has the same name in the same folder.
for cls, files in class_files.items():
    selected_files = random.sample(files, min_count)  # undersampling
    train_files, val_files, test_files = split_files(selected_files, SPLIT_RATIOS)

    for split, members in (
        ('train', train_files),
        ('validation', val_files),
        ('test', test_files),
    ):
        for f in members:
            shutil.copy(f, Path(OUTPUT_DIR, split, cls, f.name))

print("Dataset balanceado y normalizado creado en:", OUTPUT_DIR)


Cantidad de imágenes por clase tras normalizar: 1148
Dataset balanceado y normalizado creado en: dataset_by_age_normalized2
