In [1]:
import os
import shutil
import random

In [5]:
def split_dataset(images_dir, labels_dir, output_dir, train_ratio=0.8, val_ratio=0.12, test_ratio=0.08, seed=42, fall_class_id=1):
    """Copy an image/label dataset into class-stratified train/val/test folders.

    Every image in ``images_dir`` whose name ends in .jpg/.png/.jpeg (any
    letter case) is paired with the label file ``<stem>.txt`` in
    ``labels_dir``. The first whitespace-separated token of each non-blank
    label line is parsed as an integer class id. An image counts as "fall"
    when any of its lines carries ``fall_class_id``, otherwise as "no fall".
    Images with a missing or empty label file are skipped.

    Each of the two groups is shuffled and sliced independently, so every
    split keeps roughly the same fall / no-fall proportion. Files are then
    copied (not moved) to ``output_dir/<split>/images`` and
    ``output_dir/<split>/labels``.

    Args:
        images_dir: Folder containing the source images.
        labels_dir: Folder containing one ``.txt`` label file per image.
        output_dir: Destination root; created if it does not exist.
        train_ratio: Fraction of each group assigned to ``train``.
        val_ratio: Fraction of each group assigned to ``val``.
        test_ratio: Expected fraction for ``test``. The test split always
            receives whatever is left after train and val, so this value is
            only used to check that the three ratios are consistent.
        seed: Seed for the private random generator used for shuffling.
        fall_class_id: Class id that marks an image as "fall".

    Raises:
        ValueError: If any ratio is negative (negative slice sizes would make
            the splits overlap), or if a label line does not start with an
            integer.

    Note:
        Files already present under ``output_dir`` are not removed. Re-running
        with a different seed or different ratios into the same folder can
        therefore leave the same image in more than one split.
    """
    import warnings

    if min(train_ratio, val_ratio, test_ratio) < 0:
        raise ValueError("train_ratio, val_ratio and test_ratio must be non-negative")
    if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-6:
        warnings.warn(
            "train_ratio + val_ratio + test_ratio != 1; the test split receives "
            "whatever remains after train and val, so test_ratio is not honoured exactly."
        )

    # Create the output folders
    for split in ('train', 'val', 'test'):
        for kind in ('images', 'labels'):
            os.makedirs(os.path.join(output_dir, split, kind), exist_ok=True)

    def label_name(img_file):
        # Label files share the image's stem and use a .txt extension.
        return img_file.rsplit('.', 1)[0] + '.txt'

    fall_files = []
    nofall_files = []

    # os.listdir returns names in arbitrary order; sorting first makes the
    # seeded shuffle below reproducible across machines and filesystems.
    image_files = sorted(
        f for f in os.listdir(images_dir)
        if f.lower().endswith(('.jpg', '.png', '.jpeg'))
    )

    # Walk the images and classify each one according to its label
    for img_file in image_files:
        label_path = os.path.join(labels_dir, label_name(img_file))

        if not os.path.exists(label_path):
            continue  # Skip images without a label file

        # Blank lines are ignored so a stray empty line cannot crash the parse.
        with open(label_path, 'r') as f:
            clases = [int(line.split()[0]) for line in f if line.strip()]

        if not clases:
            continue  # Skip images whose label file is empty

        if fall_class_id in clases:
            fall_files.append(img_file)
        else:
            nofall_files.append(img_file)

    print(f"Total Fall: {len(fall_files)}, Total No Fall: {len(nofall_files)}")

    # A private generator yields the same sequence as random.seed(seed) would,
    # without reseeding the interpreter-wide RNG behind the caller's back.
    rng = random.Random(seed)

    # Shuffle each group independently
    rng.shuffle(fall_files)
    rng.shuffle(nofall_files)

    def dividir_lista(lista):
        # Slice one group into train / val / test; test takes the remainder.
        total = len(lista)
        train_size = int(total * train_ratio)
        val_size = int(total * val_ratio)

        train_part = lista[:train_size]
        val_part = lista[train_size:train_size + val_size]
        test_part = lista[train_size + val_size:]

        return train_part, val_part, test_part

    # Split each class separately so the class balance carries into every split
    fall_train, fall_val, fall_test = dividir_lista(fall_files)
    nofall_train, nofall_val, nofall_test = dividir_lista(nofall_files)

    # Merge the per-class pieces
    train_files = fall_train + nofall_train
    val_files = fall_val + nofall_val
    test_files = fall_test + nofall_test

    # Shuffle again inside each split so the classes are interleaved
    rng.shuffle(train_files)
    rng.shuffle(val_files)
    rng.shuffle(test_files)

    splits = [('train', train_files), ('val', val_files), ('test', test_files)]

    # Copy every image together with its label file
    for split_name, split_files in splits:
        for img_file in split_files:
            label_file = label_name(img_file)

            shutil.copy(os.path.join(images_dir, img_file), os.path.join(output_dir, split_name, 'images', img_file))
            shutil.copy(os.path.join(labels_dir, label_file), os.path.join(output_dir, split_name, 'labels', label_file))

    print("Separación balanceada completada exitosamente.")

In [6]:
# Source folders: every image and its label file (same stem, .txt extension)
images_dir = 'dataset/CAUCAFall/all_imgs'
labels_dir = 'dataset/CAUCAFall/all_labels'
# Destination root; the train/val/test subfolders are created underneath it
output_dir = 'dataset/CAUCAFall/cleaned_dataset'

# Run the split with the function's default ratios and seed
split_dataset(images_dir=images_dir, labels_dir=labels_dir, output_dir=output_dir)

Total Fall: 6388, Total No Fall: 13607
Separación balanceada completada exitosamente.
