# Train Test Split
Split the exported train dataset into train, val, and test subsets. Exporting a dataset from CVAT does not split it automatically by task, so if the subsets were not specified we need to split it manually. This notebook does that.

In [1]:
import os
import io
import yaml
import random
import shutil
from tqdm import tqdm

Specify the train, val, and test composition (in percent; the three values must sum to 100)

In [2]:
# Share of the dataset assigned to each subset, expressed in percent.
n_train, n_val, n_test = 70, 20, 10

# The three shares have to cover the whole dataset exactly.
if n_train + n_val + n_test != 100:
    raise ValueError("Total composition must be 100!")

In [3]:
# Root folder of the extracted CVAT export.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
dir_dataset = "C:/Users/eats/projects/IASS/asset/task_20250507-epsilon-skip_1_dataset_2025_05_19_03_18_32_ultralytics yolo detection 1.0"
dir_output = f"{dir_dataset}/data-split"

# Create the output tree: data-split/{images,labels}/{train,val,test}.
# Creating the leaf folders also creates dir_output itself.
for kind in ("images", "labels"):
    for subset in ("train", "val", "test"):
        os.makedirs(f"{dir_output}/{kind}/{subset}", exist_ok=True)

# Read explicitly as UTF-8 (the same encoding this notebook uses when it writes
# data.yaml back) instead of relying on the platform's default encoding.
with open(f"{dir_dataset}/data/data.yaml", "r", encoding="utf-8") as file:
    data_yaml = yaml.safe_load(file)

# One image path per line; blank or whitespace-only lines are ignored.
with open(f"{dir_dataset}/data/train.txt", "r", encoding="utf-8") as file:
    paths_train = [line.strip() for line in file if line.strip()]



In [4]:
# Inspect the parsed data.yaml content loaded from the export.
data_yaml

{'names': {0: 'bicycle',
  1: 'motorbike',
  2: 'car',
  3: 'truck',
  4: 'bus',
  5: 'person'},
 'path': '.',
 'train': 'train.txt'}

In [5]:
# Peek at the first listed image path to see the path format used in train.txt.
paths_train[0]

'data/images/train/epsilon_0.jpg'

Function to split dataset into train, val, and test

In [6]:
def split_dataset(data, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=None):
    """Shuffle ``data`` and cut it into train, val and test lists.

    Args:
        data: Iterable of samples (e.g. image paths). It is copied into a new
            list; the caller's object is never modified.
        train_ratio: Fraction of samples for the train subset.
        val_ratio: Fraction of samples for the val subset.
        test_ratio: Fraction of samples for the test subset.
        seed: Optional seed. When given, a private random generator seeded with
            it is used, so the result is reproducible and the global ``random``
            state is left untouched. When ``None``, the global ``random``
            module is used as-is.

    Returns:
        A ``(train, val, test)`` tuple of lists. The train and val sizes are
        ``floor(ratio * len(data))``; test receives all remaining samples.

    Raises:
        ValueError: If any ratio is negative or the ratios do not sum to 1.0.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(ratio < 0 for ratio in ratios):
        raise ValueError("Ratios must be non-negative")
    if not abs(sum(ratios) - 1.0) < 1e-6:
        raise ValueError("Ratios must sum to 1.0")

    # A dedicated generator yields the same shuffle as seeding the global one,
    # without clobbering the random state other code may rely on.
    rng = random if seed is None else random.Random(seed)

    items = list(data)
    rng.shuffle(items)

    total = len(items)
    # The small epsilon guards against float error such as 0.29 * 100 == 28.999...,
    # which plain int() would truncate to 28 instead of 29.
    eps = 1e-9
    train_end = min(total, int(train_ratio * total + eps))
    val_end = min(total, train_end + int(val_ratio * total + eps))

    train = items[:train_end]
    val = items[train_end:val_end]
    test = items[val_end:]

    return train, val, test



In [7]:
# Use the percentages configured earlier in the notebook instead of the function
# defaults, and fix the seed so re-running the notebook reproduces the same split.
split_train, split_val, split_test = split_dataset(
    paths_train,
    train_ratio=n_train / 100,
    val_ratio=n_val / 100,
    test_ratio=n_test / 100,
    seed=42,
)

len(split_train), len(split_val), len(split_test)

(1050, 300, 150)

In [8]:
# Show the first entry of each subset as a quick look at the shuffled result.
split_train[0], split_val[0], split_test[0]

('data/images/train/epsilon_955.jpg',
 'data/images/train/epsilon_220.jpg',
 'data/images/train/epsilon_653.jpg')

In [9]:
dataset = {
    "train": split_train,
    "val": split_val,
    "test": split_test,
}

src_prefix = "data/"   # entries in train.txt look like "data/images/train/<file>"
src_subset = "train"   # every exported sample sits in the "train" subset folder


def _swap_dir(rel_path, old, new):
    """Rename directory segments equal to `old` to `new`; the file name is left untouched."""
    *dirs, filename = rel_path.split("/")
    return "/".join([new if part == old else part for part in dirs] + [filename])


# copy the split data to output directory
# Paths are rebuilt from their components rather than with str.replace on the full
# path, so a file name or dataset root containing "train", "/data/" or "/images/"
# cannot be rewritten by accident.
# Note: nothing here deletes files left in dir_output by an earlier run.
for name, list_path in dataset.items():
    for path in tqdm(list_path, desc=f"Split Images [{name}]"):
        if not path.startswith(src_prefix):
            raise ValueError(f"Unexpected entry (missing '{src_prefix}' prefix): {path}")

        rel_src_img = path[len(src_prefix):]                    # images/train/<file>
        rel_dst_img = _swap_dir(rel_src_img, src_subset, name)  # images/<name>/<file>

        path_src_img = f"{dir_dataset}/{path}"
        path_dst_img = f"{dir_output}/{rel_dst_img}"

        # The label sits at the same relative location under labels/ with a .txt
        # extension; splitext makes this work for any image extension, not only .jpg.
        rel_src_label = os.path.splitext(_swap_dir(rel_src_img, "images", "labels"))[0] + ".txt"
        rel_dst_label = os.path.splitext(_swap_dir(rel_dst_img, "images", "labels"))[0] + ".txt"
        path_src_label = f"{dir_dataset}/{src_prefix}{rel_src_label}"
        path_dst_label = f"{dir_output}/{rel_dst_label}"

        os.makedirs(os.path.dirname(path_dst_img), exist_ok=True)
        shutil.copyfile(path_src_img, path_dst_img)
        # An image without a label file is copied on its own.
        if os.path.exists(path_src_label):
            os.makedirs(os.path.dirname(path_dst_label), exist_ok=True)
            shutil.copyfile(path_src_label, path_dst_label)

# create txt file related to the split data
# NOTE(review): entries keep the source's "data/" prefix although the files are
# copied under data-split/ -- confirm this is what the consumer of the txt files expects.
for name, list_path in dataset.items():
    with open(f"{dir_output}/{name}.txt", "w", encoding="utf-8") as file:
        file.write("\n".join(_swap_dir(item, src_subset, name) for item in list_path))

# create yaml file related to the split data
data_yaml["val"] = "val.txt"
data_yaml["test"] = "test.txt"
with io.open(f"{dir_output}/data.yaml", 'w', encoding='utf8') as file:
    yaml.dump(data_yaml, file, default_flow_style=False, allow_unicode=True)

Split Images [train]: 100%|██████████| 1050/1050 [00:03<00:00, 328.56it/s]
Split Images [val]: 100%|██████████| 300/300 [00:00<00:00, 343.35it/s]
Split Images [test]: 100%|██████████| 150/150 [00:00<00:00, 376.79it/s]


In [10]:
# Source path of the last image handled by the copy loop (variable left over from that cell).
path_src_img

'C:/Users/eats/projects/IASS/asset/task_20250507-epsilon-skip_1_dataset_2025_05_19_03_18_32_ultralytics yolo detection 1.0/data/images/train/epsilon_610.jpg'

In [11]:
# Destination path of the last image handled by the copy loop (variable left over from that cell).
path_dst_img

'C:/Users/eats/projects/IASS/asset/task_20250507-epsilon-skip_1_dataset_2025_05_19_03_18_32_ultralytics yolo detection 1.0/data-split/images/test/epsilon_610.jpg'

In [12]:
# Show the dictionary after the "val" and "test" entries were added and it was dumped to data-split/data.yaml.
data_yaml

{'names': {0: 'bicycle',
  1: 'motorbike',
  2: 'car',
  3: 'truck',
  4: 'bus',
  5: 'person'},
 'path': '.',
 'train': 'train.txt',
 'val': 'val.txt',
 'test': 'test.txt'}