In [1]:
import os
import json
import shutil
import random

# Define paths
dataset_root = "train_dataset"
images_dir = os.path.join(dataset_root, "images")
annotations_file = os.path.join(dataset_root, "instances_default.json")

# Load COCO annotations.
# Decode explicitly as UTF-8 rather than relying on the platform's default
# text encoding, which may differ and mis-decode non-ASCII file names.
with open(annotations_file, "r", encoding="utf-8") as f:
    coco_data = json.load(f)

# Get list of image file names (in the order they appear in the JSON)
image_filenames = [img["file_name"] for img in coco_data["images"]]

# Shuffle and split (80% train, 20% val).
# A dedicated, seeded generator keeps the split reproducible: the images are
# physically moved further down, so if the script is re-run after a partial
# failure, the same annotation file must produce the same assignment or the
# annotation files would no longer match the images already on disk.
split_seed = 42
random.Random(split_seed).shuffle(image_filenames)
split_idx = int(0.8 * len(image_filenames))
train_images = set(image_filenames[:split_idx])
val_images = set(image_filenames[split_idx:])

# Output locations for the two subsets; both are created up front so the
# moves below never target a missing directory.
train_dir = os.path.join(dataset_root, "processed/train")
val_dir = os.path.join(dataset_root, "processed/val")
for out_dir in (train_dir, val_dir):
    os.makedirs(out_dir, exist_ok=True)

# Walk the files actually present in the image folder and relocate each one
# to the subset it was assigned to. Files that belong to neither subset
# (i.e. are not listed in the annotations) are left where they are.
for name in os.listdir(images_dir):
    if name in train_images:
        target_dir = train_dir
    elif name in val_images:
        target_dir = val_dir
    else:
        continue
    shutil.move(os.path.join(images_dir, name), os.path.join(target_dir, name))

print("Dataset split complete.")

# Build one COCO dictionary per subset.
# Only the "images" and "annotations" lists are split; every other top-level
# entry (e.g. "categories", "licenses", "info") is carried over unchanged.
# Emptying every list-valued key would drop the category definitions and
# leave the resulting files unusable as COCO datasets.
split_keys = ("images", "annotations")
train_annotations = {k: [] if k in split_keys else v for k, v in coco_data.items()}
val_annotations = {k: [] if k in split_keys else v for k, v in coco_data.items()}

# Filter images by the subset their file name was assigned to
train_annotations["images"] = [img for img in coco_data["images"] if img["file_name"] in train_images]
val_annotations["images"] = [img for img in coco_data["images"] if img["file_name"] in val_images]

# Annotations follow their image: select them through the image ids
train_image_ids = {img["id"] for img in train_annotations["images"]}
val_image_ids = {img["id"] for img in val_annotations["images"]}

train_annotations["annotations"] = [ann for ann in coco_data["annotations"] if ann["image_id"] in train_image_ids]
val_annotations["annotations"] = [ann for ann in coco_data["annotations"] if ann["image_id"] in val_image_ids]

# Save new JSON files.
# The target directory must exist before opening files for writing; without
# this, open(..., "w") raises FileNotFoundError.
annotations_dir = os.path.join(dataset_root, "processed/annotations")
os.makedirs(annotations_dir, exist_ok=True)

with open(os.path.join(annotations_dir, "instances_train.json"), "w", encoding="utf-8") as f:
    json.dump(train_annotations, f)

with open(os.path.join(annotations_dir, "instances_val.json"), "w", encoding="utf-8") as f:
    json.dump(val_annotations, f)

print("Annotations split complete.")



FileNotFoundError: [Errno 2] No such file or directory: 'train_dataset/instances_default.json'