# In [5]:
import json
import random
import numpy as np
import cv2
from pycocotools import mask
from sklearn.model_selection import train_test_split

def _rle_to_polygons(rle):
    """Convert one COCO RLE mask into a list of flat polygon coordinate lists.

    Args:
        rle: Dict with ``"counts"`` and ``"size"`` keys. ``"counts"`` may be
            the compressed string/bytes form or the uncompressed list-of-ints
            form.

    Returns:
        A list of polygons, each a flat ``[x0, y0, x1, y1, ...]`` list taken
        from an external contour of the decoded mask. Contours with fewer
        than three points are dropped, so the result may be empty.
    """
    # pycocotools can only decode *compressed* RLE; the uncompressed variant
    # (counts given as a list) has to be compressed first.
    if isinstance(rle["counts"], list):
        height, width = rle["size"]
        rle = mask.frPyObjects(rle, height, width)

    binary_mask = mask.decode(rle)

    # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
    # (contours, hierarchy); the contours are second-to-last in every version.
    contours = cv2.findContours(
        binary_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )[-2]

    # A polygon needs at least three (x, y) points, i.e. six coordinates.
    return [contour.flatten().tolist() for contour in contours if contour.size >= 6]


def convert_to_coco_format(input_file, output_file):
    """Rewrite an exported annotation JSON file as a COCO-style dataset file.

    The input is expected to provide ``info`` (with ``description`` and
    ``version``), ``categories``, ``images`` and ``annotations`` entries.
    RLE segmentations (dicts carrying ``counts`` and ``size``) are converted
    to polygon lists; any other segmentation value is copied through as-is.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path the converted, compactly serialized JSON is
            written to.

    Returns:
        The converted dataset as a dict (the same object that was written to
        ``output_file``).

    Raises:
        KeyError: If a required key is missing from the input document.
    """
    # JSON documents are UTF-8 by specification; do not rely on the
    # platform's default text encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    coco_format = {
        "info": {
            "description": data["info"]["description"],
            "version": data["info"]["version"],
            "year": 2024,
            "contributor": "Alex Davis",
            "date_created": "2024-07-26"
        },
        "licenses": [
            {
                "id": 1,
                "name": "CC BY 4.0",
                "url": "http://creativecommons.org/licenses/by/4.0/"
            }
        ],
        "categories": [
            {
                "id": category["id"],
                "name": category["name"],
                "supercategory": "none"
            }
            for category in data["categories"]
        ],
        "images": [
            {
                "id": image["id"],
                "file_name": image["file_name"],
                "height": image["height"],
                "width": image["width"],
                "license": 1,
                "date_captured": "2024-07-26 00:00:00"
            }
            for image in data["images"]
        ],
        "annotations": []
    }

    for annotation in data["annotations"]:
        segmentation = annotation["segmentation"]
        # Only RLE-encoded masks need conversion; polygon lists pass through.
        if (
            isinstance(segmentation, dict)
            and 'counts' in segmentation
            and 'size' in segmentation
        ):
            segmentation = _rle_to_polygons(segmentation)
        coco_format["annotations"].append({
            "id": annotation["id"],
            "image_id": annotation["image_id"],
            "category_id": annotation["category_id"],
            "segmentation": segmentation,
            "area": annotation["area"],
            "bbox": annotation["bbox"],
            "iscrowd": annotation["iscrowd"]
        })

    # Compact separators keep the (potentially large) output file small.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(coco_format, f, separators=(',', ':'))

    return coco_format

def split_dataset(coco_format, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1,
                  random_state=None):
    """Split a COCO-style dataset dict into train, validation and test subsets.

    The split is done per image: each image, together with all annotations
    that reference it, ends up in exactly one subset. ``info``, ``licenses``
    and ``categories`` are shared (not copied) between the subsets.

    Args:
        coco_format: Dataset dict with ``info``, ``licenses``, ``categories``,
            ``images`` and ``annotations`` entries.
        train_ratio: Fraction of images assigned to the training subset.
        val_ratio: Relative weight of the validation subset within the
            images left over after the training split.
        test_ratio: Relative weight of the test subset within the images
            left over after the training split.
        random_state: Optional seed forwarded to both ``train_test_split``
            calls; pass a fixed value to make the split reproducible.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of dataset dicts.
    """
    def build_subset(ids):
        # A set gives O(1) membership tests instead of scanning a list for
        # every image and annotation.
        wanted = set(ids)
        return {
            "info": coco_format["info"],
            "licenses": coco_format["licenses"],
            "categories": coco_format["categories"],
            "images": [img for img in coco_format["images"] if img["id"] in wanted],
            "annotations": [
                ann for ann in coco_format["annotations"] if ann["image_id"] in wanted
            ],
        }

    image_ids = [image["id"] for image in coco_format["images"]]

    # train_test_split rejects empty input; an empty dataset simply yields
    # three empty subsets.
    if not image_ids:
        return build_subset(()), build_subset(()), build_subset(())

    # First carve off the training images, then divide the remainder between
    # validation and test according to their relative weights.
    train_ids, temp_ids = train_test_split(
        image_ids,
        test_size=(1 - train_ratio),
        random_state=random_state,
    )
    val_ids, test_ids = train_test_split(
        temp_ids,
        test_size=(test_ratio / (val_ratio + test_ratio)),
        random_state=random_state,
    )

    return build_subset(train_ids), build_subset(val_ids), build_subset(test_ids)

# Where the exported annotations live and where the converted file goes
input_file = './export_coco-instance_davis_alexander_TEM_Project4_TEM_V02.json'
output_file = './coco_format_dataset.json'

# Step 1: convert the export (this also writes output_file)
coco_format = convert_to_coco_format(input_file, output_file)

# Step 2: partition the converted dataset by image
train_data, val_data, test_data = split_dataset(coco_format)

# Step 3: write each partition to its own compact JSON file
for split_path, split_payload in (
    ('./train_annotations.json', train_data),
    ('./val_annotations.json', val_data),
    ('./test_annotations.json', test_data),
):
    with open(split_path, 'w') as f:
        json.dump(split_payload, f, separators=(',', ':'))

print("Dataset conversion and splitting completed.")

# Output: Dataset conversion and splitting completed.
