In [1]:
import glob
import json
import xml.etree.ElementTree as ET

In [2]:
def _voc_int(node, tag):
    """Return the text of child ``tag`` under ``node`` as an int.

    Goes through ``float`` first so that coordinates written as "12.0" are
    accepted; any fractional part is truncated toward zero.
    """
    return int(float(node.find(tag).text))


def voc2coco(voc_root, target_json, categories=None):
    """Convert a directory of Pascal VOC XML files into one COCO-style JSON file.

    Args:
        voc_root (str): Directory that directly contains the VOC ``*.xml`` files.
        target_json (str): Path where the generated COCO JSON is written.
        categories (dict): Optional ``{class_name: class_id}`` mapping, used to
            keep class ids aligned between datasets (e.g. training and
            validation data). Class names found in the XML files that are not
            in the mapping are added to this dict in place, each receiving
            ``max(existing ids) + 1`` so it can never collide with an id that
            is already taken. When omitted or empty, a fresh
            ``{"background": 0}`` mapping is used instead (a caller-supplied
            empty dict is left untouched).

    The written JSON has this layout:

    COCO = {
        "images": [
            {
                # unique id of the image
                "id": 0,
                "file_name": "1.jpg",
                "height": 123,
                "width": 123
            },
            ...
        ],
        "annotations": [
            {
                # id of the image this box belongs to
                "image_id": 0,
                # unique id of the box
                "id": 0,
                "category_id": 1,
                # [x_min, y_min, width, height]
                "bbox": [0, 0, 4, 4],
                "area": 16,
                # this converter only handles boxes: always an empty list
                "segmentation": [],
                # always 0
                "iscrowd": 0
            },
            ...
        ],
        "categories": [
            # 0 denotes background in the default mapping
            {
                "id": 1,
                "name": "lemon",
            },
            ...
        ]
    }
    """
    # Sorted so that image ids are reproducible from run to run.
    xml_files = sorted(glob.glob(f"{voc_root}/*.xml"))
    images = []
    annotations = []
    if not categories:
        categories = {"background": 0}
    # len(categories) is only a free id when the ids are exactly 0..n-1; going
    # one past the largest existing id is safe for any mapping (1-based, gaps).
    next_cls_id = max(categories.values()) + 1

    for img_id, xml_path in enumerate(xml_files):
        root = ET.parse(xml_path).getroot()

        # ---- image record ----
        # <filename> may carry a directory prefix; keep only the base name.
        file_name = root.find("filename").text.split("/")[-1]
        size = root.find("size")
        images.append(
            {
                "id": img_id,
                "file_name": file_name,
                "height": int(size.find("height").text),
                "width": int(size.find("width").text)
            }
        )

        # ---- one annotation record per <object> ----
        for obj in root.findall("object"):
            cls = obj.find("name").text
            if cls not in categories:
                categories[cls] = next_cls_id
                next_cls_id += 1

            box = obj.find("bndbox")
            x_min = _voc_int(box, "xmin")
            y_min = _voc_int(box, "ymin")
            w = _voc_int(box, "xmax") - x_min
            h = _voc_int(box, "ymax") - y_min

            annotations.append({
                "image_id": img_id,
                # annotation ids are consecutive across the whole dataset
                "id": len(annotations),
                "category_id": categories[cls],
                "bbox": [x_min, y_min, w, h],
                "area": w * h,
                "segmentation": [],
                "iscrowd": 0
            })

    category_records = [
        {"id": cls_id, "name": name} for name, cls_id in categories.items()
    ]

    with open(target_json, "w") as f:
        json.dump(
            {
                "images": images,
                "annotations": annotations,
                "categories": category_records
            },
            f
        )
    print(f"共有{len(images)}张image\n"
          f"共有{len(annotations)}个bbox\n"
          f"类别分别为{[item['name'] for item in category_records]}")

In [3]:
# Fixed class order shared by every dataset split; a name's position in this
# tuple is its COCO category id (0 is reserved for background).
CLASS_NAMES = (
    "background",
    "knife",
    "scissor",
    "glassbottle",
    "tongs",
    "metalcup",
    "umbrella",
    "lighter",
    "pressure",
    "laptop",
)

# name -> id lookup in the form voc2coco expects
categories = {name: idx for idx, name in enumerate(CLASS_NAMES)}

In [4]:
# Convert every VOC XML under sift_data/labels/ into one COCO annotation file,
# reusing the predefined name -> id mapping.
voc2coco(
    voc_root="sift_data/labels/",
    target_json="sift_data/coco_ann.json",
    categories=categories,
)

共有4497张image
共有7078个bbox
类别分别为['background', 'knife', 'scissor', 'glassbottle', 'tongs', 'metalcup', 'umbrella', 'lighter', 'pressure', 'laptop']
