# In [1]:
import xml.etree.ElementTree as ET
import os

# Name of the dataset to convert; the usage section at the bottom of this
# file picks the converter and the input/output paths based on this value.
project = 'dog_park'

# Annotation label name -> YOLO class index. Both converter functions read
# this mapping to translate the `label` attribute found in the XML.
#
# Alternative mapping kept for reference (presumably for the 'hockey'
# project -- confirm before re-enabling):
# classes = {'player': 0, 'keeper': 1, 'ref': 2}
classes = {'dog': 0}

def convert_annotations_to_yolo(xml_file_path, output_dir, image_width, image_height):
    """Write YOLO label files from an XML file of per-image box annotations.

    Every ``<image>`` element directly under the XML root is scanned for
    ``<box>`` children. A box is kept only when its ``label`` attribute is a
    key of the module-level ``classes`` mapping; other boxes are skipped.
    Each image that yields at least one kept box gets a file named
    ``frame_<id>.txt`` (id zero-padded to six digits) in ``output_dir``,
    holding one ``<class> <x_center> <y_center> <width> <height>`` line per
    box. Images without any kept box produce no file.

    Args:
        xml_file_path: Path of the XML annotation file to read.
        output_dir: Directory for the ``.txt`` files; created when missing.
        image_width: Divisor applied to x coordinates and box widths
            (assumed to be the image width in pixels -- confirm with caller).
        image_height: Divisor applied to y coordinates and box heights
            (assumed to be the image height in pixels -- confirm with caller).
    """
    root = ET.parse(xml_file_path).getroot()

    # Create the destination folder only after the XML parsed successfully.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Image id -> list of already formatted YOLO lines, in document order.
    lines_by_frame = {}
    for image_el in root.findall('image'):
        frame_id = int(image_el.get('id'))

        for box_el in image_el.findall('box'):
            try:
                class_id = classes[box_el.get('label')]
            except KeyError:
                # Label is not one of the classes being exported.
                continue

            left, top, right, bottom = (
                float(box_el.get(attr)) for attr in ('xtl', 'ytl', 'xbr', 'ybr')
            )

            # Corner coordinates -> centre/size, scaled by the image size.
            cx = (left + right) / 2 / image_width
            cy = (top + bottom) / 2 / image_height
            box_w = (right - left) / image_width
            box_h = (bottom - top) / image_height

            lines_by_frame.setdefault(frame_id, []).append(
                f'{class_id} {cx} {cy} {box_w} {box_h}'
            )

    for frame_id, yolo_lines in lines_by_frame.items():
        label_path = os.path.join(output_dir, f'frame_{frame_id:06d}.txt')
        with open(label_path, 'w') as label_file:
            label_file.writelines(line + '\n' for line in yolo_lines)


def convert_cvat_video_to_yolo(xml_file_path, output_dir, image_width, image_height):
    """Write YOLO label files from a CVAT video (track-based) XML export.

    Every ``<track>`` element directly under the XML root is read; its
    ``label`` attribute is translated through the module-level ``classes``
    mapping. Each ``<box>`` of the track contributes one
    ``<class> <x_center> <y_center> <width> <height>`` line to the file
    ``frame_<frame>.txt`` in ``output_dir`` (the frame number is not
    zero-padded). Boxes flagged ``outside="1"`` are skipped, because CVAT
    uses that flag for frames in which the tracked object is not visible.
    Frames without any remaining box produce no file.

    Args:
        xml_file_path: Path of the CVAT XML annotation file to read.
        output_dir: Directory for the ``.txt`` files; created when missing.
        image_width: Divisor applied to x coordinates and box widths
            (assumed to be the frame width in pixels -- confirm with caller).
        image_height: Divisor applied to y coordinates and box heights
            (assumed to be the frame height in pixels -- confirm with caller).

    Raises:
        KeyError: If a track's label is not a key of ``classes``.
    """
    # Parse the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    # Frame number -> list of already formatted YOLO lines.
    frame_data = {}
    for track in root.findall('track'):
        label_id = classes[track.get('label')]

        for box in track.findall('box'):
            # An "outside" box only records that the object left the frame;
            # it must not become a training label.
            if box.get('outside') == '1':
                continue

            frame = int(box.get('frame'))
            xtl = float(box.get('xtl'))
            ytl = float(box.get('ytl'))
            xbr = float(box.get('xbr'))
            ybr = float(box.get('ybr'))

            # Convert corner coordinates to normalised centre/size (YOLO format)
            x_center = (xtl + xbr) / 2 / image_width
            y_center = (ytl + ybr) / 2 / image_height
            width = (xbr - xtl) / image_width
            height = (ybr - ytl) / image_height

            frame_data.setdefault(frame, []).append(
                f'{label_id} {x_center} {y_center} {width} {height}'
            )

    for frame, boxes in frame_data.items():
        txt_file_path = os.path.join(output_dir, f'frame_{frame}.txt')
        with open(txt_file_path, 'w') as txt_file:
            txt_file.writelines(box + '\n' for box in boxes)

# Usage: run the converter that matches the selected `project`.
# Any other value of `project` runs nothing.
if project == 'dog_park':
    convert_annotations_to_yolo(
        '/Users/eric/Desktop/2-Career/Projects/ObjectDetection/dog_park/labels_cvat_xml/dogs_6s.xml',
        '/Users/eric/Desktop/2-Career/Projects/ObjectDetection/dog_park/labels',
        1920,
        1080,
    )
elif project == 'hockey':
    convert_cvat_video_to_yolo(
        '/Users/eric/Desktop/2-Career/Projects/ObjectDetection/hockey/labels_cvat_xml/CAR_BOS_2019_001.xml',
        '/Users/eric/Desktop/2-Career/Projects/ObjectDetection/hockey/labels/CAR_BOS_2019_001',
        1280,
        720,
    )

