In [1]:
# Reference
# https://linuxhint.com/python_xml_to_dictionary/

In [2]:
import os
import xmltodict

In [3]:
# Directory that preprocessing() scans for '.xml' annotation files.
base = './data/ori'

In [4]:
def preprocessing(base):
    """Convert the XML annotation files in ``base`` into per-set annotation dicts.

    Every ``*.xml`` file directly inside ``base`` is parsed with ``xmltodict``.
    Each file is expected to have an ``annotation`` root holding ``folder``,
    ``filename``, ``size`` (``width``/``height``) and zero or more ``object``
    entries with ``name`` and ``bndbox`` (``xmin``/``ymin``/``xmax``/``ymax``).
    NOTE(review): this looks like the Pascal VOC layout being converted to a
    COCO-style structure -- confirm against the dataset.

    Files are processed in sorted filename order so that class, image and
    annotation ids are reproducible between runs (``os.listdir`` returns
    entries in arbitrary order).

    Args:
        base: Path of the directory containing the XML files.

    Returns:
        dict mapping each ``folder`` value (the set name) to a dict with keys
        ``"type"``, ``"images"``, ``"categories"`` and ``"annotations"``.
        Image, annotation and category ids start at 1 and are shared across
        all sets; every set receives the full category list. Boxes are stored
        as ``[xmin, ymin, width, height]``. Returns ``{}`` when ``base``
        contains no XML files.

    Raises:
        KeyError: if a file lacks one of the required keys listed above.
        ValueError: if a coordinate or image dimension is not an integer string.
    """

    def xml_to_dict(base, filename):
        # The ``with`` block closes the file; no explicit close() is required.
        with open(os.path.join(base, filename), "r") as xml_obj:
            return xmltodict.parse(xml_obj.read())

    def as_list(value):
        # xmltodict gives a single dict for one child element, a list for
        # repeated children, and nothing (None here) when there are none.
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    filenames = sorted(
        filename for filename in os.listdir(base)
        if os.path.splitext(filename)[-1] == '.xml'
    )

    annotations = {}
    cls_dict = {}

    img_id = 1
    ann_id = 1
    # Single pass: class ids are handed out on first sight of a class name,
    # which yields the same numbering as a separate class-collection pass over
    # the same file order, without parsing every file twice.
    for filename in filenames:
        ann = xml_to_dict(base, filename)['annotation']
        set_name = ann['folder']

        if set_name not in annotations:
            annotations[set_name] = {
                "type": "instances",
                "images": [],
                "categories": [],
                "annotations": []
            }

        for ann_obj in as_list(ann.get('object')):
            cls_name = ann_obj['name']
            if cls_name not in cls_dict:
                cls_dict[cls_name] = len(cls_dict) + 1

            bbx = ann_obj['bndbox']
            xmin, ymin = int(bbx['xmin']), int(bbx['ymin'])
            xmax, ymax = int(bbx['xmax']), int(bbx['ymax'])
            dx = xmax - xmin
            dy = ymax - ymin

            annotations[set_name]["annotations"].append({
                "id": ann_id,
                "bbox": [xmin, ymin, dx, dy],
                "image_id": img_id,
                "category_id": cls_dict[cls_name],
                "segmentation": [],
                "area": dx * dy,
                "iscrowd": 0
            })
            ann_id = ann_id + 1

        size = ann['size']
        annotations[set_name]["images"].append({
            "file_name": ann['filename'],
            # xmltodict returns element text as str; store numeric dimensions.
            "height": int(size['height']),
            "width": int(size['width']),
            "id": img_id
        })
        img_id = img_id + 1

    # Every set gets the complete category list, including classes that only
    # occur in other sets.
    for set_name in annotations:
        for cls_name, cls_id in cls_dict.items():
            annotations[set_name]["categories"].append({
                "supercategory": "none",
                "name": cls_name,
                "id": cls_id
            })

    return annotations

In [5]:
def analysis(annotations, display=True):
    """Count images and objects per set and optionally print a summary.

    Args:
        annotations: dict mapping a set name to a dict with ``'images'`` and
            ``'annotations'`` lists and, optionally, a ``'categories'`` list
            (each category a dict with an ``'id'``; each annotation a dict
            with a ``'category_id'``), as produced by ``preprocessing``.
        display: when true, print overall totals followed by the per-set
            split (count and fraction of the total) for images, objects and
            each category.

    Returns:
        dict mapping each set name to
        ``{'num_images': int, 'num_objects': int, 'objects': {category_id: int}}``.
        ``'objects'`` has an entry (possibly 0) for every category id listed
        in any set. An annotation whose ``category_id`` is not listed in any
        set is still counted under its id rather than raising. Returns ``{}``
        for empty input.
    """
    set_names = list(annotations.keys())

    # Gather categories from every set (first occurrence of an id wins), so an
    # id listed only in a later set is still known. Empty input yields [].
    categories = []
    seen_ids = set()
    for set_name in set_names:
        for category in annotations[set_name].get('categories', []):
            if category['id'] not in seen_ids:
                seen_ids.add(category['id'])
                categories.append(category)

    results = {}
    for set_name in set_names:
        anns = annotations[set_name]
        per_category = {category['id']: 0 for category in categories}
        for obj in anns['annotations']:
            cat_id = obj['category_id']
            per_category[cat_id] = per_category.get(cat_id, 0) + 1
        results[set_name] = {
            'num_images': len(anns['images']),
            'num_objects': len(anns['annotations']),
            'objects': per_category,
        }

    num_images = sum(results[set_name]['num_images'] for set_name in set_names)
    num_objects = sum(results[set_name]['num_objects'] for set_name in set_names)

    if display:
        rule = '-----------------------------------'

        print(rule)
        print('num_images', ' '*(20 - len(f'num_images{num_images}')), num_images)
        print('num_objects', ' '*(20 - len(f'num_objects{num_objects}')), num_objects)

        # Each section is (title, {set_name: count}); all share one row layout.
        sections = [
            ('num_images on each set',
             {set_name: results[set_name]['num_images'] for set_name in set_names}),
            ('num_objects on each set',
             {set_name: results[set_name]['num_objects'] for set_name in set_names}),
        ]
        for category in categories:
            cat_id = category['id']
            sections.append((
                f'Category: {cat_id}',
                {set_name: results[set_name]['objects'][cat_id] for set_name in set_names},
            ))

        for title, counts in sections:
            print(rule)
            print(title)
            print('')
            total = sum(counts.values())
            for set_name, count in counts.items():
                # A section with nothing to count reports 0.000 instead of
                # dividing by zero.
                pct = count / total if total else 0.0
                print(set_name, ' '*(15-len(f'{set_name}{count}')), count, ' '*2, "{:.3f}".format(pct))
        print(rule)

    return results

In [6]:
# Build the per-set annotation dicts from the XML files under `base`, then
# print the per-set image/object split and keep the counts in `results`.
annotations = preprocessing(base)
results = analysis(annotations)

-----------------------------------
num_images         180
num_objects      23139
-----------------------------------
num_images on each set

train         154    0.856
test           26    0.144
-----------------------------------
num_objects on each set

train       19487    0.842
test         3652    0.158
-----------------------------------
Category: 1

train        9476    0.836
test         1862    0.164
-----------------------------------
Category: 2

train       10011    0.848
test         1790    0.152
-----------------------------------
