In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

In [2]:
# source and credits:
# https://raw.githubusercontent.com/datitran/raccoon_dataset/master/xml_to_csv.py

def xmlToCsv(xml_file_path, column_name = None):
    """Parse one XML annotation file and return a single row of values.

    The row is ``(filename, width, height, class, xmin, ymin, xmax, ymax)``;
    every field except ``filename`` and ``class`` is converted to ``int``.

    Parameters
    ----------
    xml_file_path : str
        Path of the annotation file to parse.
    column_name : list, optional
        Not used by this function; accepted only so existing calls that
        pass it keep working.

    Returns
    -------
    tuple
        The row for the LAST ``<object>`` element in the file. Files with
        several objects therefore still yield exactly one row, which is what
        callers appending the result as a single record rely on.

    Raises
    ------
    ValueError
        If the file contains no ``<object>`` element.
    """
    root = ET.parse(xml_file_path).getroot()

    objects = root.findall('object')
    if not objects:
        # Fail with the offending path instead of an unrelated UnboundLocalError.
        raise ValueError(
            'No <object> element found in annotation file: {}'.format(xml_file_path)
        )

    # Only the last object is reported (earlier ones are ignored).
    member = objects[-1]

    # Children are read by position, not by tag name.
    # NOTE(review): this presumably expects the Pascal VOC / LabelImg layout
    # (size -> width, height; object -> name, ..., bndbox at index 4 holding
    # xmin, ymin, xmax, ymax) -- confirm against the actual annotation files.
    size = root.find('size')
    bounding_box = member[4]

    return (
        root.find('filename').text,
        int(size[0].text),
        int(size[1].text),
        member[0].text,
        int(bounding_box[0].text),
        int(bounding_box[1].text),
        int(bounding_box[2].text),
        int(bounding_box[3].text),
    )

In [3]:
# Dataset root: the 'dataset' folder next to the current working directory.
BASE_PATH = os.path.join(os.getcwd(), '..', 'dataset')
DATASET_PATH = os.path.join(BASE_PATH, 'mixed-wp-ann')

# Sub-folders of DATASET_PATH used by the cells below.
(
    IMAGES_DATASET_PATH,
    TRAIN_DATASET_PATH,
    TEST_DATASET_PATH,
    ANNOTATION_DATASET_PATH,
) = (
    os.path.join(DATASET_PATH, folder)
    for folder in ('images', 'train', 'test', 'annotations')
)

In [4]:
# Split the dataset into training, validation, and test sets with ratios 0.8, 0, and 0.2.
# Uncomment the lines below if your data is not yet arranged in the expected folders.

# import splitfolders
# splitfolders.ratio(IMAGES_DATASET_PATH, output = DATASET_PATH, seed = 1092, ratio = (0.8, 0, 0.2), group_prefix = None)

In [4]:
# Column names of the resulting data frames (same order as the rows from xmlToCsv)
column_name = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']


def collect_annotation_rows(split_path, annotation_root):
    """Collect one annotation row per image found under a dataset split.

    ``split_path`` is expected to contain one sub-folder per label, each
    holding image files. For every image, the annotation file
    ``<annotation_root>/<label>/<image name without extension>.xml`` is
    parsed with ``xmlToCsv``.

    Parameters
    ----------
    split_path : str
        Folder of the split to scan (e.g. the train or test folder).
    annotation_root : str
        Folder holding the per-label annotation sub-folders.

    Returns
    -------
    list
        The values returned by ``xmlToCsv``, ordered by label folder name and
        then by image file name.
    """
    rows = []
    # os.listdir returns entries in arbitrary order; sort so the resulting
    # rows do not depend on the filesystem.
    for label_dir in sorted(os.listdir(split_path)):
        images_path = os.path.join(split_path, label_dir)
        # Ignore stray files sitting next to the label folders.
        if not os.path.isdir(images_path):
            continue
        for image in sorted(os.listdir(images_path)):
            file_stem, _ = os.path.splitext(image)
            annotation_path = os.path.join(annotation_root, label_dir, file_stem + '.xml')
            # Check whether the annotation for this image exists; report and
            # skip the image otherwise instead of aborting the whole run.
            if not os.path.isfile(annotation_path):
                print('Skipping {}: annotation not found at {}'.format(image, annotation_path))
                continue
            rows.append(xmlToCsv(annotation_path))
    return rows


# Gather the rows of the train and test splits
train_arr = collect_annotation_rows(TRAIN_DATASET_PATH, ANNOTATION_DATASET_PATH)
test_arr = collect_annotation_rows(TEST_DATASET_PATH, ANNOTATION_DATASET_PATH)

# Convert the row lists to data frames
train_df = pd.DataFrame(train_arr, columns = column_name)
test_df = pd.DataFrame(test_arr, columns = column_name)

# Check df contents
print(train_df.head())
print(test_df.head())

                 filename  width  height class  xmin  ymin  xmax  ymax
0  A-00c67a42-d13f-74.JPG    640     425     A   257   160   385   221
1  A-02bc10f7-c8f3-19.jpg    640     426     A   113   206   335   321
2  A-0434e24e-18dc-20.jpg    480     640     A    91   380   356   559
3  A-04438dd5-8621-85.jpg    640     480     A   206   193   369   284
4  A-080f0769-39da-72.JPG    640     425     A   255   145   388   216
                 filename  width  height class  xmin  ymin  xmax  ymax
0  A-0b08af8e-e546-23.jpg    640     480     A   202   182   396   288
1  A-0efe49f8-3676-90.jpg    640     640     A    69   264   592   504
2  A-299d15ac-1200-55.jpg    640     426     A   402    71   640   212
3  A-2c1d7db2-b111-54.jpg    640     480     A   157   161   466   336
4  A-57eb7b69-da4c-59.jpg    640     480     A   205   237   382   334


In [6]:
# df to CSV
# The '.csv' extension must be part of the file name: without it the target
# path would be the existing 'train' / 'test' folder and the write fails
# with a Permission Denied error (Errno 13).

# index = False leaves the row index out of the files.
train_df.to_csv(os.path.join(DATASET_PATH, 'train.csv'), index = False)
test_df.to_csv(os.path.join(DATASET_PATH, 'test.csv'), index = False)