# Convert PASCAL VOC to gluon

Use Gluon's built in tool for converting xml PASCAL VOC formated bounding box files to the format that Gluon likes.

In [1]:
import sagemaker
import boto3
import glob
import os
import shutil 
from botocore.exceptions import ClientError
from gluoncv.data import VOCDetection
from bs4 import BeautifulSoup

Define a class that takes inherets preprogrammed VOC properties and converts.

In [2]:
class VOCLike(VOCDetection):
    CLASSES = ['orange_roughy']
    #CLASSES = ['person','dog']
    def __init__(self, root, splits, transform=None, index_map=None, preload_label=True):
        super(VOCLike, self).__init__(root, splits, transform, index_map, preload_label)

Define all the S3 resources and builds a dummy nested directory structure to satisfy the Gluon VOC dataloader requirements.

In [10]:
# get the xml files for conversion
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
s3 = boto3.resource('s3')

# the bucket name and roots for images, xml files
bucket = s3.Bucket('orange-roughy')
xml_ptf = 'ML_data/OP19/port/xml/'
img_ptf = 'ML_data/OP19/port/images/'

# save the file in the specified bucket to a list
obj_list = []
for obj in bucket.objects.filter(Delimiter='/', Prefix=xml_ptf):
  obj_list.append(obj.key)

print(len(obj_list), 'files in dir', os.path.split(xml_ptf)[0])

# path for the dummy file structure to satisfy Gluon format requirements
dummy = os.path.join(os.getcwd(), 'VOCdummy', 'VOC2018')

# remove images and annotations from previous runs
if os.listdir(os.path.join(dummy, 'Annotations')):
    shutil.rmtree(os.path.join(dummy, 'Annotations'))
    os.mkdir(os.path.join(dummy, 'Annotations'))
        
if os.listdir(os.path.join(dummy, 'JPEGImages')):
    shutil.rmtree(os.path.join(dummy, 'JPEGImages'))
    os.mkdir(os.path.join(dummy, 'JPEGImages'))
        
if os.path.exists(dummy+'/ImageSets/Main/train.txt'):
    os.remove(dummy+'/ImageSets/Main/train.txt')

img_ids = []
# save images to a temporary local directory
for obj in obj_list:
    tmp_base = os.path.splitext(os.path.basename(obj))[0]
    tmp = tmp_base.split('.')[0]+'-'+tmp_base.split('.')[1]
    xml_path = os.path.basename(obj)
    img_path = os.path.splitext(xml_path)[0]+'.JPG'
    #img_ids.append(os.path.splitext(xml_path)[0])
    
    # download image
    try:
        img_remote = os.path.join(img_ptf, img_path)
        img_local = os.path.join(dummy, 'JPEGImages', tmp+'.jpg')
        bucket.download_file(img_remote, img_local)
    except ClientError:
        continue
    
    # download xml
    xml_local = os.path.join(dummy, 'Annotations', tmp+'.xml')
    bucket.download_file(obj, xml_local)
    
    # add to the image list
    img_ids.append(tmp)
    
with open(dummy+'/ImageSets/Main/train.txt', 'w') as ff:
    for line in img_ids:
        ff.write(line +'\n')
    ff.close

205 files in dir ML_data/OP19/port/xml


Now create a dataset object from the inherited VOC class object.

In [25]:
temp = []

xml_files = glob.glob(os.path.join('VOCdummy/VOC2018/Annotations/','*.xml'))

for xfile in xml_files:
    with open(xfile,'r') as ff:
        soup = BeautifulSoup(ff)
        # print(os.path.basename(xfile), len(soup.find_all('name')))
        for item in soup.find_all('name'):
            if item.string not in temp:
                temp.append(item.string)
        ff.close()
print(temp)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


20190715-015107-252.xml 16
20190715-014933-281.xml 11
20190715-015215-274.xml 24
20190715-015139-276.xml 16
20190715-014727-284.xml 10
20190715-015109-277.xml 13
20190715-014949-255.xml 13
20190715-014855-258.xml 6
20190715-014947-230.xml 9
20190715-014631-262.xml 3
20190715-014845-285.xml 9
20190715-015245-274.xml 3
20190715-014827-284.xml 6
20190715-015159-225.xml 16
20190715-014703-285.xml 7
20190715-015141-226.xml 11
20190715-015007-255.xml 17
20190715-014829-234.xml 8
20190715-015123-227.xml 9
20190715-015323-222.xml 4
20190715-015115-278.xml 3
20190715-015001-256.xml 29
20190715-014859-233.xml 8
20190715-015229-223.xml 4
20190715-015129-228.xml 15
20190715-014857-283.xml 6
20190715-015205-225.xml 13
20190715-014835-234.xml 6
20190715-014623-237.xml 1
20190715-015259-224.xml 1
20190715-014637-262.xml 3
20190715-014657-285.xml 6
20190715-015021-280.xml 24
20190715-015125-252.xml 14
20190715-014627-287.xml 2
20190715-015239-274.xml 1
20190715-015113-252.xml 7
20190715-015313-250.xml