In [1]:
import sagemaker
import boto3
import glob
import os
import shutil
import numpy as np
from botocore.exceptions import ClientError
from gluoncv.data import VOCDetection
from bs4 import BeautifulSoup

Define all the S3 resources and build a dummy nested directory structure to satisfy the Gluon VOC dataloader requirements. This downloads data from Ben's S3 bucket.

In [2]:
"""
# get the xml files for conversion
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
s3 = boto3.resource('s3')

# the bucket name and roots for images, xml files
bucket = s3.Bucket('orange-roughy')
xml_ptf = 'ML_data/OP19/starboard/xml/'
img_ptf = 'ML_data/OP19/starboard/images/'

# save the file in the specified bucket to a list
obj_list = []
for obj in bucket.objects.filter(Delimiter='/', Prefix=xml_ptf):
  obj_list.append(obj.key)

print(len(obj_list), 'files in dir', os.path.split(xml_ptf)[0])


# path for the dummy file structure to satisfy Gluon format requirements
dummy = os.path.join(os.getcwd(), 'test-imgs', 'VOC2018')

# remove images and annotations from previous runs
if os.listdir(os.path.join(dummy, 'Annotations')):
    shutil.rmtree(os.path.join(dummy, 'Annotations'))
    os.mkdir(os.path.join(dummy, 'Annotations'))
        
if os.listdir(os.path.join(dummy, 'JPEGImages')):
    shutil.rmtree(os.path.join(dummy, 'JPEGImages'))
    os.mkdir(os.path.join(dummy, 'JPEGImages'))
        
if os.path.exists(dummy+'/ImageSets/Main/train.txt'):
    os.remove(dummy+'/ImageSets/Main/train.txt')

img_ids = []
# save images to a temporary local directory
for obj in obj_list:
    tmp_base = os.path.splitext(os.path.basename(obj))[0]
    tmp = tmp_base.split('.')[0]+'-'+tmp_base.split('.')[1]
    xml_path = os.path.basename(obj)
    img_path = os.path.splitext(xml_path)[0]+'.JPG'
    #img_ids.append(os.path.splitext(xml_path)[0])
    
    # download image
    try:
        img_remote = os.path.join(img_ptf, img_path)
        img_local = os.path.join(dummy, 'JPEGImages', tmp+'.jpg')
        bucket.download_file(img_remote, img_local)
    except ClientError:
        continue
    
    # download xml
    xml_local = os.path.join(dummy, 'Annotations', tmp+'.xml')
    bucket.download_file(obj, xml_local)
    
    # add to the image list
    img_ids.append(tmp)
    
with open(dummy+'/ImageSets/Main/train.txt', 'w') as ff:
    for line in img_ids:
        ff.write(line +'\n')
    ff.close
"""

205 files in dir ML_data/OP19/starboard/xml


This downloads data from my S3 bucket, which holds the images after DCP preprocessing in MATLAB.

In [6]:
# Download the annotation (xml) files and their matching DCP-preprocessed
# images from S3 into a local VOC-style directory tree that satisfies the
# Gluon format requirements:
#   <cwd>/<fold>/<rn>/Annotations/<id>.xml
#   <cwd>/<fold>/<rn>/JPEGImages/<id>.jpg
#   <cwd>/<fold>/<rn>/ImageSets/Main/train.txt   (one <id> per line)

# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
s3 = boto3.resource('s3')

# what to download
rn = 'OP19'
side = 'port'

# where to put it
fold = 'VOCport'

# the bucket name and roots for images, xml files
bucket = s3.Bucket('csiro-aos-processed-imgs')
xml_ptf = f'{rn}/{side}/xml/'
img_ptf = f'{rn}/{side}/dcp_images/'

# collect every key found directly under the xml prefix
obj_list = [obj.key for obj in bucket.objects.filter(Delimiter='/', Prefix=xml_ptf)]

print(len(obj_list), 'files in dir', os.path.split(xml_ptf)[0])

# path for the dummy file structure to satisfy Gluon format requirements
# (for test data use: os.path.join(os.getcwd(), 'test-imgs', rn))
dummy = os.path.join(os.getcwd(), fold, rn)

ann_dir = os.path.join(dummy, 'Annotations')
img_dir = os.path.join(dummy, 'JPEGImages')
sets_dir = os.path.join(dummy, 'ImageSets', 'Main')
train_txt = os.path.join(sets_dir, 'train.txt')

# remove images and annotations from previous runs, then (re)create the
# directories. makedirs also creates the missing parents (<fold>, <rn>),
# so this works on a completely fresh checkout.
for stale_dir in (ann_dir, img_dir):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)
    os.makedirs(stale_dir)

# exist_ok avoids the failure when ImageSets exists but Main does not
os.makedirs(sets_dir, exist_ok=True)

if os.path.exists(train_txt):
    os.remove(train_txt)

img_ids = []
# save images and annotations to the local directory tree
for obj in obj_list:
    xml_path = os.path.basename(obj)
    tmp_base = os.path.splitext(xml_path)[0]

    # The local id is "<first>-<second>" of the dot-separated base name.
    # Keys without a dot in the base name (e.g. a key equal to the prefix
    # itself, whose base name is empty) cannot be converted, so skip them
    # instead of crashing with an IndexError.
    name_parts = tmp_base.split('.')
    if len(name_parts) < 2:
        print('skipping unexpected key:', obj)
        continue
    tmp = name_parts[0] + '-' + name_parts[1]

    # download image; S3 keys always use '/', so build the key by plain
    # concatenation (img_ptf already ends with '/') rather than os.path.join
    img_remote = img_ptf + tmp_base + '.jpg'
    img_local = os.path.join(img_dir, tmp + '.jpg')
    try:
        bucket.download_file(img_remote, img_local)
    except ClientError:
        # no image for this annotation: leave it out of the dataset
        continue

    # download xml
    xml_local = os.path.join(ann_dir, tmp + '.xml')
    bucket.download_file(obj, xml_local)

    # add to the image list only once both files are present locally
    img_ids.append(tmp)

# write the id list; the with-block closes the file
with open(train_txt, 'w') as ff:
    ff.writelines(img_id + '\n' for img_id in img_ids)


205 files in dir OP19/port/xml


Split the training list into train and val sets if needed. And yes, it is needed.

In [7]:
# Split the downloaded image ids into ImageSets/Main/train.txt and val.txt.
#
# The cell is safe to re-run: it pools the ids from train.txt and (if it
# already exists) val.txt before splitting. Without that, every re-run would
# split the already-reduced train.txt again and shrink the training set.
# The shuffle is seeded so the same set of ids always gives the same split.

#rn = 'OP16'
dummy = os.path.join(os.getcwd(), f'{fold}', f'{rn}')

SPLIT_SEED = 0          # change to draw a different (but still repeatable) split
TRAIN_FRACTION = 0.8    # share of the ids that goes to train.txt

sets_dir = os.path.join(dummy, 'ImageSets', 'Main')
train_txt = os.path.join(sets_dir, 'train.txt')
val_txt = os.path.join(sets_dir, 'val.txt')

# get the list (train.txt must exist; val.txt only exists after a first split)
pooled_ids = set()
for list_path in (train_txt, val_txt):
    if list_path == train_txt or os.path.exists(list_path):
        with open(list_path, 'r') as ff:
            pooled_ids.update(line.strip() for line in ff if line.strip())

# Drop ids whose annotation file is no longer on disk, so a val.txt left over
# from an earlier download cannot re-introduce ids that were since removed.
# Sorting makes the shuffle input independent of file order.
temp = sorted(
    img_id for img_id in pooled_ids
    if os.path.exists(os.path.join(dummy, 'Annotations', img_id + '.xml'))
)

# keep the entries newline-terminated, as they were when read straight from
# the file, so `train` / `val` have the same shape as before
temp = [img_id + '\n' for img_id in temp]

np.random.RandomState(SPLIT_SEED).shuffle(temp)
n_train = int(np.floor(TRAIN_FRACTION * len(temp)))
train = temp[:n_train]
val = temp[n_train:]

print(len(train))
print(len(val))

# mode 'w' truncates, so no explicit os.remove is needed
with open(train_txt, 'w') as ff:
    ff.writelines(train)
with open(val_txt, 'w') as ff:
    ff.writelines(val)

164
41
