In [1]:
from pathlib import Path

# Data Loading

We load a list of annotated files (xml) and a list of original images.
The stems of the filenames should match up, which can be done using `set` intersections.


In [2]:
# Load a list of annotated files
annotation_path = Path('data/bboxes/manual/bam-stratified-sample1000/agreement/1.0/')
annotation_files = list(annotation_path.glob("*.xml"))
annotation_stems = set(map(lambda p: p.stem, annotation_files))

# Load a list of images
img_path = Path('data/images/bam-stratified-sample1000/')
img_files = list(img_path.glob("**/*.jpg"))
img_stems = set(map(lambda p: p.stem, img_files))

stems = annotation_stems & img_stems

# Divide the data
We are dividing the data into 3 sets: training (60%), validation (10%), and testing (30%).
To do this, we use Python's `random` functions to split our list of stems and them sort them into folders.


In [3]:
import random

# Set the seed so this is reproducible.
random.seed(42)

n = len(stems)

# Set train as a random sample of 3/5 of the stems
train = random.sample(stems, n * 3 // 5)
remaining = stems - set(train)

# Set validation as 10% of the dataset
val = random.sample(remaining, n // 10)

test = list(remaining - set(val))

# Gather and save the files

Files are stored in temporary directory and then added to a tarball and saved in the working dir.

In [4]:
import tempfile
import tarfile
from shutil import copyfile

with tempfile.TemporaryDirectory() as tmp_dir:
    # Set up directories
    p = Path(tmp_dir) / 'ArtNet'
    p.mkdir()
    for d in ['Annotations', 'ImageSets', 'ImageSets/Main', 'JPEGImages']:
        (p / d).mkdir(exist_ok=True)

    for split, stem_list in [('test', test), ('train', train), ('val', val)]:
        for stem in stem_list:
            # Save annotation
            annotation = list(filter(lambda f: f.stem == stem, annotation_files))[0]
            copyfile(annotation, p / 'Annotations' / annotation.name)

            # Save Image
            img = list(filter(lambda f: f.stem == stem, img_files))[0]
            copyfile(img, p / 'JPEGImages' / img.name)

            # Add to txt file
            txt_file = p / 'ImageSets' / 'Main' / '{}.txt'.format(split)
            with open(txt_file, 'a') as txt:
                txt.write(stem)

            if split in ['train', 'val']:
                with open(p / 'ImageSets' / 'Main' / 'trainval.txt', 'a') as txt:
                    txt.write(stem)

    print('Creating tarball')
    # Create a tarball with the results
    with tarfile.open('ArtNet.tar.xz', 'w:xz') as tar:
        for f in filter(lambda f: f.name[0] != '.', p.parent.glob('**/*')):
            print('Adding {} to tarball...'.format(f.relative_to(p.parent)))
            tar.add(f, arcname=f.relative_to(p.parent))
    print('Tarball complete')

    print('Training: {}, Validation: {}, Testing: {}'.format(len(train), len(val), len(test)))

Creating tarball
Adding ArtNet to tarball...
Adding ArtNet/ImageSets to tarball...
Adding ArtNet/Annotations to tarball...
Adding ArtNet/JPEGImages to tarball...
Adding ArtNet/ImageSets/Main to tarball...
Adding ArtNet/ImageSets/Main/train.txt to tarball...
Adding ArtNet/ImageSets/Main/trainval.txt to tarball...
Adding ArtNet/ImageSets/Main/test.txt to tarball...
Adding ArtNet/ImageSets/Main/val.txt to tarball...
Adding ArtNet/Annotations/1180004204_object.xml to tarball...
Adding ArtNet/Annotations/1180072452_object.xml to tarball...
Adding ArtNet/Annotations/1180027909_object.xml to tarball...
Adding ArtNet/Annotations/1180078462_object.xml to tarball...
Adding ArtNet/Annotations/1180013942_object.xml to tarball...
Adding ArtNet/Annotations/1180029702_object.xml to tarball...
Adding ArtNet/Annotations/1180029174_object.xml to tarball...
Adding ArtNet/Annotations/1180016346_object.xml to tarball...
Adding ArtNet/Annotations/1180053456_object.xml to tarball...
Adding ArtNet/Annotations