In [1]:
import random
from os import listdir, makedirs, path
from shutil import copy, rmtree
from tqdm import tqdm
from glob import glob
from PIL import Image

In [2]:
# --- Input / output locations ---
DATASET_DIRECTORY = "dataset"
OUTPUT_DIRECTORY = "resized-sampled-dataset"

# --- Sampling and split settings ---
# Fraction of each class written to the 'train' folder (0 to 1); the rest goes to 'val'.
TRAIN_RATIO = 0.8
# Seed applied to `random` before each class is shuffled.
SEED = 27
# Per-class cap on the number of images: -1 keeps all samples, > 0 keeps that many.
NUM_SAMPLE = 10000

# --- Image settings ---
# Size tuple handed to PIL's Image.resize for every image.
RESIZED_SHAPE = (256, 256)

In [3]:
# Start from empty 'train' and 'val' folders: wipe whatever a previous run left
# behind, then recreate both directories.
split_directories = [path.join(OUTPUT_DIRECTORY, split) for split in ('train', 'val')]

for split_directory in split_directories:
    if path.exists(split_directory):
        rmtree(split_directory)

for split_directory in split_directories:
    makedirs(split_directory)

In [4]:
# Resize every image of each class and split the results into train/val folders.
#
# Expected layout (from the glob pattern below):
#   DATASET_DIRECTORY/<class>/<sub-folder>/<image file>
# Output layout:
#   OUTPUT_DIRECTORY/{train,val}/<class>/<image file name>
#
# NOTE(review): output files are named after the input's base name only, so two
# inputs with the same name in different sub-folders overwrite each other, and
# non-JPEG inputs keep their original extension although JPEG data is written.

# Only directories are classes; stray files next to them (e.g. .DS_Store) are ignored.
classes = sorted(
    entry for entry in listdir(DATASET_DIRECTORY)
    if path.isdir(path.join(DATASET_DIRECTORY, entry))
)

# Pixel modes that can be saved as JPEG without conversion; anything else
# (RGBA, P, LA, ...) is converted to RGB first.
jpeg_modes = ('L', 'RGB', 'CMYK')

for _class in classes:
    print('Processing: {}'.format(_class))

    # glob returns paths in filesystem-dependent order; sorting first makes the
    # seeded shuffle below select the same images on every machine.
    images = sorted(glob(path.join(DATASET_DIRECTORY, _class, '*', '*')))

    train_dir = path.join(OUTPUT_DIRECTORY, 'train', _class)
    val_dir = path.join(OUTPUT_DIRECTORY, 'val', _class)
    makedirs(train_dir, exist_ok=True)
    makedirs(val_dir, exist_ok=True)

    # Randomize; re-seeding per class keeps each class's shuffle independent
    # of how many classes were processed before it.
    random.seed(SEED)
    random.shuffle(images)
    if NUM_SAMPLE > 0:
        images = images[:NUM_SAMPLE]

    # The first `positive_sample` shuffled images go to train, the rest to val.
    positive_sample = int(TRAIN_RATIO * len(images))
    corrupt_files = []
    for index, img in enumerate(tqdm(images)):
        try:
            # The context manager closes the underlying file handle promptly.
            with Image.open(img) as im_pil:
                if im_pil.mode in jpeg_modes:
                    jpeg_ready = im_pil
                else:
                    jpeg_ready = im_pil.convert('RGB')
                # Image.LANCZOS is the same filter as the removed Image.ANTIALIAS alias.
                imResize = jpeg_ready.resize(RESIZED_SHAPE, Image.LANCZOS)
        except OSError:
            # Unreadable input (corrupt/truncated image, not an image, a directory):
            # record it and keep going instead of aborting the whole run.
            corrupt_files.append(img)
            continue

        file_name = path.basename(img)
        # Comparing the position is O(1); the previous membership test re-sliced
        # and scanned the list for every image.
        target_dir = train_dir if index < positive_sample else val_dir
        imResize.save(path.join(target_dir, file_name), 'JPEG', quality=90)

    if corrupt_files:
        print('Skipped {} unreadable file(s) in {}'.format(len(corrupt_files), _class))

Processing: incorrect_mask


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [06:55<00:00, 24.06it/s]


Processing: mask


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [06:51<00:00, 24.31it/s]


Processing: no_mask


100%|█████████████████████████████████████████████████████████████████████████| 10000/10000 [13:58<00:00, 11.93it/s]
