import random
from os import listdir, makedirs, path
from shutil import copy, rmtree
from tqdm import tqdm
from glob import glob
from PIL import Image

In [5]:
# --- Dataset folder names ---------------------------------------------------
WEBCAM_DIR = "webcam_dataset"
SAMPLED_DATASET = "resized-sampled-dataset"
OUTPUT_DIRECTORY = "dataset"  # train/ and val/ are created underneath

# --- Image geometry ---------------------------------------------------------
RESIZED_SHAPE = (64, 64)  # size tuple handed to Image.resize

# --- Split / sampling settings ----------------------------------------------
SEED = 27  # fed to random.seed() before every shuffle
TRAIN_RATIO = 0.8  # share of each class sent to train/; range from 0 to 1
NUM_SAMPLE = -1  # -1 for all samples, > 0 to cap the images kept per class

In [6]:
# Start from a clean slate: delete any existing train/ and val/ trees under
# OUTPUT_DIRECTORY, then recreate both as empty directories.
split_dirs = [path.join(OUTPUT_DIRECTORY, split) for split in ('train', 'val')]

# Removal pass first, creation pass second (same order as before).
for split_dir in split_dirs:
    if path.exists(split_dir):
        rmtree(split_dir)

for split_dir in split_dirs:
    makedirs(split_dir)

## Processing Webcam Dataset

In [8]:
# Resize every image under WEBCAM_DIR/<class>/ to RESIZED_SHAPE and write it
# as a JPEG into OUTPUT_DIRECTORY/train/<class>/ or OUTPUT_DIRECTORY/val/<class>/.
#
# Only sub-directories are treated as classes, so a stray file sitting in
# WEBCAM_DIR does not end up as an empty class folder in the output.
classes = [
    entry for entry in listdir(WEBCAM_DIR)
    if path.isdir(path.join(WEBCAM_DIR, entry))
]
for _class in classes:
    print('Processing: {}'.format(_class))
    # glob returns files in filesystem order; sorting first makes the seeded
    # shuffle below produce the same split on every machine.
    images = sorted(glob(path.join(WEBCAM_DIR, _class, '*')))

    train_dir = path.join(OUTPUT_DIRECTORY, 'train', _class)
    val_dir = path.join(OUTPUT_DIRECTORY, 'val', _class)
    makedirs(train_dir, exist_ok=True)
    makedirs(val_dir, exist_ok=True)

    # randomize (re-seeded for every class)
    random.seed(SEED)
    random.shuffle(images)
    if NUM_SAMPLE > 0:
        images = images[:NUM_SAMPLE]

    # The first `positive_sample` shuffled images go to train, the rest to
    # val. Comparing the index avoids slicing and scanning the list for
    # every image.
    positive_sample = int(TRAIN_RATIO * len(images))
    for index, img in enumerate(tqdm(images)):
        target_dir = train_dir if index < positive_sample else val_dir
        file_name = path.basename(img)
        # The context manager closes the source file once it has been resized.
        with Image.open(img) as im_pil:
            # LANCZOS is the filter ANTIALIAS was an alias for; the ANTIALIAS
            # name was removed in Pillow 10.
            imResize = im_pil.resize(RESIZED_SHAPE, Image.LANCZOS)
        imResize.save(path.join(target_dir, file_name), 'JPEG', quality=90)

Processing: no_mask


100%|████████████████████████████████████████| 908/908 [00:01<00:00, 740.79it/s]


Processing: mask


100%|████████████████████████████████████████| 904/904 [00:01<00:00, 778.76it/s]


Processing: incorrect_mask


100%|████████████████████████████████████████| 930/930 [00:01<00:00, 742.09it/s]


## Processing Public Dataset

In [None]:
# Resize every image under SAMPLED_DATASET/<class>/ to RESIZED_SHAPE and write
# it as a JPEG into OUTPUT_DIRECTORY/train/<class>/ or
# OUTPUT_DIRECTORY/val/<class>/.
#
# NOTE(review): this cell used to read from WEBCAM_DIR, which only repeated
# the webcam pass; it now reads from SAMPLED_DATASET, presumably the public
# dataset this section is about -- confirm the folder layout matches
# <SAMPLED_DATASET>/<class>/<image>.
for _class in classes:
    print('Processing: {}'.format(_class))
    # glob returns files in filesystem order; sorting first makes the seeded
    # shuffle below produce the same split on every machine.
    images = sorted(glob(path.join(SAMPLED_DATASET, _class, '*')))

    train_dir = path.join(OUTPUT_DIRECTORY, 'train', _class)
    val_dir = path.join(OUTPUT_DIRECTORY, 'val', _class)
    makedirs(train_dir, exist_ok=True)
    makedirs(val_dir, exist_ok=True)

    # randomize (re-seeded for every class)
    random.seed(SEED)
    random.shuffle(images)
    if NUM_SAMPLE > 0:
        images = images[:NUM_SAMPLE]

    # The first `positive_sample` shuffled images go to train, the rest to
    # val. Comparing the index avoids slicing and scanning the list for
    # every image.
    positive_sample = int(TRAIN_RATIO * len(images))
    for index, img in enumerate(tqdm(images)):
        target_dir = train_dir if index < positive_sample else val_dir
        file_name = path.basename(img)
        # The context manager closes the source file once it has been resized.
        with Image.open(img) as im_pil:
            # LANCZOS is the filter ANTIALIAS was an alias for; the ANTIALIAS
            # name was removed in Pillow 10.
            imResize = im_pil.resize(RESIZED_SHAPE, Image.LANCZOS)
        imResize.save(path.join(target_dir, file_name), 'JPEG', quality=90)