# DATASET ORGANISATION

### In this notebook we divided the images into folders.

#### First of all we created the validation folder, then we created three subdirectories in both the training and the validation folder, one for each of the three classes to classify, which are:
* 0: All the people in the image are wearing a mask
* 1: No person in the image is wearing a mask
* 2: Someone in the image is not wearing a mask

In [None]:
import os

# General Dataset directory (expected to live in the current working directory)
cwd = os.getcwd()
dataset_dir = os.path.join(cwd, 'MaskDataset')


# TRAINING DIRECTORY

training_dir = os.path.join(dataset_dir, 'training')

# SubDirectories for every class, named for simplicity
zero_train_dir = os.path.join(training_dir, "0")
one_train_dir  = os.path.join(training_dir, "1")
two_train_dir  = os.path.join(training_dir, "2")


# VALIDATION DIRECTORY

valid_dir = os.path.join(dataset_dir, 'validation')

# SubDirectories for every class, named for simplicity
zero_valid_dir = os.path.join(valid_dir, "0")
one_valid_dir  = os.path.join(valid_dir, "1")
two_valid_dir  = os.path.join(valid_dir, "2")


# Create every class folder, together with any missing parent folder.
# exist_ok=True makes this cell safe to re-run. The default mode is used on
# purpose: a directory needs the execute (search) bit to be entered or to have
# files moved into it, so a mode such as 0o666 would make the folders unusable
# on POSIX systems.
for class_dir in (zero_train_dir, one_train_dir, two_train_dir,
                  zero_valid_dir, one_valid_dir, two_valid_dir):
    os.makedirs(class_dir, exist_ok=True)

#### Here we read every label from the JSON file and moved the corresponding image to the right class folder, dispatching 15% of them to the validation folder, which is used to compute validation metrics when training our models (we could have used the `validation_split` parameter instead). Obviously we didn't touch the test folder.

In [None]:
import itertools
import json

# 15% of images should go to validation directory, that is 15 over 100, same as 3 over 20.
# itertools.cycle yields 1, 2, ..., 20, 1, 2, ... forever: positions 1-17 stay in
# training and positions 18-20 go to validation.
# NOTE: the counter is shared by all classes, so the split of each single class is
# only approximately 85/15.
range20 = itertools.cycle(range(1, 21, 1))

# Read the json file with all the image-label mapping (file name -> class label)
with open(os.path.join(dataset_dir, 'train_gt.json'), "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

# Destination folders of every class, as (training, validation) pairs
class_dirs = {
    0: (zero_train_dir, zero_valid_dir),
    1: (one_train_dir, one_valid_dir),
    2: (two_train_dir, two_valid_dir),
}

# Images listed in the json file that could not be moved
skipped = []

# Cycle over json "dictionary" to partition the correct images to the right subdirectories
for key, value in data.items():

    # Advance the counter for every entry, even the ones skipped below, so that
    # the resulting split only depends on the order of the json file.
    count = next(range20)

    # Every label other than 0 and 1 is treated as class 2
    if value == 0:
        label = 0
    elif value == 1:
        label = 1
    else:
        label = 2

    train_target, valid_target = class_dirs[label]
    target_dir = train_target if count <= 17 else valid_target

    try:
        os.replace(os.path.join(training_dir, key), os.path.join(target_dir, key))
    except FileNotFoundError:
        # Best effort: the image may already have been moved by a previous run
        # of this cell. Keep track of it instead of hiding the problem.
        skipped.append(key)

if skipped:
    print("{} of {} images were not found in {} and were skipped "
          "(already moved by a previous run?)".format(len(skipped), len(data), training_dir))

### Now the directories of the dataset should look like:


* training \ 0  : 1615 images
* training \ 1  : 1615 images
* training \ 2  : 1544 images


* validation \ 0  : 285 images
* validation \ 1  : 282 images
* validation \ 2  : 273 images


* test : 450 images