# DATASET ORGANISATION

In this notebook we divide the images into the necessary folders, creating for Validation the same subdirectory structure that Training already has.

#### Function to create directory if it doesn't exist

In [1]:
import shutil
from os import getcwd, listdir, makedirs, mkdir, replace, walk
from os.path import exists, isfile, join, splitext

def create_dir(path, name):
    """Make sure the directory ``path/name`` exists and return its full path.

    Parameters
    ----------
    path : str
        Parent directory.
    name : str
        Name of the directory to create inside ``path``.

    Returns
    -------
    str
        ``join(path, name)``, whether the directory was just created or was
        already there.

    Raises
    ------
    FileExistsError
        If ``path/name`` already exists but is not a directory.
    """
    target = join(path, name)
    # The default mode is used on purpose: a directory created with 0o666 has
    # no execute (search) bit, so on POSIX nothing can be created inside it.
    # exist_ok=True makes the call safe to repeat, and makedirs also creates
    # any missing parent directories.
    makedirs(target, exist_ok=True)
    return target

#### Get all the subfolders of the Training directory, and group them in lists

In [None]:
cwd = getcwd()

# General Dataset directory
dataset_dir = join(cwd, 'Development_Dataset')

# TRAINING

training_dir = join(dataset_dir, 'Training')

# One folder per team, directly under Training
name_subdirs = ['BipBip', 'Pead', 'Roseau', 'Weedelec']

bip_bip_train_dir  = join(training_dir, 'BipBip')
pead_train_dir     = join(training_dir, 'Pead')
roseau_train_dir   = join(training_dir, 'Roseau')
weedelec_train_dir = join(training_dir, 'Weedelec')

training_subdirs = [bip_bip_train_dir, pead_train_dir, roseau_train_dir, weedelec_train_dir]

# team name -> full path of its Training folder
training_dict = dict(zip(name_subdirs, training_subdirs))

# Filled in the same team order as name_subdirs
train_plant_subdirs = []
train_haricot_subdirs = []
train_mais_subdirs = []

for team_name in training_dict:

    # create_dir expects (parent, name). Passing (name, full_path) only worked
    # because join() discards earlier components when a later one is absolute.
    train_plant_dir = create_dir(training_dir, team_name)
    train_plant_subdirs.append(train_plant_dir)

    # Each team folder holds one folder per crop, each with Images and Masks.
    train_haricot_dir = create_dir(train_plant_dir, 'Haricot')
    train_haricot_subdirs.append(train_haricot_dir)
    create_dir(train_haricot_dir, 'Images')
    create_dir(train_haricot_dir, 'Masks')

    train_mais_dir = create_dir(train_plant_dir, 'Mais')
    train_mais_subdirs.append(train_mais_dir)
    create_dir(train_mais_dir, 'Images')
    create_dir(train_mais_dir, 'Masks')

#### Create all the subfolders of the Validation directory, with the same structure as Training

In [None]:
# VALIDATION

validation_dir = create_dir(dataset_dir, 'Validation')

# Same team folders as in Training
name_subdirs = ['BipBip', 'Pead', 'Roseau', 'Weedelec']

bip_bip_valid_dir  = join(validation_dir, 'BipBip')
pead_valid_dir     = join(validation_dir, 'Pead')
roseau_valid_dir   = join(validation_dir, 'Roseau')
weedelec_valid_dir = join(validation_dir, 'Weedelec')

validation_subdirs = [bip_bip_valid_dir, pead_valid_dir, roseau_valid_dir, weedelec_valid_dir]

# team name -> full path of its Validation folder
validation_dict = dict(zip(name_subdirs, validation_subdirs))

# Filled in the same team order as name_subdirs
valid_plant_subdirs = []
valid_haricot_subdirs = []
valid_mais_subdirs = []

for team_name in validation_dict:

    # create_dir expects (parent, name). Passing (name, full_path) only worked
    # because join() discards earlier components when a later one is absolute.
    valid_plant_dir = create_dir(validation_dir, team_name)
    valid_plant_subdirs.append(valid_plant_dir)

    valid_haricot_dir = create_dir(valid_plant_dir, 'Haricot')
    valid_haricot_subdirs.append(valid_haricot_dir)
    valid_images_dir = create_dir(valid_haricot_dir, 'Images')
    valid_masks_dir  = create_dir(valid_haricot_dir, 'Masks')
    print(valid_haricot_dir + "\n" + valid_images_dir + "\n" + valid_masks_dir)

    valid_mais_dir = create_dir(valid_plant_dir, 'Mais')
    valid_mais_subdirs.append(valid_mais_dir)
    valid_images_dir = create_dir(valid_mais_dir, 'Images')
    valid_masks_dir  = create_dir(valid_mais_dir, 'Masks')
    print(valid_mais_dir + "\n" + valid_images_dir + "\n" + valid_masks_dir)
    

#### Group all subdirs merging Haricot and Mais ones, and print them out for visualization purpose

In [None]:
# Haricot folders first, then Mais. The Training and Validation lists are built
# in the same order, so index i in one matches index i in the other.
all_train_subdirs = train_haricot_subdirs + train_mais_subdirs
# The loop variables are not called `dir`, which would shadow the builtin for
# the rest of the kernel session.
for train_subdir in all_train_subdirs:
    print(train_subdir)

all_valid_subdirs = valid_haricot_subdirs + valid_mais_subdirs
for valid_subdir in all_valid_subdirs:
    print(valid_subdir)

#### Move 10% of each Training subfolder to the corresponding Validation subfolder

In [None]:
# Every N-th image (in sorted order) is moved together with its mask,
# i.e. about 10% of each folder when N = 10.
VALIDATION_EVERY_NTH = 10

for train_subdir, valid_subdir in zip(all_train_subdirs, all_valid_subdirs):

    train_images_dir = join(train_subdir, 'Images')
    train_masks_dir  = join(train_subdir, 'Masks')
    valid_images_dir = join(valid_subdir, 'Images')
    valid_masks_dir  = join(valid_subdir, 'Masks')

    print()
    print(train_images_dir)
    print(train_masks_dir)
    print(valid_images_dir)
    print(valid_masks_dir)
    print()

    # Re-running this cell must not move a further 10% out of Training, so a
    # folder whose validation side already holds files is left alone.
    if listdir(valid_images_dir) or listdir(valid_masks_dir):
        print("Skipping, validation folder already populated: " + str(valid_images_dir))
        print("\n")
        continue

    # listdir() returns entries in arbitrary order: sort the images so the
    # selection is reproducible, and look masks up by name, not by position.
    image_filenames = sorted(
        f for f in listdir(train_images_dir) if isfile(join(train_images_dir, f))
    )
    # Assumes a mask has the same base name as its image (extension may
    # differ) -- TODO confirm against the dataset naming.
    masks_by_stem = {
        splitext(f)[0]: f
        for f in listdir(train_masks_dir)
        if isfile(join(train_masks_dir, f))
    }

    moved = 0
    for image_filename in image_filenames[::VALIDATION_EVERY_NTH]:
        mask_filename = masks_by_stem.get(splitext(image_filename)[0])
        if mask_filename is None:
            # Never move an image without its mask: the pair stays in Training.
            print("No mask found for " + image_filename + ", left in Training")
            continue

        shutil.move(join(train_images_dir, image_filename), join(valid_images_dir, image_filename))
        shutil.move(join(train_masks_dir, mask_filename), join(valid_masks_dir, mask_filename))
        moved += 1

    print("Moved " + str(moved) + " image/mask pairs out of " + str(len(image_filenames)) + " images")
    print("Finished with: " + str(train_images_dir))
    print("Finished with: " + str(train_masks_dir))

    print("\n")
    

### Now the directories of the dataset should look like:

* Training
    * BipBip
        * \Haricot
        * \Mais
    * Pead
        * \Haricot
        * \Mais
    * Roseau
        * \Haricot
        * \Mais
    * Weedelec
        * \Haricot
        * \Mais
    
* Validation
    * BipBip
        * \Haricot
        * \Mais
    * Pead
        * \Haricot
        * \Mais
    * Roseau
        * \Haricot
        * \Mais
    * Weedelec
        * \Haricot
        * \Mais
        
        
##### We didn't touch the Test directory.