# This file takes a dataset folder and creates a new folder, that is the same of the original one, but with less images for each class!

In [2]:
import os
import shutil
import glob
import random
import copy

In [3]:
#Mounting Google Drive data
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
#@title My function to compute and print number of images per class
def printNumImagesPerClass(datasetPath, subfolders):
    nImagesPerClass = {key: None for key in subfolders}

    class_names = []
    for root, dirs, files in os.walk(os.path.join(datasetPath,subfolders[0])):
        for dirname in dirs:
            class_names.append(dirname)

    nImages = {key: None for key in class_names}

    print("Number of images per class")
    for subf in subfolders:
        for root, dirs, files in os.walk(os.path.join(datasetPath,subf)):
            for dirname in dirs:
                path = os.path.join(root,dirname)
                nImages[dirname] = len(os.listdir(path))
            nImagesPerClass[subf] = copy.deepcopy(nImages)
        print(subf,": ",nImagesPerClass[subf])
    
    return nImagesPerClass

# Tune paths

In [6]:
################################################################################
# TUNE HERE PATHS

# Root directory of interest
gdrivePath = F"gdrive/MyDrive/ML_AI"

# Dataset to reduce
inputpath = gdrivePath + "/NatureDatasetReduced"

# Destination folder of reduced dataset
outputpath = gdrivePath + "/NatureDatasetNew"

# Subfolders contained in the dataset folder
subfolders = ["train","test"]

################################################################################

# Print number of images for each class, both in train and test directory
nImagesPerClass = printNumImagesPerClass(inputpath,subfolders)

Number of images per class
train :  {'glacier': 100, 'buildings': 100, 'forest': 100, 'mountain': 100, 'sea': 100, 'street': 100}
test :  {'glacier': 50, 'buildings': 50, 'forest': 50, 'mountain': 50, 'sea': 50, 'street': 50}


# Choose how many images per class, for train and test sets

In [None]:
################################################################################
# Choose the number of random images per class to copy (distinctly for train and
# test set!)

# For the train set
nTrain = 100
# For the test set
nTest = 150
################################################################################

# Check if your choice of number of images is equal or smaller than the actual
# number of images in the classes
if not nTrain <= min(nImagesPerClass["train"].values()):
    print("ERROR: Too many images chosen for the train set")

if not nTest <= min(nImagesPerClass["test"].values()):
    print("ERROR: Too many images chosen for the test set")

nImgs = {key: None for key in subfolders}
nImgs["train"] = nTrain
nImgs["test"] = nTest

print("""You have chosen to load
 - {:d} images for TRAIN set
 - {:d} images for TEST set""".format(nImgs["train"],nImgs["test"]))

You have chosen to load
 - 100 images for TRAIN set
 - 150 images for TEST set


# Function that
1. creates a copy of the "structure" of input folder in
the output path
2. copies for each class folder a subset of images from the source folder

In [None]:
for subf in subfolders:
    for root, dirs, files in os.walk(os.path.join(inputpath,subf)):
        for dirname in dirs:
            src_path = os.path.join(root,dirname)
            dst_path = os.path.join(outputpath,subf,dirname)

            # Create same subfolder, but in the destination path
            if os.path.exists(dst_path):
                shutil.rmtree(dst_path)
            os.makedirs(dst_path)
            
            # Retrieve a subset of random images
            imgPattern = os.path.join(src_path,"*.jpg")
            imgPathsToCopy = glob.glob(imgPattern)
            to_be_copied = random.sample(imgPathsToCopy, nImgs[subf])

            # Copy all images in variable "to_be_copied"
            for f in enumerate(to_be_copied, 1):
                #dest = os.path.join(dst_path, str(f[0]))
                #if not os.path.exists(dest):
                #    os.makedirs(dest)
                shutil.copy(f[1], dst_path)

            print(nImgs[subf]," images have been saved in directory: ", dst_path)


100  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/train/forest
100  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/train/mountain
100  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/train/glacier
100  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/train/buildings
100  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/train/sea
100  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/train/street
150  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/test/buildings
150  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/test/forest
150  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/test/sea
150  images have been saved in directory:  gdrive/MyDrive/ML_AI/NatureDatasetNew/test/glacier
150  images have been saved in directory:  gdrive/MyDrive/ML

In [None]:
# Check number of images in output path
printNumImagesPerClass(outputpath,subfolders);

Number of images per class
train :  {'forest': 100, 'mountain': 100, 'glacier': 100, 'buildings': 100, 'sea': 100, 'street': 100}
test :  {'forest': 150, 'mountain': 150, 'glacier': 150, 'buildings': 150, 'sea': 150, 'street': 150}
