# train_test_split.ipynb

Create training and test datasets from the raw CCMT dataset, skipping images that cannot be loaded.

Author: Connacher Murphy

In [1]:
# Libraries
import numpy as np
import os
from PIL import Image
import shutil

In [2]:
# Filepaths: both are written relative to "~" and expanded to the user's home directory
path_raw, path_proc = map(
    os.path.expanduser,
    (
        "~/data/ccmt/Raw Data/CCMT Dataset",  # raw data (read from)
        "~/data/ccmt_proc",  # destination (written to)
    ),
)

In [3]:
# Crops to process
crops = ["Cashew", "Cassava", "Maize", "Tomato"]

# Train/test split: threshold compared against a uniform draw for each image
split = 0.8

# Seed NumPy's global random number generator for reproducibility
np.random.seed(456)

In [4]:
# For every crop: rebuild its output folder from scratch, then walk each class
# folder of the raw data, skip images that fail to load, and copy every valid
# image into either "train" or "test" according to a uniform random draw.
for crop in crops:
    print(f"Sorting images for {crop}")

    crop_raw_dir = os.path.join(path_raw, crop)
    crop_proc_dir = os.path.join(path_proc, crop)

    # Remove output from any previous run so re-running the cell never mixes splits
    if os.path.isdir(crop_proc_dir):
        shutil.rmtree(crop_proc_dir)

    # Create train/ and test/ even if the crop turns out to have no classes
    for subset_name in ("train", "test"):
        os.makedirs(os.path.join(crop_proc_dir, subset_name))

    # A class is a sub-directory of the crop folder; stray files such as
    # ".DS_Store" are ignored instead of being turned into class folders.
    crop_classes_list = sorted(
        entry
        for entry in os.listdir(crop_raw_dir)
        if os.path.isdir(os.path.join(crop_raw_dir, entry))
    )

    for crop_class in crop_classes_list:
        for subset_name in ("train", "test"):
            os.makedirs(os.path.join(crop_proc_dir, subset_name, crop_class))

        for root, dirs, files in os.walk(os.path.join(crop_raw_dir, crop_class)):
            # os.walk yields entries in arbitrary filesystem order. Sorting the
            # sub-directories (in place, so the walk honours it) and the files
            # pairs each random draw with the same file on every run/machine,
            # which is what the seed is meant to guarantee.
            dirs.sort()

            # NOTE: files found in nested sub-folders are copied flat into the
            # class folder, so two files with the same name would overwrite
            # each other.
            for file in sorted(files):
                source_file = os.path.join(root, file)

                # Check if corrupted: fully decode the image. The context manager
                # closes the underlying file handle whether or not decoding works.
                try:
                    with Image.open(source_file) as img:
                        img.load()
                except (IOError, SyntaxError):
                    print(
                        f"File {file} is corrupted and will not be copied to the processed folder"
                    )
                    continue

                # A draw is consumed only for valid images. The copy happens
                # outside the try block so that a failed copy (e.g. disk full)
                # raises instead of being reported as a corrupted image.
                split_draw = np.random.uniform()
                subset = "train" if split_draw < split else "test"
                shutil.copy(
                    source_file,
                    os.path.join(crop_proc_dir, subset, crop_class, file),
                )
    print("\n")

Sorting images for Cashew


Sorting images for Cassava


Sorting images for Maize
File healthy88_.jpg is corrupted and will not be copied to the processed folder
File healthy87_.jpg is corrupted and will not be copied to the processed folder
File healthy18_.jpg is corrupted and will not be copied to the processed folder
File healthy189_.jpg is corrupted and will not be copied to the processed folder
File leaf beetle90_.jpg is corrupted and will not be copied to the processed folder
File leaf beetle691_.jpg is corrupted and will not be copied to the processed folder
File leaf beetle326_.jpg is corrupted and will not be copied to the processed folder
File leaf beetle798_.jpg is corrupted and will not be copied to the processed folder
File leaf beetle690_.jpg is corrupted and will not be copied to the processed folder
File leaf beetle68_.jpg is corrupted and will not be copied to the processed folder
File leaf beetle208_.jpg is corrupted and will not be copied to the processed folder
File