# PlantVillage dataset preparation

Project exploring the PlantVillage dataset and creating train, test and validation sets for training a classifier for leaf diseases.

Create as many subsets as needed for the training and validation data, with a predefined distribution of each category across the various workers' training sets.

In [1]:
import numpy as np
import os
import shutil
from pathlib import Path
import json
from os import listdir

In [2]:
# TODO: select whether to work with segmented or color images from the PlantVillage dataset.
# The value is the name of the dataset sub-folder and is appended to BASE_DIR to build DATA_DIR.
# NOTE(review): the folder name for the segmented variant is not shown here -- check it against the dataset on disk.
data_type = 'color/'

In [3]:
# Directory layout, relative to BASE_DIR:
#   <data_type>            original dataset, e.g. "color/"
#   processed/             root of the split dataset
#   processed/train0/      one training set per distribution row
#   processed/train1/      (train0, train1, ...)
#   processed/val/         validation set
#   processed/test/        test set
BASE_DIR = '/home/ubuntu/code/PlantVillageData/dataset/'

# Source images, one sub-folder per category.
DATA_DIR = BASE_DIR+data_type
# Held-out sets shared by all training subsets.
TEST_DIR = BASE_DIR+'processed/test/'
VAL_DIR = BASE_DIR+'processed/val/'
# Prefix only: the training set index and a trailing '/' are appended where it is used.
TRAIN_DIR = BASE_DIR+'processed/train'

In [4]:
# List the dataset categories (one sub-folder of DATA_DIR per category).
# The macOS ".DS_Store" metadata entry is filtered out while building the list,
# rather than removed from the list while it is being iterated.
categories = [cat for cat in listdir(DATA_DIR) if cat != ".DS_Store"]
print('Dataset categories: \n{}\n\nNumber of categories: {}'.format(categories, len(categories)))

Dataset categories: 
['Soybean___healthy', 'Tomato___Tomato_mosaic_virus', 'Cherry_(including_sour)___healthy', 'Strawberry___Leaf_scorch', 'Tomato___Spider_mites Two-spotted_spider_mite', 'Pepper,_bell___Bacterial_spot', 'Tomato___Bacterial_spot', 'Apple___Apple_scab', 'Potato___Early_blight', 'Tomato___Late_blight', 'Raspberry___healthy', 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot', 'Corn_(maize)___Northern_Leaf_Blight', 'Corn_(maize)___Common_rust_', 'Grape___Esca_(Black_Measles)', 'Cherry_(including_sour)___Powdery_mildew', 'Apple___Black_rot', 'Corn_(maize)___healthy', 'Tomato___Target_Spot', 'Orange___Haunglongbing_(Citrus_greening)', 'Tomato___Tomato_Yellow_Leaf_Curl_Virus', 'Squash___Powdery_mildew', 'Apple___Cedar_apple_rust', 'Grape___healthy', 'Apple___healthy', 'Tomato___Leaf_Mold', 'Peach___Bacterial_spot', 'Tomato___Early_blight', 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)', 'Peach___healthy', 'Blueberry___healthy', 'Potato___Late_blight', 'Pepper,_bell___health

In [5]:
# Categories dropped from the analysis; every name must currently be in `categories`
# (list.remove raises ValueError for a name that is not present).
excluded_categories = [
    'Cherry_(including_sour)___Powdery_mildew',
    'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot',
    'Squash___Powdery_mildew',
    'Peach___Bacterial_spot',
    'Potato___healthy',
    'Pepper,_bell___Bacterial_spot',
    'Corn_(maize)___healthy',
    'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)',
    'Peach___healthy',
    'Blueberry___healthy',
    'Pepper,_bell___healthy',
    'Grape___Esca_(Black_Measles)',
    'Cherry_(including_sour)___healthy',
    'Orange___Haunglongbing_(Citrus_greening)',
    'Grape___healthy',
    'Potato___Late_blight',
    'Grape___Black_rot',
    'Soybean___healthy',
    'Potato___Early_blight',
    'Corn_(maize)___Common_rust_',
    'Corn_(maize)___Northern_Leaf_Blight',
    'Raspberry___healthy',
    'Tomato___Leaf_Mold',
    'Tomato___Target_Spot',
    'Tomato___Early_blight',
    'Tomato___Tomato_Yellow_Leaf_Curl_Virus',
    'Tomato___Septoria_leaf_spot',
    'Tomato___Spider_mites Two-spotted_spider_mite',
    'Tomato___Tomato_mosaic_virus',
]

# Remove in place so the relative order of the remaining categories is kept.
for excluded in excluded_categories:
    categories.remove(excluded)

print('Dataset categories: \n{}\n\nNumber of categories: {}'.format(categories, len(categories)))

Dataset categories: 
['Strawberry___Leaf_scorch', 'Tomato___Bacterial_spot', 'Apple___Apple_scab', 'Tomato___Late_blight', 'Apple___Black_rot', 'Apple___Cedar_apple_rust', 'Apple___healthy', 'Tomato___healthy', 'Strawberry___healthy']

Number of categories: 9


In [6]:
# Define distribution for various training subsets.
# One row per training subset; within a row, one value per category, in the same
# order as `categories`. Each value is the fraction of the category's training
# images that goes to that subset.
# NOTE(review): the pairing is purely positional, so it depends on the order of
# `categories` -- confirm it against the printed category list before running.
values = [[0, 0.5, 0.1, 0.5, 0.1, 0.6, 0.2, 0.2, 0],
          [0, 0.2,  0.7, 0.3, 0.3, 0, 0.35, 0.25, 0],
          [0.2, 0.2, 0.2, 0, 0.6, 0.4, 0.3, 0.3, 0.3],
          [0.8, 0.1, 0, 0.2, 0, 0, 0.15, 0.25, 0.7]]

# zip() silently truncates to the shorter input, which would leave categories
# without a fraction; fail early with an explicit message instead.
for row_idx, farm_distrib in enumerate(values):
    if len(farm_distrib) != len(categories):
        raise ValueError(
            'Distribution row {} has {} values but there are {} categories'.format(
                row_idx, len(farm_distrib), len(categories)))

# One {category: fraction} mapping per training subset.
distribs = [dict(zip(categories, farm_distrib)) for farm_distrib in values]
print(distribs[0])
print(distribs[1])

{'Strawberry___Leaf_scorch': 0, 'Tomato___Bacterial_spot': 0.5, 'Apple___Apple_scab': 0.1, 'Tomato___Late_blight': 0.5, 'Apple___Black_rot': 0.1, 'Apple___Cedar_apple_rust': 0.6, 'Apple___healthy': 0.2, 'Tomato___healthy': 0.2, 'Strawberry___healthy': 0}
{'Strawberry___Leaf_scorch': 0, 'Tomato___Bacterial_spot': 0.2, 'Apple___Apple_scab': 0.7, 'Tomato___Late_blight': 0.3, 'Apple___Black_rot': 0.3, 'Apple___Cedar_apple_rust': 0, 'Apple___healthy': 0.35, 'Tomato___healthy': 0.25, 'Strawberry___healthy': 0}


In [7]:
# Start from a clean slate: drop any previously generated split, then recreate
# the empty root directory of the processed dataset.
processed_root = BASE_DIR+'processed/'
if os.path.exists(processed_root):
    shutil.rmtree(processed_root)
path = Path(processed_root)
path.mkdir(parents=True, exist_ok=True)

In [8]:
# Create the split directories: one training directory per distribution
# (train0/, train1/, ...), followed by the validation and test directories.
split_dirs = [TRAIN_DIR+str(i)+'/' for i in range(len(distribs))]
split_dirs += [VAL_DIR, TEST_DIR]
for split_dir in split_dirs:
    Path(split_dir).mkdir(parents=True, exist_ok=True)

label_list = []

def copy_image(directory, category, image, source):
    ''' Copy one image file into the category folder of a split directory.

    directory -- root folder of the split (e.g. the test directory)
    category  -- name of the category sub-folder inside `directory`
    image     -- file name of the image
    source    -- folder that currently contains `image`

    The destination folder is created when it does not exist yet.
    A file already present at the destination is left untouched.
    '''
    target_dir = os.path.join(directory, category)
    # copyfile does not create missing folders, so make sure the destination exists.
    os.makedirs(target_dir, exist_ok=True)
    target = os.path.join(target_dir, image)
    if not os.path.exists(target):
        shutil.copyfile(os.path.join(source, image), target)

def copy_subset(list_img, bound_inf, bound_sup, category, directory, source_directory):
    ''' Copy the JPEG files of one slice of a category into a split directory.

    list_img         -- file names of the category
    bound_inf        -- start index of the slice (inclusive)
    bound_sup        -- end index of the slice (exclusive)
    category         -- name of the category sub-folder to fill
    directory        -- root folder of the split
    source_directory -- folder that contains the files of `list_img`

    The category folder is always created, even when the slice is empty.
    Entries whose name does not end in ".jpg" or ".JPG" are skipped.
    '''
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(os.path.join(directory, category), exist_ok=True)
    for image in list_img[bound_inf:bound_sup]:
        if image.endswith((".jpg", ".JPG")):
            copy_image(directory, category, image, source_directory)
    
def split_bounds(n_images, fractions, max_train=1200):
    ''' Compute the index bounds of the test, validation and train slices of one category.

    n_images  -- number of images available for the category
    fractions -- for each training subset, the fraction of the train images it receives
    max_train -- upper limit on the number of train images shared between the subsets

    Returns (test_end, val_start, train_ranges):
      * the test slice is [0, test_end), one tenth of the images;
      * the validation slice is [val_start, n_images), one sixth of the images;
      * train_ranges holds one (start, stop) pair per entry of `fractions`,
        laid out back to back starting at test_end.
    '''
    test_end = n_images // 10
    val_start = n_images - n_images // 6
    train_budget = min(val_start - test_end, max_train)

    train_ranges = []
    start = test_end
    for fraction in fractions:
        # Rounding each share separately can add up to more than the budget;
        # clamp so a train slice never reaches into the validation slice.
        stop = min(start + round(train_budget * fraction), val_start)
        train_ranges.append((start, stop))
        start = stop
    return test_end, val_start, train_ranges


try:
    print("[INFO] Loading images ...")

    for plant_disease_folder in categories:
        source_directory = DATA_DIR+'/'+plant_disease_folder
        # listdir returns entries in arbitrary order; sorting makes the split
        # reproducible between runs. ".DS_Store" is dropped while building the list.
        plant_disease_folder_list = sorted(
            image for image in listdir(source_directory) if image != ".DS_Store")
        print("[INFO] Processing {} with {} images".format(plant_disease_folder, len(plant_disease_folder_list)))

        fractions = [farms[plant_disease_folder] for farms in distribs]
        test_end, val_start, train_ranges = split_bounds(len(plant_disease_folder_list), fractions)

        # Copy test image samples
        copy_subset(plant_disease_folder_list, 0, test_end, plant_disease_folder, TEST_DIR, source_directory)

        # Copy validation image samples. Explicit positive bounds keep the last
        # image in the slice and give an empty slice when there is nothing to copy.
        copy_subset(plant_disease_folder_list, val_start, len(plant_disease_folder_list), plant_disease_folder, VAL_DIR, source_directory)

        # Copy train image samples, one slice per training subset
        for i, (start_idx, stop_idx) in enumerate(train_ranges):
            copy_subset(plant_disease_folder_list, start_idx, stop_idx, plant_disease_folder, TRAIN_DIR+str(i)+'/', source_directory)

    print("[INFO] Image loading completed")
except Exception as e:
    # Top-level boundary of the cell: report the failure instead of a traceback.
    print("Error : {}".format(e))

[INFO] Loading images ...
[INFO] Processing Strawberry___Leaf_scorch with 1109 images
[INFO] Processing Tomato___Bacterial_spot with 2127 images
[INFO] Processing Apple___Apple_scab with 630 images
[INFO] Processing Tomato___Late_blight with 1909 images
[INFO] Processing Apple___Black_rot with 621 images
[INFO] Processing Apple___Cedar_apple_rust with 275 images
[INFO] Processing Apple___healthy with 1645 images
[INFO] Processing Tomato___healthy with 1591 images
[INFO] Processing Strawberry___healthy with 456 images
[INFO] Image loading completed
