## Dependencies

In [1]:
# Filepaths, numpy, and Tensorflow
import os
from PIL import Image
import numpy as np
import pandas as pd

In [2]:
# Sklearn scaling
from sklearn.preprocessing import MinMaxScaler

### Load the Plant Disease dataset from local desktop

In [3]:
base_dir = r'C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\archive\PlantVillage'

In [4]:
def load_images_from_subfolders(base_folder):
    images = []
    labels = []
    max_images_healthy = 1500
    max_images_others = 175

    for root, dirs, files in os.walk(base_folder):
        folder_name = os.path.basename(root)
        if folder_name.startswith("Tomato_"):
            image_count = 0 
            if folder_name == "Tomato_healthy":
                max_images = max_images_healthy
            else:
                max_images = max_images_others

            for file in files:
                if file.lower().endswith(('.png', '.jpg', '.jpeg')):
                    if image_count >= max_images:
                        break
                    img_path = os.path.join(root, file)
                    print(f"Loading {img_path}")
                    try:
                        img = Image.open(img_path).convert('RGB')
                        img = img.resize((28, 28))
                        images.append(np.array(img))
                        labels.append(folder_name) 
                        image_count += 1
                    except Exception as e:
                        print(f"Error loading image {img_path}: {e}")

    return np.array(images), np.array(labels)

all_images, labels = load_images_from_subfolders(base_dir)
print(f'Total number of images: {all_images.shape[0]}')

Loading C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\archive\PlantVillage\PlantVillage\Tomato_Bacterial_spot\00416648-be6e-4bd4-bc8d-82f43f8a7240___GCREC_Bact.Sp 3110.JPG
Loading C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\archive\PlantVillage\PlantVillage\Tomato_Bacterial_spot\0045ba29-ed1b-43b4-afde-719cc7adefdb___GCREC_Bact.Sp 6254.JPG
Loading C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\archive\PlantVillage\PlantVillage\Tomato_Bacterial_spot\00639d29-2d1a-4fcf-9bd3-a2b3109c74c4___UF.GRC_BS_Lab Leaf 1054.JPG
Loading C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\archive\PlantVillage\PlantVillage\Tomato_Bacterial_spot\00728f4d-83a0-49f1-87f8-374646fcda05___GCREC_Bact.Sp 6326.JPG
Loading C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\archive\PlantVillage\PlantVillage\Tomato_Bacterial_spot\00a7c269-3476-4d25-b744-44d6353cd921___GCREC_Bact.Sp 5807.JPG
Loading C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\archive\P

In [5]:
n_samples, height, width, n_channels = all_images.shape
flattened_images = all_images.reshape((n_samples, -1))

scaler = MinMaxScaler()
flattened_images_scaled = scaler.fit_transform(flattened_images)

df_scaled_images = pd.DataFrame(flattened_images_scaled)
df_scaled_images['label'] = labels

file_path = r'C:\Users\gfoley\OneDrive - Epiq Inc\Desktop\Plant Project\scaled_images.csv'
df_scaled_images.to_csv(file_path, index=False)

In [6]:
df_scaled_images.shape

(6150, 2353)

In [7]:
df_scaled_images['label'].unique().tolist()

['Tomato_Bacterial_spot',
 'Tomato_Early_blight',
 'Tomato_healthy',
 'Tomato_Late_blight',
 'Tomato_Leaf_Mold',
 'Tomato_Septoria_leaf_spot',
 'Tomato_Spider_mites_Two_spotted_spider_mite',
 'Tomato__Target_Spot',
 'Tomato__Tomato_mosaic_virus',
 'Tomato__Tomato_YellowLeaf__Curl_Virus']