In [1]:
# Report Feb-18-2026

# The goal of this notebook is to prepare raw data for disease identification.
# Using the PlantVillage dataset, I implement a preprocessing pipeline designed
# to ensure model robustness and computational efficiency.

# Key Objectives:
# Data Partitioning: I split the original dataset into Training (70%), Validation (15%), and Testing (15%) sets.
# This ensures that the model is evaluated on entirely unseen data to accurately measure its generalization capabilities.

# Dynamic Resizing & Augmentation: I use torchvision.transforms. For training, I apply RandomResizedCrop
# and RandomHorizontalFlip to artificially increase dataset variety and reduce overfitting.

# Validation/Testing:
# I use Resize followed by CenterCrop (no random augmentation) to maintain consistency during evaluation.

# Normalization:
# Images are normalized using the ImageNet mean and standard deviation so that the input distribution
# matches what the pre-trained ResNet architecture expects.

In [2]:
# Mount Google Drive in this Colab runtime so that files under /content/gdrive/ are reachable.

from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
# Apply resizing using transforms

# Instead of permanently resizing the images, resizing and normalization are applied
# dynamically during data loading using torchvision transforms to ensure consistency
# and preserve the original dataset.

# The PlantVillage dataset provides images in color, grayscale, and segmented formats.
# I use the color images, as they preserve essential chromatic information
# required for accurate disease identification.


In [4]:
# Define transforms
from torchvision import transforms

In [5]:
# Data augmentation (random resized crops and horizontal flips) is applied only
# to the training split to improve generalization. Validation and test images
# are deterministically resized and center-cropped for consistent evaluation.

# Per-channel mean and standard deviation passed to Normalize (three values,
# one per color channel). These are the commonly used ImageNet statistics.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# CROP_SIZE: side length of the square crop produced for every split.
# EVAL_RESIZE: size given to Resize before the center crop at evaluation time.
CROP_SIZE = 224
EVAL_RESIZE = 256


def _to_normalized_tensor():
    """Return fresh ToTensor + Normalize steps shared by every split."""
    return [
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ]


def _eval_transform():
    """Build the deterministic Resize -> CenterCrop pipeline for 'val' and 'test'."""
    return transforms.Compose([
        transforms.Resize(EVAL_RESIZE),
        transforms.CenterCrop(CROP_SIZE),
        *_to_normalized_tensor(),
    ])


# Keys match the split folder names ('train', 'val', 'test').
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(CROP_SIZE),
        transforms.RandomHorizontalFlip(),
        *_to_normalized_tensor(),
    ]),
    # 'val' and 'test' get separate but identical pipelines.
    'val': _eval_transform(),
    'test': _eval_transform(),
}

In [6]:
import os
import shutil
import random

In [7]:
# Path to the original dataset (the "color" image variant of PlantVillage on the mounted Drive).
# The split step iterates over the sub-directories of this folder, one per class.
source_dir = "/content/gdrive/MyDrive/leaf_diagnosis_project/data/plantVillage/color"

# Root folder where the train/val/test split is created. Images are copied here,
# so the files in source_dir stay untouched.
target_dir = "/content/gdrive/MyDrive/leaf_diagnosis_project/data/plantVillage_split"

In [8]:
# Create the top-level train / val / test folders under target_dir.
splits = ['train', 'val', 'test']

split_roots = [os.path.join(target_dir, split) for split in splits]
for split_root in split_roots:
    # exist_ok=True: no error if the folder is already there, so the cell can be re-run.
    os.makedirs(split_root, exist_ok=True)

In [9]:
# Split the Dataset (Core Logic)
# Every class folder in source_dir is shuffled and its images are copied into
# target_dir/<split>/<class_name>/ according to the ratios below.
train_ratio = 0.7
val_ratio = 0.15
test_ratio = 0.15

# The test split is "whatever is left" after train and val, so the three
# ratios must cover the whole dataset for the stated percentages to hold.
if abs(train_ratio + val_ratio + test_ratio - 1.0) > 1e-9:
    raise ValueError("train_ratio, val_ratio and test_ratio must sum to 1.0")

random.seed(42)  # reproducibility

# Directory listings come back in arbitrary order. Both the class list and each
# image list are therefore sorted before shuffling; otherwise the seeded
# shuffle would start from a different ordering on each run and the split
# would not be reproducible.
# NOTE(review): sorting can assign images differently from an earlier, unsorted
# run. If target_dir already contains a previous split, clear it before
# re-running so that no image ends up in more than one split.
for class_name in sorted(os.listdir(source_dir)):
    class_path = os.path.join(source_dir, class_name)

    # Skip stray files at the top level; only directories are classes.
    if not os.path.isdir(class_path):
        continue

    # Keep regular files only: a nested directory would make shutil.copy fail.
    with os.scandir(class_path) as entries:
        images = sorted(entry.name for entry in entries if entry.is_file())
    random.shuffle(images)

    total = len(images)
    train_end = int(train_ratio * total)
    val_end = train_end + int(val_ratio * total)

    # int() truncates, so any rounding remainder falls into the test slice.
    splits_dict = {
        'train': images[:train_end],
        'val': images[train_end:val_end],
        'test': images[val_end:]
    }

    for split, split_images in splits_dict.items():
        split_class_dir = os.path.join(target_dir, split, class_name)
        os.makedirs(split_class_dir, exist_ok=True)

        for img in split_images:
            src = os.path.join(class_path, img)
            dst = os.path.join(split_class_dir, img)
            shutil.copy(src, dst)

In [3]:
# Check the generated data: list the split folders created under plantVillage_split.
!ls /content/gdrive/MyDrive/leaf_diagnosis_project/data/plantVillage_split

test  train  val
