# Rice Leaf Disease Image Classification

## 1. Data Preparation

In [1]:
cd D:\CVOps\angkorice-vision

D:\CVOps\angkorice-vision


In [None]:
from pathlib import Path
from PIL import Image, ImageOps
import os
import cv2
# Lower-case file suffixes that save_image treats as images.
image_extension = [".jpg", ".jpeg", ".png"]


def save_image(source_dir, dest_dir):
    """Re-save every image in ``source_dir`` into ``dest_dir`` with its EXIF
    orientation applied to the pixel data.

    Only regular files directly inside ``source_dir`` whose suffix
    (case-insensitive) is listed in ``image_extension`` are processed;
    sub-directories are not descended into. Output files keep their original
    file name, so an existing file of the same name in ``dest_dir`` is
    overwritten.

    Args:
        source_dir: Folder containing the original images (str or Path).
        dest_dir: Folder to write the re-saved images to; created if missing.

    Returns:
        None. One "Saved to" line is printed per written image.
    """
    source_dir = Path(source_dir)
    os.makedirs(dest_dir, exist_ok=True)

    # Sorted so the processing (and log) order is stable between runs.
    for image_path in sorted(source_dir.iterdir()):
        # Skip folders and anything that does not carry an image suffix.
        if not image_path.is_file():
            continue
        if image_path.suffix.lower() not in image_extension:
            continue

        new_image_path = os.path.join(dest_dir, image_path.name)

        # The context manager closes the source file handle even if decoding
        # or saving fails; without it, handles pile up on large folders.
        with Image.open(image_path) as image:
            # Applies the EXIF orientation tag so the saved pixels are upright
            # (fixes the mirroring/rotation issue). The result is a new image,
            # so no extra copy is needed before saving.
            upright = ImageOps.exif_transpose(image)
            upright.save(new_image_path)

        print(f"Saved to : {new_image_path}")


In [None]:
# Re-save everything in ./original_image into ./new_image/images
# (both paths are relative to the current working directory).
save_image('original_image', "new_image/images")

https://www.google.com/search?q=rice+disease&rlz=1C9BKJA_enKH1182KH1184&hl=en-US&sourceid=chrome-mobile&ie=UTF-8&udm=50&fbs=ADc_l-aN0CWEZBOHjofHoaMMDiKpaEWjvZ2Py1XXV8d8KvlI3jljrY5CkLlk8Dq3IvwBz-SJyfRX_inP-J3Cs9lQZu9Jfs7bKPahJnRsHKc4onwyDq9WMnowl0OajFazw3AO9oMExtRIasTocpRk1Nm0YznD9lpG9_yLbeoG_GkgV4UDOiWHcm4BAGZgVpfjwIOjCiGwWoQfcCPqOtHkGvPHHgyph2z3ng&aep=10&ntc=1&mstk=AUtExfBQ1dh3t4oIEaBrZDcAcoPR8RAi5DvHI9KSIv0GscYASXIXQ3YpfhRzHvXxS3ltzCy3DCT4eMK7JItBZyzCRJqehN6eswNrSpt_ijacWRGUzczFrJvRVNRbSFgdq3b9Vb9eGqkzb37U7kg-IFyyvFko_p-EXMKWwQyqgqNR8mVV8cpojY07d11E7rX1Srrky1_Yy6arp8Nqrgo5G-J7Z8xxFqJKqjiK7Vua5NWVzWKMrzRIIIAj0I5ShY8RPMkJmOYd5K4uVxX9jXq8X1gybLlZs8iUKboi2naQELakls4aiwPxu4nbX4FCiaXNHtSUM1u7Y7NUVu-abw&csuir=1&mtid=HptkaafjBoL2seMPq9Ou-Aw#lfId=ChxjMe

In [7]:
import os
import shutil

def merge_datasets(source_dirs, dest_dirs, splits=("train", "valid", "test")):
    """Merge several pre-split image datasets into one destination tree.

    Each source is expected to be laid out as ``<src>/<split>/<class>/<file>``.
    Files are copied to ``<dest_dirs>/<split>/<class>/<file>``.

    Args:
        source_dirs: Iterable of dataset root folders to merge.
        dest_dirs: Root folder of the merged dataset (created on demand).
        splits: Names of the split sub-folders to look for in every source.

    Returns:
        None. Progress is reported with ``print``.

    Note:
        Files are copied under their original name. If two sources contain the
        same ``<split>/<class>/<file>``, the copy from the later source
        overwrites the earlier one.
    """
    for split in splits:
        for src in source_dirs:
            split_path = os.path.join(src, split)

            if not os.path.isdir(split_path):
                print(f"Skipping missing folder : {split_path}")
                continue

            for class_name in sorted(os.listdir(split_path)):
                class_src = os.path.join(split_path, class_name)

                # Entries come from listdir, so they always exist; what matters
                # is that only directories are class folders. Stray files
                # (README, .DS_Store, ...) would otherwise crash os.listdir.
                if not os.path.isdir(class_src):
                    continue

                # Destination class folder
                class_dest = os.path.join(dest_dirs, split, class_name)
                os.makedirs(class_dest, exist_ok=True)

                # Copy every regular file of this class
                for file in sorted(os.listdir(class_src)):
                    src_file = os.path.join(class_src, file)

                    # copy2 cannot copy directories; skip nested folders.
                    if not os.path.isfile(src_file):
                        continue

                    shutil.copy2(src_file, os.path.join(class_dest, file))
    print(f"Datasets merged successfully : {dest_dirs}")


In [None]:
# raw -> staging -> processed/train|valid|test
def merge_raw_datasets(source_dir, staging_dir):
    """Merge un-split, per-class image folders into one staging folder.

    Each source is expected to be laid out as ``<src>/<class>/<file>``; files
    are copied to ``<staging_dir>/<class>/<file>``.

    Args:
        source_dir: Iterable of source dataset folders. A single path (str or
            os.PathLike) is also accepted and treated as a one-element list.
        staging_dir: Destination root; created if it does not exist.

    Returns:
        None. Progress is reported with ``print``.

    Note:
        When the destination file name is already taken, the file is stored as
        ``<stem>_<source folder name><ext>`` instead. Re-running the merge into
        a populated staging folder therefore adds suffixed duplicates of the
        files that are already there.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(source_dir, (str, os.PathLike)):
        source_dir = [source_dir]

    os.makedirs(staging_dir, exist_ok=True)

    for src in source_dir:
        print(f"Merging images from source : {src}")
        # normpath drops a trailing separator so the tag is never empty.
        src_tag = os.path.basename(os.path.normpath(src))

        for class_name in sorted(os.listdir(src)):
            class_src = os.path.join(src, class_name)

            # Only directories are class folders; skip stray files.
            if not os.path.isdir(class_src):
                continue

            class_dest = os.path.join(staging_dir, class_name)
            os.makedirs(class_dest, exist_ok=True)

            for file in sorted(os.listdir(class_src)):
                src_file = os.path.join(class_src, file)

                # copy2 cannot copy directories; skip nested folders.
                if not os.path.isfile(src_file):
                    continue

                dest_file = os.path.join(class_dest, file)

                # avoid overwrite: splitext (not split) separates the stem
                # from the extension, giving "leaf_<source>.jpg".
                if os.path.exists(dest_file):
                    base, ext = os.path.splitext(file)
                    dest_file = os.path.join(class_dest, f"{base}_{src_tag}{ext}")

                shutil.copy2(src_file, dest_file)

    print(f"All raw datasets merged to : {staging_dir}")


In [None]:
# Source dataset folders to merge (paths are relative to the working directory).
RAW_SOURCE = [
    './dataset/sources_datasets/Mendeley Rice Leaf Disease Samples',
    './dataset/sources_datasets/Rice Disease Dataset (Kaggle)'
]
# Folder that receives the merged per-class images.
STAGING_DIR = './dataset/staging'

# Copy every class folder of every source into STAGING_DIR.
merge_raw_datasets(RAW_SOURCE, STAGING_DIR)

In [None]:
def count_images(root):
    """Print and return the number of entries per class for each split.

    Looks for ``<root>/train``, ``<root>/valid`` and ``<root>/test`` and counts
    the directory entries inside every class sub-folder. All entries are
    counted; no filtering by file extension is applied.

    Args:
        root: Root folder of a split dataset.

    Returns:
        dict: ``{split: {class_name: count}}``. A split whose folder is missing
        maps to an empty dict.
    """
    counts = {}
    for split in ["train", "valid", "test"]:
        print(f"\n{split.upper()}")
        split_path = os.path.join(root, split)
        counts[split] = {}

        # A missing split should be reported, not abort the whole summary.
        if not os.path.isdir(split_path):
            print(f"  Skipping missing folder : {split_path}")
            continue

        for cls in sorted(os.listdir(split_path)):
            cls_path = os.path.join(split_path, cls)

            # Stray files next to the class folders are not classes.
            if not os.path.isdir(cls_path):
                continue

            count = len(os.listdir(cls_path))
            counts[split][cls] = count
            print(f"  {cls}: {count} images")

    return counts



In [10]:
# Split staging --> train / valid / test

import random
from collections import defaultdict
import shutil

# Lower-case file suffixes accepted as images when splitting.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp", ".bmp")


def split_dataset(staging_dir, output_dir, train_ratio=0.7, valid_ratio=0.2, seed=None):
    """Split per-class image folders into train/valid/test folders.

    Reads ``<staging_dir>/<class>/<file>`` and copies each image to
    ``<output_dir>/<split>/<class>/<file>``. Within each class the images are
    shuffled, then the first ``train_ratio`` share goes to ``train``, the next
    ``valid_ratio`` share to ``valid`` and the remainder to ``test``.

    Args:
        staging_dir: Folder holding one sub-folder per class.
        output_dir: Root of the split dataset; created if missing.
        train_ratio: Fraction of each class assigned to ``train``.
        valid_ratio: Fraction of each class assigned to ``valid``.
        seed: Optional seed for a reproducible shuffle. With ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        defaultdict: ``{class_name: {"total", "train", "valid", "test"}}`` with
        the number of images found and copied per split.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or valid_ratio < 0 or train_ratio + valid_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and valid_ratio must be non-negative and sum to at most 1"
        )

    os.makedirs(output_dir, exist_ok=True)

    # A private generator keeps a seeded run from disturbing global state.
    rng = random if seed is None else random.Random(seed)
    stats = defaultdict(dict)

    for cls in sorted(os.listdir(staging_dir)):
        cls_path = os.path.join(staging_dir, cls)

        if not os.path.isdir(cls_path):
            continue

        # Only regular files with a valid image extension are split. Sorting
        # first makes the shuffle depend on the seed alone, not on the
        # arbitrary order os.listdir happens to return.
        images = sorted(
            f for f in os.listdir(cls_path)
            if f.lower().endswith(VALID_EXTENSIONS)
            and os.path.isfile(os.path.join(cls_path, f))
        )
        total = len(images)

        if total == 0:
            print(f"No valid images found in : {cls}")
            # Record the class but do not create empty split folders for it.
            stats[cls] = {"total": 0, "train": 0, "valid": 0, "test": 0}
            continue

        rng.shuffle(images)

        # The small epsilon guards against float truncation: 10 * (0.7 + 0.2)
        # evaluates to 8.999999999999998, which int() would cut down to 8.
        train_end = int(total * train_ratio + 1e-9)
        valid_end = int(total * (train_ratio + valid_ratio) + 1e-9)

        splits = {
            "train": images[:train_end],
            "valid": images[train_end:valid_end],
            "test": images[valid_end:]
        }
        copied = {}

        for split, files in splits.items():
            split_dir = os.path.join(output_dir, split, cls)
            os.makedirs(split_dir, exist_ok=True)

            # Files left over from an earlier run with a different shuffle can
            # put the same image into two splits; warn instead of deleting.
            stale = set(os.listdir(split_dir)) - set(files)
            if stale:
                print(
                    f"Warning: {len(stale)} file(s) already in {split_dir} "
                    "are not part of this split"
                )

            copied[split] = 0
            for f in files:
                src = os.path.join(cls_path, f)
                dest = os.path.join(split_dir, f)

                if not os.path.exists(src):
                    print(f"Skipped missing file : {src}")
                    continue
                shutil.copy2(src, dest)
                copied[split] += 1

        stats[cls] = {
            "total": total,
            "train": copied["train"],
            "valid": copied["valid"],
            "test": copied["test"]
        }
        print(f"{cls} : {total} images")

    return stats


In [11]:
import json
from datetime import datetime

def save_metadata(output_dir, version, class_stats):
    """Write ``metadata.json`` describing the dataset into ``output_dir``.

    The JSON document contains the fixed dataset name ``rice_leaf_vision``, the
    given ``version``, a UTC creation timestamp in ISO 8601 format (with an
    explicit ``+00:00`` offset), the split names and ``class_stats``.

    Args:
        output_dir: Folder to write ``metadata.json`` to; created if missing.
        version: Version label stored under ``"version"``.
        class_stats: JSON-serialisable per-class statistics stored under
            ``"classes"``.

    Returns:
        None. The path of the written file is printed.
    """
    # Local import: only `datetime` itself is imported at cell level.
    from datetime import timezone

    metadata = {
        "dataset_name": "rice_leaf_vision",
        "version": version,
        # datetime.utcnow() is deprecated and returns a naive value; an aware
        # timestamp states unambiguously that the time is UTC.
        "created_at": datetime.now(timezone.utc).isoformat(),
        "splits": ["train", "valid", "test"],
        "classes": class_stats
    }

    os.makedirs(output_dir, exist_ok=True)
    path = os.path.join(output_dir, "metadata.json")
    # Explicit encoding so the file does not depend on the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4)

    print(f"📄 Metadata saved: {path}")


In [12]:
import os

# Lower-case file suffixes that are kept when cleaning the staging folder.
VALID_EXTENSIONS = (".jpg", ".jpeg", ".png", ".webp", ".bmp")


def clean_staging_folder(staging_dir):
    """Delete everything inside the class folders that is not a usable image.

    For every class sub-folder of ``staging_dir`` this removes, in place:
    nested sub-folders (including their contents), files whose name does not
    end with one of ``VALID_EXTENSIONS`` (case-insensitive), and zero-byte
    files. Entries directly inside ``staging_dir`` that are not directories are
    left untouched.

    Args:
        staging_dir: Folder holding one sub-folder per class.

    Returns:
        None. Every removal is reported with ``print``.

    Warning:
        This is destructive; removed files and folders are not recoverable.
    """
    # Local import: this cell only imports `os`.
    import shutil

    print("\n🧹 Cleaning staging folder...")

    for cls in sorted(os.listdir(staging_dir)):
        cls_path = os.path.join(staging_dir, cls)

        if not os.path.isdir(cls_path):
            continue

        for f in sorted(os.listdir(cls_path)):
            file_path = os.path.join(cls_path, f)

            # remove subfolders; os.rmdir only works on empty directories,
            # rmtree also removes folders that still contain files
            if os.path.isdir(file_path):
                print(f"Removing folder: {file_path}")
                shutil.rmtree(file_path)
                continue

            # remove non-image files
            if not f.lower().endswith(VALID_EXTENSIONS):
                print(f"Removing non-image: {file_path}")
                os.remove(file_path)
                continue

            # remove empty files
            if os.path.getsize(file_path) == 0:
                print(f"Removing empty file: {file_path}")
                os.remove(file_path)

    print("✅ Staging folder cleaned successfully\n")


In [13]:
# Destructive: deletes sub-folders, non-image files and empty files in place.
clean_staging_folder("./dataset/staging")



🧹 Cleaning staging folder...
✅ Staging folder cleaned successfully



In [15]:
# Version label; also the last component of the output folder.
VERSION = "v1.0"
OUTPUT_DIR = f"./dataset/processed/rice_leaf_vision/{VERSION}"
# Folder holding one sub-folder of images per class.
STAGING_DIR = './dataset/staging'

# Split every class into train/valid/test and keep the per-class counts.
# NOTE(review): the split is shuffled randomly, so re-running this cell into
# the same OUTPUT_DIR can produce a different partition on top of the old one.
class_stats = split_dataset(STAGING_DIR, OUTPUT_DIR)

# Store the counts next to the split data as metadata.json.
save_metadata(OUTPUT_DIR, VERSION, class_stats)



Bacterialblight : 2224 images
Blast : 2074 images
Brownspot : 2246 images
Healthyleaf : 653 images
Leafscald : 628 images
Sheathblight : 632 images
Tungro : 1308 images
📄 Metadata saved: ./dataset/processed/rice_leaf_vision/v1.0\metadata.json


In [None]:
# Scratch cell: shows that dict.items() yields (key, value) tuples.
# NOTE(review): not used by the data pipeline; candidate for removal.
a = {"x1": [1,2,3], "x2": [4,5,6]}
for b in a.items():
    print(b)

## 2. Training Pipeline

## 3. Inferencing With Trained Model

## 4. K-Fold Validation for DL Model

https://www.analyticsvidhya.com/blog/2021/09/how-to-apply-k-fold-averaging-on-deep-learning-classifier/