# Data Cleaning
Jupyter Notebook used to clean and prepare the image dataset for ML model development.

### **Setup:**

To run the notebook, you'll first need to download the Kaggle dataset available at:

[🌱 House Plant Species 🌱](https://www.kaggle.com/datasets/kacpergregorowicz/house-plant-species)

Extract the dataset so that the species subfolders end up in `../Data/house_plant_species` (relative to the notebook's working directory) before running the code.

---

### Constants

In [10]:
# Root folder of the dataset, given as a path relative to the current working
# directory. The cells below list its subfolders and clean each one in turn.
# NOTE: the string literal after the assignment is a plain expression statement
# (not a comment); as the last expression of the cell it is what the notebook
# displays as the cell's output.
base_dir = '../Data/house_plant_species'
""" Folder path to the base directory containing species subfolders with images. """

' Folder path to the base directory containing species subfolders with images. '

### Helper Functions

In [2]:
import hashlib

def calculate_md5(file_path):
    """
    Return the hexadecimal MD5 digest of a file's contents.

    The file is opened in binary mode and consumed in 4096-byte pieces, so
    the whole file is never held in memory at once.

    Parameters:
        file_path (str): Path of the file to hash.

    Returns:
        str: The MD5 digest of the file as a hexadecimal string.

    Example Usage:
        md5_hash = calculate_md5('example.txt')
        print(f"MD5 Hash: {md5_hash}")
    """
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        # Two-argument iter() keeps calling read() until it returns b'' (EOF).
        for block in iter(lambda: stream.read(4096), b''):
            digest.update(block)
    return digest.hexdigest()

In [3]:
import os

def find_duplicates(folder_path):
    """
    Find files with identical content in a folder.

    Files are compared by the MD5 hash of their content. The first file seen
    with a given hash is treated as the original; every later file with the
    same hash is reported as a duplicate of it. Entries are visited in sorted
    name order so that the choice of "original" is reproducible
    (os.listdir() itself returns entries in arbitrary order).

    Parameters:
        folder_path (str): The path to the folder in which to find the duplicate files.

    Returns:
        list: A list of `(duplicate_path, original_path)` tuples. The first
        element is the later file, the second is the first file seen with
        the same content. Empty if there are no duplicates. Subfolders are
        not searched.

    Example Usage:
        folder_path = '/path/to/folder'
        duplicates = find_duplicates(folder_path)
        for duplicate_path, original_path in duplicates:
            print(f"Duplicate: {duplicate_path} and {original_path}")
    """
    first_seen = {}  # content hash -> path of the first file with that hash
    duplicates = []

    for image_name in sorted(os.listdir(folder_path)):
        image_path = os.path.join(folder_path, image_name)
        if not os.path.isfile(image_path):
            continue  # only regular files are hashed; directories are skipped

        img_hash = calculate_md5(image_path)
        if img_hash in first_seen:
            duplicates.append((image_path, first_seen[img_hash]))
        else:
            first_seen[img_hash] = image_path

    return duplicates

In [9]:
import cv2

def convert_to_jpg(folder_path):
    """
    Convert images in a folder to JPEG format.

    Parameters:
        folder_path (str): The path to the folder containing images to convert.

    How It Works:
        - Reads each regular file in the folder with cv2.imread.
        - Files whose name already ends with `.jpg` (case-insensitive) are
          left untouched.
        - Drops the alpha channel if the decoded image has 4 channels.
        - Saves every other readable image as `<name>.jpg` in the same folder.
          If that name is already taken, a numeric suffix is appended
          (`<name>_1.jpg`, `<name>_2.jpg`, ...) so an existing image is never
          overwritten.
        - Deletes the original file after successful conversion.

    Notes:
        - Subfolders are ignored.
        - If an image cannot be read, a warning is displayed, and the file is skipped.
        - Only the `.jpg` suffix counts as "already JPEG"; a `.jpeg` file is
          therefore re-encoded and saved under a `.jpg` name.

    Returns:
        None. Progress and failures are reported with print().

    Example Usage:
        folder_path = '/path/to/folder'
        convert_to_jpg(folder_path)
    """
    for image_name in os.listdir(folder_path):
        image_path = os.path.join(folder_path, image_name)
        if not os.path.isfile(image_path):
            continue

        # Every file is decoded, including .jpg ones, so unreadable files are
        # always reported.
        image = cv2.imread(image_path)
        if image is None:
            print(f'Warning: Failed to read image: {image_path}')
            continue

        if image_name.lower().endswith('.jpg'):
            continue

        # If the image has 4 channels, drop the alpha channel. cv2.imread with
        # default flags is documented to return a 3-channel image, so this is
        # only a safeguard; the ndim check avoids an IndexError on 2-D arrays.
        if image.ndim == 3 and image.shape[2] == 4:
            image = image[:, :, :3]

        # Pick a target name that does not clobber an existing file, e.g. when
        # both "123.png" and "123.jpg" are present in the folder.
        stem = os.path.splitext(image_name)[0]
        new_image_path = os.path.join(folder_path, f"{stem}.jpg")
        suffix = 1
        while os.path.exists(new_image_path):
            new_image_path = os.path.join(folder_path, f"{stem}_{suffix}.jpg")
            suffix += 1

        if cv2.imwrite(new_image_path, image):
            print(f'Converted to JPEG: {new_image_path}')
            os.remove(image_path)  # Remove the original file
        else:
            print(f'Failed to save JPEG for {image_path}, skipping.')

### Clean the dataset

In [13]:
# Collect the names of the directories directly under base_dir
# (one subfolder per class); plain files are ignored.
dir_content = os.listdir(base_dir)
subfolders = [
    entry
    for entry in dir_content
    if os.path.isdir(os.path.join(base_dir, entry))
]

# Last expression of the cell: shows the class count and the class names.
len(subfolders), subfolders

(47,
 ['African Violet (Saintpaulia ionantha)',
  'Aloe Vera',
  'Anthurium (Anthurium andraeanum)',
  'Areca Palm (Dypsis lutescens)',
  'Asparagus Fern (Asparagus setaceus)',
  'Begonia (Begonia spp.)',
  'Bird of Paradise (Strelitzia reginae)',
  'Birds Nest Fern (Asplenium nidus)',
  'Boston Fern (Nephrolepis exaltata)',
  'Calathea',
  'Cast Iron Plant (Aspidistra elatior)',
  'Chinese evergreen (Aglaonema)',
  'Chinese Money Plant (Pilea peperomioides)',
  'Christmas Cactus (Schlumbergera bridgesii)',
  'Chrysanthemum',
  'Ctenanthe',
  'Daffodils (Narcissus spp.)',
  'Dracaena',
  'Dumb Cane (Dieffenbachia spp.)',
  'Elephant Ear (Alocasia spp.)',
  'English Ivy (Hedera helix)',
  'Hyacinth (Hyacinthus orientalis)',
  'Iron Cross begonia (Begonia masoniana)',
  'Jade plant (Crassula ovata)',
  'Kalanchoe',
  'Lilium (Hemerocallis)',
  'Lily of the valley (Convallaria majalis)',
  'Money Tree (Pachira aquatica)',
  'Monstera Deliciosa (Monstera deliciosa)',
  'Orchid',
  'Parlor 

In [16]:
# Clean images by folder
for folder in subfolders:
    folder_path = os.path.join(base_dir, folder)

    print(f'Processing folder: {folder_path}')

    # Remove exact duplicates. find_duplicates() returns a list of
    # (duplicate_path, first_seen_path) tuples, so only the first element of
    # each tuple is deleted and the first-seen copy is kept. Passing the whole
    # tuple to os.remove() raises TypeError.
    duplicates = find_duplicates(folder_path)
    for duplicate_path, kept_path in duplicates:
        os.remove(duplicate_path)
        print(f'Duplicate found and removed: {duplicate_path} (kept: {kept_path})')

    # Convert the remaining images to jpg
    convert_to_jpg(folder_path)

Processing folder: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\123.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\149.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\166.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\181.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\186.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\193.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\232.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\24.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia ionantha)\246.jpg
Converted to JPEG: ../Data/house_plant_species\African Violet (Saintpaulia iona