# Feature Extraction Script

### This script is used to extract mango leaf features from the mango leaves dataset and generate a CSV file for training the classification model.

In [34]:
import numpy as np
import pandas as pd
import cv2
import os
import utils
from collections import namedtuple

### Feature Extraction Function

In [35]:
# Record type for the features of one leaf. It is created once at module level
# so every call to extract_features returns instances of the same class.
Feature = namedtuple(
    'Feature',
    ['centroid', 'aspectratio', 'area', 'perimeter', 'formfactor', 'meancolor'])


def extract_features(image_path):
    """
    Extract shape and color features of the leaf in the image at a given path.

    The leaf is taken to be the external contour with the largest area in the
    Otsu-thresholded (inverted binary) grayscale image, i.e. pixels at or
    below the Otsu threshold are treated as foreground.
    NOTE(review): this presumes the leaf is darker than its background -
    confirm against the dataset.

    The features are returned as a `Feature` namedtuple with attributes :-
    1. centroid - (x, y) centroid of the leaf contour in pixel coordinates,
       (0, 0) if the contour has zero area
    2. aspectratio - width / height of the leaf's upright bounding rectangle
    3. area - area enclosed by the leaf contour
    4. perimeter - length of the closed leaf contour
    5. formfactor - 4 * pi * area / perimeter ** 2, 0.0 if the perimeter is 0
    6. meancolor - (R, G, B) channel means, truncated to int, taken over the
       whole bounding rectangle (so background pixels inside it are included)

    arguments:
     image_path - string containing path to leaf image file.
    returns:
     Feature namedtuple containing extracted features of the leaf.
    raises:
     OSError - if the image file cannot be read or decoded.
     ValueError - if no contour is found in the thresholded image.
    """
    # cv2.imread does not raise on failure, it returns None instead
    bgr_image = cv2.imread(image_path)
    if bgr_image is None:
        raise OSError("Could not read image file: {}".format(image_path))

    # Obtain RGB and Grayscale images
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    gray_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)

    # Apply inverted binary Otsu thresholding to the grayscale image
    _, thresh_image = cv2.threshold(
        gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find the external contours. OpenCV 3.x returns (image, contours,
    # hierarchy) while 2.x / 4.x return (contours, hierarchy); the contours
    # are the second-to-last element in every version.
    image_contours = cv2.findContours(
        thresh_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(image_contours) == 0:
        raise ValueError(
            "No contour found in image file: {}".format(image_path))

    # Contour having maximum area is our leaf contour
    leaf_contour = max(image_contours, key=cv2.contourArea)

    # FEATURE - Centroid (from the spatial moments of the leaf contour)
    M = cv2.moments(leaf_contour)
    if M["m00"] != 0:
        cX = int(M["m10"] / M["m00"])
        cY = int(M["m01"] / M["m00"])
    else:
        # Degenerate (zero-area) contour: the centroid is undefined
        cX, cY = 0, 0
    centroid = (cX, cY)

    # FEATURE - Aspect Ratio
    x, y, w, h = cv2.boundingRect(leaf_contour)
    aspectratio = float(w) / h

    # FEATURE - Area
    area = cv2.contourArea(leaf_contour)

    # FEATURE - Perimeter
    perimeter = cv2.arcLength(leaf_contour, True)

    # FEATURE - Form Factor
    if perimeter != 0:
        formfactor = (4 * np.pi * area) / perimeter ** 2
    else:
        # Single-point contour: avoid dividing by zero
        formfactor = 0.0

    # FEATURE - Mean Color
    # Cropping rgb image using bounding rectangle of leaf contour
    cropped_image = rgb_image[y:y + h, x:x + w]
    r_mean = int(np.mean(cropped_image[:, :, 0]))
    g_mean = int(np.mean(cropped_image[:, :, 1]))
    b_mean = int(np.mean(cropped_image[:, :, 2]))
    meancolor = (r_mean, g_mean, b_mean)

    return Feature(
        centroid=centroid,
        aspectratio=aspectratio,
        area=area,
        perimeter=perimeter,
        formfactor=formfactor,
        meancolor=meancolor)

### Prepare dataset directories and image files paths

In [36]:
# Name of the folder that holds the leaf images
dataset = 'MangoLeavesDatabase'
# The dataset folder is looked up inside the current working directory
working_dir = os.getcwd()
# Map every mango variety to its image directory.
# For now only the leaf front images are used.
paths = {
    variety: os.path.join(working_dir, dataset, variety + '/front')
    for variety in ('alphonso', 'amrapali', 'chausa', 'dusheri', 'langra')
}
# Map every variety name to the list of its image file paths.
# NOTE(review): utils.get_file_paths presumably returns the paths of the files
# in the directory that have one of the given extensions - confirm in utils.
image_dict = {}
for label, path in paths.items():
    image_dict[label] = utils.get_file_paths(path, ['.jpg'])
# Location where the generated CSV file is written
csv_file_output_path = os.path.join(working_dir, dataset, 'labeled_dataset.csv')

### Generate a pandas DataFrame containing the extracted features for each image file. If M features are extracted from N images, the DataFrame will have N rows and M + 1 columns (the M features plus the label).

In [37]:
# Columns of the data
cols = ['centerX', 'centerY', 'aspectratio', 'area', 'perimeter', 'formfactor', 'meanR', 'meanG', 'meanB', 'label']
# Collect one row per image first and build the DataFrame in a single step.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; building from a list also lets pandas infer numeric dtypes.
rows = []
# Extract image features of each image of each variety
for label, path_list in image_dict.items():
    for image_path in path_list:
        features = extract_features(image_path)
        rows.append({
            'centerX': features.centroid[0],
            'centerY': features.centroid[1],
            'aspectratio': features.aspectratio,
            'area': features.area,
            'perimeter': features.perimeter,
            'formfactor': features.formfactor,
            'meanR': features.meancolor[0],
            'meanG': features.meancolor[1],
            'meanB': features.meancolor[2],
            'label': label,
        })
# Passing columns fixes the column order and keeps the header even when no
# image was found
data = pd.DataFrame(rows, columns=cols)

### Export Data to CSV File

In [38]:
# Write the feature table to the CSV file. With pandas' default settings the
# DataFrame index is written as the first, unnamed column.
data.to_csv(path_or_buf=csv_file_output_path)