# Feature Extraction Script

### This script is used to extract mango leaf features from the mango leaves dataset and generate a CSV file for training the classification model.

In [1]:
import numpy as np
import pandas as pd
import cv2
import os
import utils
from collections import namedtuple

### Feature Extraction Function

In [2]:
def extract_features(image_path):
    """
    This function extracts the following features from an image at a given path and returns those features as
    a namedtuple object.
    Features :-
    1. Aspect Ratio
    2. Leaf Area
    3. Leaf Margin Perimeter
    4. Form Factor
    5. Mean Color
    
    These features can be accessed in the returned namedtuple object by using following attributes on that object :-
    1. aspectratio - Bounding rectangle height divided by its width
    2. area - Leaf Area to bounding rectangle area ratio
    3. perimeter - Leaf Perimeter to bounding rectangle perimeter ratio
    4. formfactor - Form Factor, (4 * pi * area) / perimeter ** 2
    5. meancolor - Mean Color, an (R, G, B) tuple computed over the leaf's bounding rectangle. Each channel
       mean is truncated to an integer and then divided by 255.
    
    The leaf is located by reducing the image to K colors with K-Means, whitening the clusters whose
    strongest channel is blue (shadows), applying inverted Otsu thresholding and picking the external
    contour with the largest area.
    
    arguments:
     image_path - string containing path to leaf image file.
    returns:
     namedtuple object containing extracted features of the leaf.
    raises:
     ValueError - if the image file cannot be read, or if no contour is found in the thresholded image.
    """
    # Read image from image file (cv2.imread returns None instead of raising when reading fails)
    bgr_image = cv2.imread(image_path)
    if bgr_image is None:
        raise ValueError('Could not read image file: {}'.format(image_path))
    # Obtain RGB image (OpenCV loads images in BGR channel order)
    rgb_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    # Apply K-Means to reduce color space in image
    img_pixels = np.float32(rgb_image.reshape((-1, 3)))
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
    K = 10 # no of clusters
    ret, labels, centers = cv2.kmeans(img_pixels, K, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
    centers = np.uint8(centers)
    # Set shadow bluish colors to white. Every pixel of the clustered image is exactly one of the K
    # centers, so whitening the centers before mapping them onto the pixels gives the same image as
    # testing each pixel individually, without a Python loop over all pixels.
    bluish = centers[:, 2] == centers.max(axis=1)
    centers[bluish] = 255
    cluster_img = centers[labels.flatten()].reshape(rgb_image.shape)
    # Apply inverted binary Otsu thresholding to grayscale image
    gray_image = cv2.cvtColor(cluster_img, cv2.COLOR_RGB2GRAY)
    otsu_value, thresh_image = cv2.threshold(gray_image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    # Find all contours in thresholded image using RETR_EXTERNAL method.
    # findContours returns (contours, hierarchy) in OpenCV 4.x but (image, contours, hierarchy) in 3.x;
    # the contours are the second-to-last element in both.
    image_contours = cv2.findContours(thresh_image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    if len(image_contours) == 0:
        raise ValueError('No leaf contour found in image: {}'.format(image_path))
    # Find leaf contour among all contours
    leaf_contour = max(image_contours, key=cv2.contourArea) # Contour having maximum area is our leaf contour
    
    # FEATURE - Aspect Ratio
    x, y, w, h = cv2.boundingRect(leaf_contour)
    aspectratio = h / w
    
    # FEATURE - Area
    area = cv2.contourArea(leaf_contour)
    area_ratio = area / (w * h)
    
    # FEATURE - Perimeter
    perimeter = cv2.arcLength(leaf_contour, True)
    perimeter_ratio = perimeter / (2 * (w + h))
    
    # FEATURE - Form Factor
    formfactor = (4 * np.pi * area) / perimeter ** 2
    
    # FEATURE - Mean Color
    # Cropping rgb image using bounding rectangle of leaf contour
    cropped_image = rgb_image[y:y+h, x:x+w]
    # Channel means are truncated to int before normalizing, as in the original feature definition
    meancolor = tuple(int(np.mean(cropped_image[:, :, channel])) / 255 for channel in range(3))
    
    # Create and return namedtuple containing extracted features
    Feature = namedtuple('Feature', ['aspectratio', 'area', 'perimeter', 'formfactor', 'meancolor'])
    return Feature(
        aspectratio=aspectratio,
        area=area_ratio,
        perimeter=perimeter_ratio,
        formfactor=formfactor,
        meancolor=meancolor)

### Prepare dataset directories and image file paths

In [3]:
# Name of the folder holding the leaves dataset, resolved against the current working directory
dataset = 'MangoLeavesDatabase'
working_dir = os.getcwd()
dataset_dir = os.path.join(working_dir, dataset)
# One sub-folder per mango variety (for now only leaf front images are used)
varieties = ('alphonso', 'amrapali', 'chausa', 'dusheri', 'langra')
# Map each variety name to the path of its folder (kept with a trailing slash)
paths = {variety: os.path.join(dataset_dir, variety + '/') for variety in varieties}
# Map each variety name to the list of '.jpg' image paths found in its folder
image_dict = {}
for label, path in paths.items():
    image_dict[label] = utils.get_file_paths(path, ['.jpg'])
# Location of the CSV file the labeled features are written to
csv_file_output_path = os.path.join(dataset_dir, 'labeled_dataset.csv')

### Generate Pandas DataFrame containing extracted features for each image file. If M features are extracted from N images then DataFrame will be of N x M dimension.

In [4]:
# Columns of the data
cols = ['aspectratio', 'area', 'perimeter', 'formfactor', 'meanR', 'meanG', 'meanB', 'label']
# Extract image features of each image of each variety.
# Rows are collected in a plain list and the DataFrame is built once at the end: DataFrame.append
# was deprecated in pandas 1.4 and removed in pandas 2.0, and it copied the whole frame on every call.
rows = []
for label, path_list in image_dict.items():
    for image_path in path_list:
        features = extract_features(image_path)
        mean_r, mean_g, mean_b = features.meancolor
        # Values are listed in the same order as cols
        rows.append((
            features.aspectratio,
            features.area,
            features.perimeter,
            features.formfactor,
            mean_r,
            mean_g,
            mean_b,
            label,
        ))
# N x M dataframe: one row per image, one column per feature plus the label.
# With no images this is still an empty dataframe with the columns above.
data = pd.DataFrame(rows, columns=cols)

### Export Data to CSV File

In [5]:
# Randomly reorder all rows (frac=1 samples every row), then renumber the index from 0
shuffled = data.sample(frac=1)
data = shuffled.reset_index(drop=True)
# Write the shuffled data to the CSV file without the index column
data.to_csv(csv_file_output_path, index=False)