In [32]:
#| default_exp features

In [53]:
#|export
import sys
import os

# Path to the project root (one level above notebooks/).
# NOTE: ".." is resolved against the current working directory, so this
# assumes the kernel/process was started from inside notebooks/.
project_root = os.path.abspath("..")

# Add to Python path only once: re-running this cell (or re-importing the
# exported module) must not pile up duplicate sys.path entries.
if project_root not in sys.path:
    sys.path.append(project_root)

import weedcrop.dataset as weedcrop_ds
import cv2
from skimage.feature import hog, local_binary_pattern

from PIL import Image

In [66]:
# Build the dataset listing with the project's weedcrop.dataset helper.
# NOTE(review): find_min_dim() below reads entry[0] as an image path, so each
# entry is presumably (image_path, label, ...) — confirm against the helper.
ds_list = weedcrop_ds.create_dataset_list()
ds_list;  # trailing semicolon suppresses the cell's rich output

---

# Resize images to dimensions

In [61]:
#|export
# Module-level fallback resolution; find_min_dim() returns the same values
# when it is given an empty dataset.
min_width, min_height = 2000, 2000

def find_min_dim(ds):
    """
    Finds the smallest width and height in the dataset using PIL.

    Parameters
    ----------
    ds : iterable
        Dataset entries. The first element of each entry (``entry[0]``) is
        handed to ``PIL.Image.open``.

    Returns
    -------
    tuple
        ``(min_width, min_height)``. The two minima are taken independently,
        so they may come from different images. For an empty dataset the
        fallback ``(2000, 2000)`` is returned.
    """
    widths, heights = [], []
    for entry in ds:
        # The context manager closes the underlying file handle right away,
        # so large datasets do not exhaust open file descriptors.
        with Image.open(entry[0]) as img:
            width, height = img.size
        widths.append(width)
        heights.append(height)

    if widths:
        # Take the true minimum; the previous 2000 px sentinel silently capped
        # the result whenever every image was larger than 2000 px.
        min_width, min_height = min(widths), min(heights)
    else:
        min_width, min_height = 2000, 2000

    print(f"Minimum resolution: {min_width, min_height}")
    return min_width, min_height

In [64]:
#|export
def resize_img_to_dim(img_file_path, width, height):
    """
    Loads image with CV2, converts to RGB, and resizes.

    Parameters
    ----------
    img_file_path : str
        Path of the image file, passed to ``cv2.imread``.
    width, height : int
        Target size in pixels; handed to ``cv2.resize`` as ``(width, height)``.

    Returns
    -------
    The resized image in RGB channel order, as returned by ``cv2.resize``.

    Raises
    ------
    OSError
        If ``cv2.imread`` cannot read the file (it returns ``None`` then).
    """
    img_bgr = cv2.imread(img_file_path) # CV2 reads in BGR by default
    if img_bgr is None:
        # cv2.imread reports a missing/unreadable/undecodable file by returning
        # None instead of raising; fail here with the offending path rather
        # than with an opaque OpenCV assertion inside cvtColor.
        raise OSError(f"Could not read image file: {img_file_path!r}")

    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    img_resized = cv2.resize(img_rgb, (width, height))
    return img_resized

---

# Histogram of Oriented Gradients

For each image in the dataset:
1. Load image
2. Convert the image to grayscale
3. Resize the image to match min resolution
4. Extract HOG
5. Append to list with features

In [62]:
#|export
from skimage.feature import hog
from skimage.color import rgb2gray

def extract_hog_features_from_list(X_images_rgb, y_labels):
    """
    Computes one HOG descriptor per image.

    Each RGB image is converted to grayscale with skimage's ``rgb2gray`` and
    then described with ``hog`` (9 orientations, 8x8-pixel cells, 2x2-cell
    blocks). The images are expected to be resized to a common resolution
    beforehand so that all descriptors have the same length.

    Returns a tuple ``(features, labels)`` where both are NumPy arrays;
    ``y_labels`` is passed through unchanged apart from the array conversion.
    """
    # Fixed HOG configuration shared by every image.
    hog_params = dict(
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        visualize=False,
    )

    # Grayscale conversion and HOG extraction, one descriptor per input image.
    descriptors = [
        hog(rgb2gray(rgb_image), **hog_params)
        for rgb_image in X_images_rgb
    ]

    return np.array(descriptors), np.array(y_labels)

In [36]:
%%time
X_hog, y_hog = extract_hog_features(ds_list)

Minimum resolution: (360, 360)
CPU times: user 37.7 s, sys: 1.03 s, total: 38.7 s
Wall time: 37.5 s


In [37]:
# Sanity check: there should be exactly one HOG feature vector per label.
len(X_hog), len(y_hog)

(1176, 1176)

In [42]:
# Peek at the first HOG feature vector together with its label.
X_hog[0], y_hog[0]

(array([0.28475655, 0.        , 0.02391007, ..., 0.18594874, 0.25146204,
        0.25146204]),
 'weed')

---

# Local Binary Pattern - LBP

For each image in the dataset:
1. Load image
2. Convert the image to grayscale
3. Resize the image to match min resolution
4. Extract LBP
5. Get the histogram of LBP
6. Append to list with features

In [63]:
#|export
from skimage import feature, color
import numpy as np

def extract_lbp_features_from_list(X_images_rgb, y_labels, P=8, R=1):
    """
    Standardized LBP extraction using pre-resized RGB images.

    Parameters
    ----------
    X_images_rgb : iterable
        RGB images; each one is converted to grayscale with
        ``skimage.color.rgb2gray``.
    y_labels : sequence
        Labels, returned unchanged apart from the conversion to a NumPy array.
    P : int, optional
        Number of circularly symmetric neighbour points (default 8).
    R : float, optional
        Radius of the neighbourhood circle (default 1).

    Returns
    -------
    tuple
        ``(features, labels)``. ``features`` has one row per image holding the
        density-normalized histogram of the uniform LBP codes (``P + 2`` bins,
        i.e. 10 bins for the default ``P=8``).
    """
    # 'uniform' LBP produces the integer codes 0 .. P+1, so P+2 unit-wide bins
    # (edges 0 .. P+2) cover every possible code.
    bin_edges = np.arange(0, P + 3)

    X_lbp_features = []

    for img_rgb in X_images_rgb:
        # 1. Convert our standard RGB to Gray
        # Using skimage.color ensures consistency across the whole project
        img_gray = color.rgb2gray(img_rgb)

        # 2. Quantize to 8-bit integers before LBP. scikit-image recommends
        # integer images here: on floats, tiny numerical differences between
        # neighbouring pixels flip comparison bits and add noise to the codes.
        # Assumes rgb2gray returned floats in [0, 1]; values are clipped anyway.
        img_gray_u8 = np.clip(np.rint(img_gray * 255), 0, 255).astype(np.uint8)

        # 3. Extract LBP
        lbp = feature.local_binary_pattern(img_gray_u8, P=P, R=R, method='uniform')

        # 4. Create Histogram (P + 2 bins for uniform LBP)
        lbp_hist, _ = np.histogram(lbp.ravel(), bins=bin_edges, density=True)

        X_lbp_features.append(lbp_hist)

    return np.array(X_lbp_features), np.array(y_labels)

In [40]:
%%time
X_lbp, y_lbp = extract_hog_features(ds_list)

Minimum resolution: (360, 360)
CPU times: user 37.8 s, sys: 1.05 s, total: 38.9 s
Wall time: 37.7 s


In [43]:
# Sanity check: there should be exactly one LBP feature vector per label.
len(X_lbp), len(y_lbp)

(1176, 1176)

---

# Feature fusion

In [45]:
#|export
def feature_fusion(super_matrix=None, feature_list=None):
    """
    Fuses a list of features into a super-matrix.

    Parameters
    ----------
    super_matrix : array-like or None, optional
        Existing fused matrix to extend. When ``None``, the result is built
        from ``feature_list`` alone.
    feature_list : iterable of array-like or None, optional
        Feature blocks to stack horizontally (``np.hstack``), in order.
        ``None`` is treated as "no new features".

    Returns
    -------
    numpy.ndarray
        ``super_matrix`` (if given) followed by every block in
        ``feature_list``, stacked column-wise.

    Raises
    ------
    ValueError
        If neither ``super_matrix`` nor any feature block is supplied, or if
        the blocks have incompatible shapes (raised by ``np.hstack``).
    """
    # None instead of a mutable [] default; the conversion is done only once.
    new_features = [np.asarray(f) for f in (feature_list if feature_list is not None else [])]

    if super_matrix is None:
        if not new_features:
            # np.hstack([]) would fail with an unrelated-looking message.
            raise ValueError(
                "feature_fusion needs a super_matrix or at least one feature block"
            )
        return np.hstack(new_features)

    return np.hstack([super_matrix] + new_features)

---

# HSV Histograms

HOG and LBP capture Shape and Texture, but they ignore Color. In weed-crop datasets, color is often the most discriminative feature (e.g., specific shades of green or the absence of green in soil).

Objects are represented by RGB values.

In RGB space, hue and luminosity are represented as a linear combination of the RGB channels, whereas in HSV space each of them corresponds to a single channel.

Image segmentation can be performed by value thresholding of the HSV values.
    
Ref: https://scikit-image.org/docs/0.25.x/auto_examples/color_exposure/plot_rgb_to_hsv.html

In [68]:
#|export
from skimage.color import rgb2hsv
import numpy as np

def extract_hsv_features_from_list(X_images_resized, bins=16):
    """
    Builds one colour descriptor per image from its H, S and V histograms.

    Every RGB image is converted with ``rgb2hsv``; each of the three HSV
    channels is histogrammed over the range [0, 1] with ``bins`` bins and
    ``density=True``, and the three histograms are concatenated (H, then S,
    then V) into a flat vector of length ``3 * bins``.

    Returns a NumPy array with one descriptor per input image.
    """
    def _channel_density(channel):
        # density=True normalizes the histogram so image size does not matter.
        density, _edges = np.histogram(channel, bins=bins, range=(0, 1), density=True)
        return density

    descriptors = []
    for rgb_image in X_images_resized:
        hsv_image = rgb2hsv(rgb_image)

        # Channel index 0 = hue, 1 = saturation, 2 = value.
        per_channel = [_channel_density(hsv_image[:, :, idx]) for idx in range(3)]
        descriptors.append(np.concatenate(per_channel))

    return np.array(descriptors)

In [70]:
#|export
from skimage.color import rgb2hsv
import numpy as np

def extract_masked_hsv_features(X_images_resized, bins=16, hue_range=(0.2, 0.45), min_value=0.1):
    """
    Extracts HSV histograms ONLY from pixels identified as 'Plant'
    based on Hue and Value thresholds.

    Parameters
    ----------
    X_images_resized : iterable
        RGB images; each one is converted with ``rgb2hsv``.
    bins : int, optional
        Number of histogram bins per channel (default 16).
    hue_range : tuple of float, optional
        Exclusive ``(low, high)`` hue bounds of the plant mask. The default
        ``(0.2, 0.45)`` selects 'green' hues.
    min_value : float, optional
        Exclusive lower bound on V used to skip dark shadows (default 0.1).

    Returns
    -------
    numpy.ndarray
        One row per image: the H, S and V density histograms of the masked
        pixels concatenated into a vector of length ``3 * bins``. Images with
        no masked pixel yield an all-zero row.
    """
    hue_low, hue_high = hue_range
    X_hsv_features = []

    for img_rgb in X_images_resized:
        # 1. Convert to HSV
        hsv_img = rgb2hsv(img_rgb)
        h, s, v = hsv_img[:, :, 0], hsv_img[:, :, 1], hsv_img[:, :, 2]

        # 2. "Plant discovery" mask: hue strictly inside hue_range and
        # brightness strictly above min_value.
        mask = (h > hue_low) & (h < hue_high) & (v > min_value)

        if mask.any():
            # 3. Boolean indexing collapses each 2D channel into the 1D list of
            # plant pixels, which is then histogrammed.
            channel_hists = [
                np.histogram(channel[mask], bins=bins, range=(0, 1), density=True)[0]
                for channel in (h, s, v)
            ]
        else:
            # 4. Edge case: no plant detected. A density histogram of zero
            # samples would be NaN, so emit zeros instead.
            channel_hists = [np.zeros(bins) for _ in range(3)]

        # 5. Combine and append
        X_hsv_features.append(np.concatenate(channel_hists))

    return np.array(X_hsv_features)

---

# Data-Level: Augmentation (Not just SMOTE)

"Feature engineering on the smaller class to increase its size" is exactly what you should do, but be careful with SMOTE. SMOTE creates "average" versions of your 60 samples in feature space, which can sometimes just create noise.

Better approach: Image Augmentation. Before extracting HOG/LBP, take your 60 crop images and create 10 new versions of each by:

    Rotating them (90°, 180°, 270°).

    Flipping them horizontally and vertically.

    Adjusting Brightness slightly.

This turns your 60 samples into 600 samples of real visual data. Extract HOG/LBP from these augmented images. This is much more robust for computer vision than purely mathematical oversampling.