# Feature Extraction Pipeline

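This notebook extracts two families of features from plant images for disease detection and analysis: traditional computer-vision descriptors (color, texture, and shape) and deep embeddings from pre-trained VGG16, ResNet50, and EfficientNetB0 networks.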

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.applications import VGG16, ResNet50, EfficientNetB0
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input as vgg_preprocess
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.efficientnet import preprocess_input as efficientnet_preprocess
from skimage.feature import local_binary_pattern, greycomatrix, greycoprops
from skimage.measure import label, regionprops
from skimage import filters, exposure
import pickle
from tqdm import tqdm
import json

# Set paths
# NOTE(review): PROJECT_ROOT is a machine-specific absolute path; adjust it
# when running this notebook on another machine.
PROJECT_ROOT = Path('/Users/debabratapattnayak/web-dev/greencast')
PROCESSED_DATA_PATH = PROJECT_ROOT / 'processed_data'
FEATURES_PATH = PROJECT_ROOT / 'features'
# parents=True: without it mkdir raises FileNotFoundError when PROJECT_ROOT
# itself is missing, before the friendlier "data not found" check can run.
FEATURES_PATH.mkdir(parents=True, exist_ok=True)

## Traditional Computer Vision Features

In [None]:
class TraditionalFeatureExtractor:
    """Extract traditional computer vision features from images.

    All methods take an image array that is handed to OpenCV with RGB
    conversion codes, so a 3-channel RGB array is expected
    (assumes uint8 pixel data -- TODO confirm against callers).

    Feature layout produced by ``extract_all_features`` (94 values):
      * 24 color features   (15 RGB stats, 6 HSV stats, 3 channel ratios)
      * 66 texture features (26-bin LBP histogram, 40 GLCM properties)
      *  4 shape features   (circularity, aspect ratio, extent, solidity)
    """

    # GLCM properties, in the order they are emitted for every offset.
    _GLCM_PROPERTIES = (
        'contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation',
    )

    def __init__(self):
        # Placeholder for feature names; nothing in this class fills it.
        self.feature_names = []

    @staticmethod
    def _rescale_to_uint8(gray):
        """Stretch ``gray`` linearly to the 0-255 range and return it as uint8.

        A constant image has zero dynamic range; dividing by it would give
        0/0 = NaN and an undefined uint8 cast, so an all-zero image is
        returned in that case instead.
        """
        lo = float(gray.min())
        hi = float(gray.max())
        if hi == lo:
            return np.zeros(gray.shape, dtype=np.uint8)
        scaled = (gray.astype(np.float64) - lo) / (hi - lo) * 255
        # Truncating cast (not rounding), matching the historic feature values.
        return scaled.astype(np.uint8)

    def extract_color_features(self, image):
        """Extract color-based features.

        Returns a list of 24 values: mean/std/median/25th/75th percentile for
        each RGB channel, mean/std for each HSV channel, and the G/R, B/G and
        R/B ratios of the channel means.
        """
        features = []

        hsv = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)

        # RGB statistics
        for i in range(3):
            channel_data = image[:, :, i]
            features.extend([
                np.mean(channel_data),
                np.std(channel_data),
                np.median(channel_data),
                np.percentile(channel_data, 25),
                np.percentile(channel_data, 75),
            ])

        # HSV statistics
        for i in range(3):
            channel_data = hsv[:, :, i]
            features.extend([
                np.mean(channel_data),
                np.std(channel_data),
            ])

        # Color ratios; the small epsilon avoids division by zero when a
        # channel is entirely black.
        r_mean, g_mean, b_mean = (np.mean(image[:, :, i]) for i in range(3))
        features.extend([
            g_mean / (r_mean + 1e-8),  # G/R ratio
            b_mean / (g_mean + 1e-8),  # B/G ratio
            r_mean / (b_mean + 1e-8),  # R/B ratio
        ])

        return features

    def extract_texture_features(self, image):
        """Extract texture-based features.

        Returns a list of 66 values: a normalized 26-bin uniform-LBP histogram
        followed by five GLCM properties for each (distance, angle) offset,
        ordered distance-major then angle.
        """
        features = []

        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

        # Local Binary Pattern: 'uniform' LBP with P points yields P + 2
        # distinct codes, hence P + 2 histogram bins.
        radius = 3
        n_points = 8 * radius
        lbp = local_binary_pattern(gray, n_points, radius, method='uniform')
        lbp_hist, _ = np.histogram(lbp.ravel(), bins=n_points + 2,
                                   range=(0, n_points + 2), density=True)
        features.extend(lbp_hist)

        # Gray Level Co-occurrence Matrix (GLCM) on a contrast-stretched image
        gray_norm = self._rescale_to_uint8(gray)

        distances = [1, 2]
        angles = [0, 45, 90, 135]

        # One call covers every (distance, angle) pair; the result is indexed
        # [level, level, distance, angle] and each offset is normalized
        # independently, so values match a per-offset computation.
        # NOTE(review): recent scikit-image releases spell these functions
        # graycomatrix/graycoprops -- confirm against the installed version.
        glcm = greycomatrix(gray_norm, distances,
                            [np.radians(angle) for angle in angles],
                            levels=256, symmetric=True, normed=True)
        props = {name: greycoprops(glcm, name) for name in self._GLCM_PROPERTIES}

        for d_idx in range(len(distances)):
            for a_idx in range(len(angles)):
                features.extend(
                    props[name][d_idx, a_idx] for name in self._GLCM_PROPERTIES
                )

        return features

    def extract_shape_features(self, image):
        """Extract shape-based features.

        The image is Otsu-thresholded and the largest external contour is
        measured. Returns ``[circularity, aspect_ratio, extent, solidity]``;
        all four are 0 when no contour is found.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

        # Apply Otsu's thresholding
        _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

        contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

        if not contours:
            # No contours found, return default values
            return [0, 0, 0, 0]

        # Use the largest contour (assumed to be the main object)
        largest_contour = max(contours, key=cv2.contourArea)
        area = cv2.contourArea(largest_contour)
        perimeter = cv2.arcLength(largest_contour, True)

        # Circularity: 1.0 for a perfect circle, smaller for irregular shapes
        circularity = 4 * np.pi * area / (perimeter * perimeter) if perimeter > 0 else 0

        # Aspect ratio of the bounding rectangle
        x, y, w, h = cv2.boundingRect(largest_contour)
        aspect_ratio = float(w) / h if h > 0 else 0

        # Extent (ratio of contour area to bounding rectangle area)
        rect_area = w * h
        extent = float(area) / rect_area if rect_area > 0 else 0

        # Solidity (ratio of contour area to convex hull area)
        hull_area = cv2.contourArea(cv2.convexHull(largest_contour))
        solidity = float(area) / hull_area if hull_area > 0 else 0

        return [circularity, aspect_ratio, extent, solidity]

    def extract_all_features(self, image):
        """Extract all traditional features from an image.

        Returns a 1-D numpy array with the color, texture and shape features
        concatenated in that order.
        """
        features = []
        features.extend(self.extract_color_features(image))
        features.extend(self.extract_texture_features(image))
        features.extend(self.extract_shape_features(image))
        return np.array(features)

# Initialize the module-level traditional feature extractor; the dataset
# extraction function later in this notebook reads this global by name.
traditional_extractor = TraditionalFeatureExtractor()
print("Traditional feature extractor initialized!")

## Deep Learning Feature Extraction

In [None]:
class DeepFeatureExtractor:
    """Use pre-trained networks as fixed feature extractors.

    Instantiating the class immediately loads every supported model into
    ``self.models``, keyed by 'vgg16', 'resnet50' and 'efficientnet'.
    """

    def __init__(self):
        self.models = {}
        self.load_models()

    def load_models(self):
        """Build each headless, average-pooled ImageNet model and store it."""
        print("Loading pre-trained models...")

        # Every backbone is configured identically.
        shared_kwargs = dict(
            weights='imagenet',
            include_top=False,
            pooling='avg',
            input_shape=(224, 224, 3),
        )
        backbones = (
            ('vgg16', VGG16),
            ('resnet50', ResNet50),
            ('efficientnet', EfficientNetB0),
        )
        for key, build in backbones:
            self.models[key] = build(**shared_kwargs)

        print("Models loaded successfully!")

    def preprocess_image(self, img, model_name):
        """Return ``img`` as a single-image batch prepared for ``model_name``.

        The image is resized to 224x224 if its shape is not already
        (224, 224, 3), given a leading batch axis, and passed through the
        matching Keras ``preprocess_input``. Unrecognised model names fall
        back to dividing by 255.
        """
        target_shape = (224, 224, 3)
        if img.shape != target_shape:
            img = cv2.resize(img, target_shape[:2])

        # Leading axis of length 1 = batch dimension
        batch = img[np.newaxis, ...]

        # A copy is handed to each preprocess function, leaving the caller's
        # array untouched.
        if model_name == 'vgg16':
            return vgg_preprocess(batch.copy())
        if model_name == 'resnet50':
            return resnet_preprocess(batch.copy())
        if model_name == 'efficientnet':
            return efficientnet_preprocess(batch.copy())
        return batch / 255.0

    def extract_features(self, image, model_name='vgg16'):
        """Run ``image`` through one model and return its output flattened to 1-D.

        Raises:
            ValueError: if ``model_name`` is not a key of ``self.models``.
        """
        if model_name not in self.models:
            raise ValueError(f"Model {model_name} not available")

        batch = self.preprocess_image(image, model_name)
        return self.models[model_name].predict(batch, verbose=0).flatten()

    def extract_all_model_features(self, image):
        """Return ``{model_name: feature_vector}`` for every loaded model."""
        return {
            name: self.extract_features(image, name)
            for name in self.models
        }

# Initialize the module-level deep feature extractor. Construction calls
# load_models(), which instantiates all three ImageNet-weighted networks,
# so this cell can take a while.
deep_extractor = DeepFeatureExtractor()
print("Deep feature extractor initialized!")

## Feature Extraction Pipeline

In [None]:
def _list_labeled_images(dataset_path):
    """Return parallel lists (image_paths, labels) found under ``dataset_path``.

    Each immediate sub-directory is one class; files in it ending in
    .jpg/.jpeg/.png/.bmp (case-insensitive) are collected. Listings are
    sorted so the result order does not depend on the filesystem.
    """
    image_paths = []
    labels = []
    for class_name in sorted(os.listdir(dataset_path)):
        class_path = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_path):
            continue
        for img_file in sorted(os.listdir(class_path)):
            if img_file.lower().endswith(('.jpg', '.jpeg', '.png', '.bmp')):
                image_paths.append(os.path.join(class_path, img_file))
                labels.append(class_name)
    return image_paths, labels


def _dump_feature_file(file_path, features, labels, paths):
    """Pickle one feature matrix together with its labels and source paths."""
    with open(file_path, 'wb') as f:
        pickle.dump({
            'features': features,
            'labels': labels,
            'paths': paths
        }, f)


def extract_features_from_dataset(dataset_path, output_path, sample_size=None):
    """Extract features from an entire dataset and save them to disk.

    Uses the module-level ``traditional_extractor`` and ``deep_extractor``.

    Args:
        dataset_path: directory containing one sub-directory per class.
        output_path: directory to write results to (created if missing).
        sample_size: if truthy and smaller than the number of images found,
            a random subset of that size is drawn without replacement.

    Returns:
        The metadata dict that is also written to ``metadata.json``: image
        and class counts, the sorted class names, and the feature dimension
        of each feature set (0 when no image could be processed).

    Side effects:
        Writes ``traditional_features.pkl``, one ``<model>_features.pkl`` per
        deep model, and ``metadata.json`` into ``output_path``.
    """
    image_paths, labels = _list_labeled_images(dataset_path)

    # Sample if requested
    if sample_size and len(image_paths) > sample_size:
        indices = np.random.choice(len(image_paths), sample_size, replace=False)
        image_paths = [image_paths[i] for i in indices]
        labels = [labels[i] for i in indices]

    print(f"Extracting features from {len(image_paths)} images...")

    # Feature storage; deep-feature keys follow whatever models are loaded.
    traditional_features = []
    deep_features = {model_name: [] for model_name in deep_extractor.models}
    valid_labels = []
    valid_paths = []

    labeled_images = list(zip(image_paths, labels))
    for img_path, label in tqdm(labeled_images, desc="Extracting features"):
        try:
            img = cv2.imread(img_path)
            if img is None:
                print(f"Skipping unreadable image: {img_path}")
                continue

            # OpenCV loads BGR; both extractors are fed RGB at 224x224.
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img_resized = cv2.resize(img_rgb, (224, 224))

            trad_feat = traditional_extractor.extract_all_features(img_resized)
            deep_feat = deep_extractor.extract_all_model_features(img_resized)
        except Exception as e:
            # Best-effort: one bad image must not abort the whole run.
            print(f"Error processing {img_path}: {e}")
            continue

        # Record results only after every extractor succeeded, so feature
        # rows, labels and paths always stay aligned.
        traditional_features.append(trad_feat)
        for model_name, features in deep_feat.items():
            deep_features[model_name].append(features)
        valid_labels.append(label)
        valid_paths.append(img_path)

    # Convert to numpy arrays
    traditional_features = np.array(traditional_features)
    deep_features = {
        model_name: np.array(features)
        for model_name, features in deep_features.items()
    }

    # Save features
    output_path = Path(output_path)
    output_path.mkdir(parents=True, exist_ok=True)

    _dump_feature_file(output_path / 'traditional_features.pkl',
                       traditional_features, valid_labels, valid_paths)
    for model_name, features in deep_features.items():
        _dump_feature_file(output_path / f'{model_name}_features.pkl',
                           features, valid_labels, valid_paths)

    def feature_dim(matrix):
        # An empty result is a 1-D array of length 0 and has no column axis.
        return matrix.shape[1] if matrix.ndim == 2 else 0

    # Save metadata; classes are sorted so the file is stable between runs.
    classes = sorted(set(valid_labels))
    metadata = {
        'num_images': len(valid_labels),
        'num_classes': len(classes),
        'classes': classes,
        'traditional_feature_dim': feature_dim(traditional_features),
        'deep_feature_dims': {name: feature_dim(feat) for name, feat in deep_features.items()}
    }

    with open(output_path / 'metadata.json', 'w') as f:
        json.dump(metadata, f, indent=2)

    print("\nFeature extraction complete!")
    print(f"Processed {len(valid_labels)} images")
    print(f"Traditional features shape: {traditional_features.shape}")
    for model_name, features in deep_features.items():
        print(f"{model_name} features shape: {features.shape}")

    return metadata

# Run the pipeline on the processed training split, provided it exists on disk
processed_train_path = PROCESSED_DATA_PATH / 'plantvillage_color' / 'train'
if not processed_train_path.exists():
    print("Processed training data not found. Please run the preprocessing notebook first.")
else:
    print("Extracting features from training set...")
    train_metadata = extract_features_from_dataset(
        dataset_path=processed_train_path,
        output_path=FEATURES_PATH / 'train',
        sample_size=1000,  # Sample for demonstration
    )