# Feature extraction
The feature extraction pipeline consists of two main functions. `extract_enhanced_features` processes an individual image and extracts a comprehensive set of visual characteristics: shape metrics (symmetry, contours), texture patterns (using LBP and GLCM), edge information (using Sobel and Canny), and color features across multiple color spaces (RGB, HSV, LAB). `process_all_images` applies this extraction to our entire dataset of 42,000 images, writing a checkpoint every 1,000 images and recording any failures instead of aborting, and ultimately produces a feature matrix that maps each image's visual characteristics to its product category.


In [None]:
def _shape_features(gray_u8, aspect_ratio):
    """Build symmetry scores and largest-contour metrics for a grayscale image."""
    # Work in float: subtracting uint8 arrays wraps modulo 256 (10 - 20 == 246)
    # and np.abs() cannot undo that, which corrupts every symmetry score.
    gray = gray_u8.astype(np.float64)
    half_h, half_w = gray.shape[0] // 2, gray.shape[1] // 2

    shape_features = {
        'aspect_ratio': aspect_ratio,
        'vertical_symmetry': np.mean(np.abs(gray - np.flipud(gray))),
        'horizontal_symmetry': np.mean(np.abs(gray - np.fliplr(gray))),
        'diagonal_symmetry': np.mean(np.abs(gray - np.rot90(gray, 2))),
        'quadrant_symmetry': np.mean(
            np.abs(gray[:half_h, :half_w] - gray[half_h:, half_w:])
        ),
        # Defaults keep the feature schema identical for every image, even
        # when no contour is found.
        'contour_area': 0.0,
        'contour_perimeter': 0.0,
        'contour_circularity': 0.0,
    }

    # NOTE(review): the grayscale image is passed without thresholding, so the
    # contour presumably outlines all non-zero pixels -- confirm this is the
    # intended segmentation.
    contours, _ = cv2.findContours(
        gray_u8, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )
    if contours:
        main_contour = max(contours, key=cv2.contourArea)
        area = cv2.contourArea(main_contour)
        perimeter = cv2.arcLength(main_contour, True)
        shape_features.update({
            'contour_area': area,
            'contour_perimeter': perimeter,
            'contour_circularity': (
                (4 * np.pi * area) / (perimeter ** 2) if perimeter > 0 else 0
            ),
        })

    return shape_features


def _texture_features(gray_u8):
    """Build LBP summary statistics and GLCM properties at four orientations."""
    lbp = feature.local_binary_pattern(gray_u8, P=8, R=1)
    texture_features = {
        'texture_mean': lbp.mean(),
        'texture_var': lbp.var(),
        'texture_uniformity': len(np.unique(lbp)) / lbp.size,
    }

    # Feature names stay in degrees, but graycomatrix expects radians.
    angles_deg = (0, 45, 90, 135)
    # Only distance 1 is ever read below, so only that distance is computed.
    glcm = graycomatrix(
        gray_u8, distances=[1], angles=np.deg2rad(angles_deg), normed=True
    )

    # Compute each property once instead of once per (property, angle) pair.
    prop_names = ('contrast', 'homogeneity', 'energy', 'correlation')
    props = {name: graycoprops(glcm, name)[0] for name in prop_names}
    for angle_idx, angle in enumerate(angles_deg):
        for name in prop_names:
            texture_features[f'glcm_{name}_{angle}'] = props[name][angle_idx]

    return texture_features


def _edge_features(gray_u8):
    """Build Sobel/Canny edge statistics and an edge-direction histogram."""
    # ndimage.sobel returns the input dtype, so uint8 input yields wrapped
    # gradients (and wrapped squares); compute the gradients in float instead.
    gray = gray_u8.astype(np.float64)
    sobel_h = ndimage.sobel(gray, axis=0)
    sobel_v = ndimage.sobel(gray, axis=1)
    edge_magnitude = np.hypot(sobel_h, sobel_v)
    # Canny still receives the uint8 image, exactly as before.
    canny_edges = feature.canny(gray_u8, sigma=1.0)

    edge_features = {
        'edge_density': np.mean(edge_magnitude),
        'edge_variance': np.var(edge_magnitude),
        'horizontal_edges': np.mean(np.abs(sobel_h)),
        'vertical_edges': np.mean(np.abs(sobel_v)),
        'canny_edge_density': np.mean(canny_edges),
        'edge_magnitude_mean': np.mean(edge_magnitude),
        'edge_magnitude_std': np.std(edge_magnitude),
    }

    # Histogram of gradient directions, restricted to above-average edges.
    edge_angles = np.arctan2(sobel_v, sobel_h) * 180 / np.pi
    strong_edges = edge_magnitude > edge_magnitude.mean()
    hist, _ = np.histogram(edge_angles[strong_edges], bins=12, range=(-180, 180))
    for i, count in enumerate(hist):
        edge_features[f'edge_direction_bin_{i}'] = count
    # +1 smoothing keeps the entropy defined when some bins are empty.
    edge_features['edge_direction_entropy'] = stats.entropy(hist + 1)

    return edge_features


def _color_features(img_array):
    """Build per-channel statistics in RGB, HSV and LAB plus RGB histograms."""
    color_features = {}

    # RGB: mean, std, skew and kurtosis per channel.
    for idx, channel in enumerate(['red', 'green', 'blue']):
        channel_data = img_array[:, :, idx]
        flat = channel_data.flatten()
        color_features.update({
            f'mean_{channel}': channel_data.mean(),
            f'std_{channel}': channel_data.std(),
            f'skew_{channel}': stats.skew(flat),
            f'kurtosis_{channel}': stats.kurtosis(flat),
        })

    # HSV and LAB: mean, std and kurtosis per channel (no skew).
    converted_spaces = (
        (cv2.COLOR_RGB2HSV, ['hue', 'saturation', 'value']),
        (cv2.COLOR_RGB2LAB, ['l', 'a', 'b']),
    )
    for conversion, channels in converted_spaces:
        converted = cv2.cvtColor(img_array, conversion)
        for idx, channel in enumerate(channels):
            channel_data = converted[:, :, idx]
            color_features.update({
                f'mean_{channel}': channel_data.mean(),
                f'std_{channel}': channel_data.std(),
                f'kurtosis_{channel}': stats.kurtosis(channel_data.flatten()),
            })

    # Coarse 8-bin intensity histogram per RGB channel.
    for idx, channel in enumerate(['red', 'green', 'blue']):
        hist, _ = np.histogram(img_array[:, :, idx], bins=8, range=(0, 256))
        for bin_idx, count in enumerate(hist):
            color_features[f'{channel}_hist_bin_{bin_idx}'] = count

    return color_features


def extract_enhanced_features(image):
    """Extract shape, texture, edge and color features from one image.

    The image is converted to RGB and resized to 60x80 (width x height)
    before any feature is computed.

    Args:
        image: Either a path string, which is opened with ``Image.open``, or
            an already-loaded image object exposing ``convert``, ``size``
            and ``resize``.

    Returns:
        A dict mapping feature names to scalar values, ordered shape ->
        texture -> edge -> color, or ``None`` if any step fails (the error
        is printed rather than raised).

    Notes:
        ``aspect_ratio`` is the height/width ratio of the image *before*
        resizing; measured after the fixed resize it would be the same
        constant for every image.
    """
    try:
        # Load and prepare image
        if isinstance(image, str):
            # Context manager releases the file handle; convert() returns an
            # independent in-memory copy.
            with Image.open(image) as source:
                image = source.convert('RGB')
        else:
            image = image.convert('RGB')

        width, height = image.size
        aspect_ratio = height / width

        image = image.resize((60, 80))
        img_array = np.array(image)
        gray_image = np.mean(img_array, axis=2).astype(np.uint8)

        return {
            **_shape_features(gray_image, aspect_ratio),
            **_texture_features(gray_image),
            **_edge_features(gray_image),
            **_color_features(img_array),
        }

    except Exception as e:
        # Batch boundary: callers treat None as "skip this image".
        print(f"Error in feature extraction: {str(e)}")
        return None

def process_all_images(valid_data, checkpoint_every=1000):
    """Extract features for every image in ``valid_data`` and save the results.

    Args:
        valid_data: DataFrame with an ``image_path`` column and a
            ``categories`` column, one row per image.
        checkpoint_every: Write a cumulative ``features_checkpoint_<n>.csv``
            after every this-many images. A falsy value disables checkpoints.

    Returns:
        A tuple ``(feature_matrix, processed_paths, failed_paths)``:
        the DataFrame of extracted features (also written to
        ``final_feature_matrix.csv``), the paths that succeeded, and the
        paths that failed (also written to ``failed_images.txt`` when
        non-empty).
    """
    image_paths = valid_data['image_path'].tolist()
    # Read categories once, row-aligned with the paths. A boolean-mask lookup
    # per image is O(n) each (O(n^2) overall) and returns the first match's
    # category for duplicated paths.
    categories = valid_data['categories'].tolist()
    print(f"Starting to process {len(image_paths)} images...")

    features_list = []
    processed_paths = []
    failed_paths = []

    progress = tqdm(
        zip(image_paths, categories),
        total=len(image_paths),
        desc="Extracting features",
    )
    for idx, (image_path, category) in enumerate(progress):
        try:
            features = extract_enhanced_features(image_path)
        except Exception as e:
            print(f"\nError processing {image_path}: {str(e)}")
            features = None

        if features is None:
            failed_paths.append(image_path)
        else:
            # Add image ID and category. splitext strips only the final
            # extension, so IDs containing dots are kept whole.
            features['image_id'] = os.path.splitext(os.path.basename(image_path))[0]
            features['category'] = category

            features_list.append(features)
            processed_paths.append(image_path)

        # Checkpointing sits outside the per-image error handling so that a
        # failed write cannot mark an already-processed image as failed.
        if checkpoint_every and (idx + 1) % checkpoint_every == 0:
            print(f"\nCheckpoint: Processed {idx + 1} images")
            try:
                pd.DataFrame(features_list).to_csv(
                    f'features_checkpoint_{idx + 1}.csv', index=False
                )
            except OSError as e:
                # A lost checkpoint should not abort a long extraction run.
                print(f"\nWarning: could not write checkpoint: {str(e)}")

    # Save results
    feature_matrix = pd.DataFrame(features_list)
    feature_matrix.to_csv('final_feature_matrix.csv', index=False)

    if failed_paths:
        with open('failed_images.txt', 'w', encoding='utf-8') as f:
            f.write('\n'.join(str(path) for path in failed_paths))

    print("\nProcessing Complete!")
    print(f"Successfully processed: {len(processed_paths)} images")
    print(f"Failed to process: {len(failed_paths)} images")
    # image_id and category are metadata, not features; clamp at zero for
    # the case where nothing was processed.
    print(f"Total features per image: {max(len(feature_matrix.columns) - 2, 0)}")

    return feature_matrix, processed_paths, failed_paths

# Testing
Before processing our full dataset of 42,000 images, we tested the pipeline in two phases. First, we ran feature extraction on a single image (B000HYL1V6.jpg, from the Arts, Crafts & Sewing category), which successfully produced 101 features across shape, texture, edge, and color characteristics. The run emitted a minor overflow warning, which we judged not to affect feature quality. Second, having confirmed that extraction works on this test case, we proceed to full-dataset processing, which creates a checkpoint every 1,000 images to track progress and ensure data persistence.

## With one picture

In [None]:
# Test code for single image feature extraction
import pandas as pd
from functions import extract_enhanced_features

# Locate the dataset and use its first row as the smoke-test sample
CSV_PATH = "../Dataset/styles.csv"
IMAGES_DIR = "../Dataset/train_images"

df = pd.read_csv(CSV_PATH)
first_image_id = df['ImgId'].iloc[0]
test_image = f"{IMAGES_DIR}/{first_image_id}.jpg"

print(f"Testing with image: {test_image}")
first_category = df['categories'].iloc[0]
print(f"Category: {first_category}")

# Run the extractor on that single image
features = extract_enhanced_features(test_image)

if not features:
    print("Feature extraction failed")
else:
    feature_names = list(features)
    texture_names = [name for name in feature_names if 'texture' in name]
    edge_names = [name for name in feature_names if 'edge' in name]
    color_names = [
        name for name in feature_names if 'color' in name or 'mean_' in name
    ]

    print("\nFeature extraction successful!")
    print(f"Number of features extracted: {len(features)}")
    print("\nSample features:")
    print("- Shape:", feature_names[:3])
    print("- Texture:", texture_names[:3])
    print("- Edge:", edge_names[:3])
    print("- Color:", color_names[:3])

## With all images

In [None]:
from functions import connect_dataset, process_all_images

# Dataset locations: metadata CSV and the directory holding the images
CSV_PATH = "../Dataset/styles.csv"
IMAGES_DIR = "../Dataset/train_images"

# Match CSV rows to image files on disk
dataset_check = connect_dataset(CSV_PATH, IMAGES_DIR)
valid_df, missing = dataset_check

# Extract features for every matched image
extraction_results = process_all_images(valid_df)
feature_matrix, processed_paths, failed_paths = extraction_results

## _Optional_ Analyzing features
The `analyze_feature_importance` function provides insight into how the Random Forest classifier makes its decisions by examining which features contribute most to the classification. It extracts the importance scores from the trained model, plots a ranked bar chart of the top 15 most influential features, and saves the chart to disk. By analyzing these feature importances, we can understand which visual characteristics (such as shape, texture, edge, or color features) are most crucial for distinguishing between product categories, which helps us validate our feature engineering approach and identify areas for improvement in our image processing pipeline.

Run analyze_features.py

In [None]:
def analyze_feature_importance(rf_classifier, feature_names, top_n=15,
                               output_path='feature_importance.png'):
    """Rank features by model importance and save a bar plot of the top ones.

    Args:
        rf_classifier: Fitted model exposing ``feature_importances_``.
        feature_names: Feature names, in the same order as the importances.
        top_n: Number of highest-ranked features to plot.
        output_path: File the bar plot is written to.

    Returns:
        DataFrame with ``feature`` and ``importance`` columns holding every
        feature, sorted by descending importance. Callers that ignored the
        previous ``None`` return value are unaffected.
    """
    # Get feature importances
    importances = rf_classifier.feature_importances_

    # Pair each feature with its score and sort by importance
    feature_importance = pd.DataFrame({
        'feature': list(feature_names),
        'importance': importances,
    }).sort_values('importance', ascending=False)

    # Plot top N features
    fig = plt.figure(figsize=(12, 6))
    try:
        sns.barplot(x='importance', y='feature',
                    data=feature_importance.head(top_n))
        plt.title(f'Top {top_n} Most Important Features')
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Close this figure even if plotting or saving fails, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

    return feature_importance

### Console output

In [None]:
Top 15 Most Important Features:
                 feature  importance
72            kurtosis_a    0.016923
75            kurtosis_b    0.015960
10       glcm_contrast_0    0.014873
19   glcm_homogeneity_90    0.014120
74                 std_b    0.013996
9     texture_uniformity    0.013952
63   kurtosis_saturation    0.013726
46              mean_red    0.013725
62        std_saturation    0.013680
22     glcm_contrast_135    0.013539
41  edge_direction_bin_8    0.013379
11    glcm_homogeneity_0    0.013199
60          kurtosis_hue    0.013171
59               std_hue    0.013104
4           contour_area    0.013068