In [115]:
# =========================================================================================
# Cell 1: Group Information and Imports
# =========================================================================================
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

# Machine Learning Imports
from sklearn.svm import SVC                   # Support Vector Classifier
from sklearn.model_selection import GridSearchCV # Automated Parameter Tuning
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from skimage.feature import local_binary_pattern

import warnings
warnings.filterwarnings('ignore')


In [116]:
# =========================================================================================
# Cell 2: Physics-Based Feature Engineering (5 Geometric Features + 7 Hu Moments)
# =========================================================================================

def get_label(file_name):
    """Map an image file name to its integer class label.

    The label is chosen from the prefix of ``file_name``:
    '2' -> 0, 'c' -> 1, 'r' -> 2, 's' -> 3.
    Any name that matches none of these prefixes (including an empty
    string) falls back to label 0. Matching is case-sensitive.
    """
    prefix_labels = (('2', 0), ('c', 1), ('r', 2), ('s', 3))
    for prefix, label in prefix_labels:
        if file_name.startswith(prefix):
            return label
    # Fallback label for unrecognised prefixes.
    return 0

import cv2
import numpy as np

def extract_engineered_features_opencv(im_array):
    """
    Extract a 12-element shape descriptor for the largest object in an image.

    Features, in output order:
    1. Five geometric features of the largest external contour:
       area, compactness (perimeter^2 / area), solidity (area / convex-hull
       area), aspect ratio (long side / short side of the minimum-area
       rectangle) and rectangularity (area / minimum-area-rectangle area).
    2. Seven log-scaled Hu moments of the same contour.

    Parameters
    ----------
    im_array : np.ndarray
        Image as a 2-D (grayscale) or 3-D array. Arrays that are not uint8
        are cast to uint8. 3-D input is converted to grayscale with
        cv2.COLOR_BGR2GRAY, i.e. it is treated as BGR.

    Returns
    -------
    np.ndarray
        1-D array of length 12. When no contour is detected, an all-zero
        array of the same length is returned so that feature vectors from
        different images can always be stacked into one matrix.
    """
    # 5 geometric features + 7 Hu moments. The fallback vector must have the
    # same length as the normal return value.
    n_basic_features = 5
    n_hu_moments = 7
    n_features = n_basic_features + n_hu_moments

    # Ensure correct data type
    if im_array.dtype != np.uint8:
        im_array = im_array.astype(np.uint8)

    # --- Preprocessing for Shape Analysis ---
    # We use grayscale for contour detection
    if len(im_array.shape) == 3:
        gray = cv2.cvtColor(im_array, cv2.COLOR_BGR2GRAY)
    else:
        gray = im_array

    # Invert (assuming white background) and threshold with Otsu's method
    inv_im = cv2.bitwise_not(gray)
    _, binary = cv2.threshold(inv_im, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Cleanup noise: morphological closing with a 3x3 kernel
    kernel = np.ones((3, 3), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, kernel)

    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Fallback if no object detected (returns array of correct size)
    if not contours:
        return np.zeros(n_features)

    # Only the largest contour (by area) is described.
    cnt = max(contours, key=cv2.contourArea)

    # ============================
    # Group 1: Geometric Features
    # ============================

    # 1. Area
    area = cv2.contourArea(cnt)

    # 2. Compactness (0 for a degenerate, zero-area contour)
    perimeter = cv2.arcLength(cnt, True)
    compactness = (perimeter ** 2) / area if area > 0 else 0

    # 3. Solidity
    hull = cv2.convexHull(cnt)
    hull_area = cv2.contourArea(hull)
    solidity = area / hull_area if hull_area > 0 else 0

    # 4. Aspect Ratio (Rotation Invariant via minAreaRect)
    # Sorting the two sides makes the ratio independent of which side
    # minAreaRect reports as "width".
    _, (width, height), _ = cv2.minAreaRect(cnt)
    dims = sorted([width, height])
    aspect_ratio = dims[1] / dims[0] if dims[0] > 0 else 0

    # 5. Rectangularity
    rect_area = width * height
    rectangularity = area / rect_area if rect_area > 0 else 0

    basic_features = np.array([area, compactness, solidity, aspect_ratio, rectangularity])

    # ============================
    # Group 2: Hu Moments (Shape Invariants)
    # ============================
    # Hu Moments are excellent for rotation invariance.
    # We use moments of the *contour* rather than the image for speed.
    moments = cv2.moments(cnt)
    hu = cv2.HuMoments(moments).flatten()

    # Log scale transform: Raw Hu moments are tiny (e.g., 10^-7).
    # Log scaling brings them into a usable range for the classifier.
    # The 1e-10 offset keeps log10 finite when a moment is exactly zero.
    hu_moments = -np.sign(hu) * np.log10(np.abs(hu) + 1e-10)

    # ============================
    # Combine All Features
    # ============================
    return np.hstack([basic_features, hu_moments])

def get_data_stage2(folder_path):
    """
    Load every image in a folder and build the feature matrix and labels.

    Each file whose name ends in '.png' or '.jpg' (case-insensitive) is
    opened, converted to 8-bit grayscale, passed through
    ``extract_engineered_features_opencv`` and labelled with ``get_label``.

    Parameters
    ----------
    folder_path : str
        Directory containing the image files.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        Feature vectors (one row per image) and the matching labels, both
        in sorted file-name order.
    """
    # os.listdir returns entries in arbitrary order; sort so that the row
    # order (and therefore any unshuffled cross-validation split downstream)
    # is the same on every machine and every run.
    file_names = sorted(
        f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg'))
    )
    features_list = []
    labels_list = []

    for file_name in file_names:
        file_path = os.path.join(folder_path, file_name)
        # Use a context manager so the underlying file handle is closed
        # as soon as the pixels have been copied into the array.
        with Image.open(file_path) as im:
            im_array = np.array(im.convert('L'))
        features_list.append(extract_engineered_features_opencv(im_array))
        labels_list.append(get_label(file_name))

    return np.array(features_list), np.array(labels_list)


In [117]:
# =========================================================================================
# Cell 3: Training Function (Linear SVM for Generalization)
# =========================================================================================

def training_function(path):
    """
    Train and tune a scaled linear SVM on the images found in ``path``.

    Pipeline:
    1. Scaling (StandardScaler)
    2. Linear SVM

    The SVM's ``C`` is chosen by 5-fold cross-validated grid search. The
    best pipeline is then evaluated on the training data itself and the
    confusion matrix and accuracy are printed.

    Parameters
    ----------
    path : str
        Folder with the training images, passed to ``get_data_stage2``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The best estimator found by the grid search, refitted on all of
        the training data.
    """
    print(f"Loading training data from: {path}")
    X_train, y_train = get_data_stage2(path)
    # Pipeline
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        # random_state only affects the internal shuffling used for the
        # probability estimates; fixing it makes predict_proba reproducible.
        ('svm', SVC(kernel='linear', probability=True, random_state=42))
    ])

    # Grid Search for Linear SVM
    # We focus on smaller C values to enforce a "Wide Margin".
    # Each candidate appears once: a repeated value would only be refitted
    # for every fold without changing which C is selected.
    param_grid = {
        'svm__C': [0.001, 0.01, 0.1, 0.5, 0.8]
    }

    grid = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1)

    print("Tuning Linear SVM...")
    grid.fit(X_train, y_train)

    print(f"Best Parameters: {grid.best_params_}")

    # Evaluate (on the training data, so this is an optimistic estimate)
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_train)

    print("\n--- Training Performance ---")
    print(f"Confusion Matrix:\n{confusion_matrix(y_train, y_pred)}")
    print(f"Accuracy: {accuracy_score(y_train, y_pred):.4f}")

    return best_model


In [118]:
# =========================================================================================
# Cell 4: Testing Function
# =========================================================================================

def testing_function(path, model):
    """
    Evaluate a fitted model on the images found in ``path``.

    Features and labels are loaded with ``get_data_stage2``. The model's
    predictions are compared with the labels, and the confusion matrix and
    accuracy are printed.

    Parameters
    ----------
    path : str
        Folder with the testing images.
    model : object
        Fitted estimator exposing ``predict``.

    Returns
    -------
    tuple
        ``(true labels, predicted labels)``.
    """
    print(f"\nLoading testing data from: {path}")
    features, true_labels = get_data_stage2(path)

    # Run the model on the freshly extracted features
    predicted_labels = model.predict(features)

    # Report metrics
    print("\n--- Testing Performance (Stage 2) ---")
    cm = confusion_matrix(true_labels, predicted_labels)
    print(f"Confusion Matrix:\n{cm}")
    acc = accuracy_score(true_labels, predicted_labels)
    print(f"Accuracy: {acc:.4f}")

    return true_labels, predicted_labels


In [119]:
# =========================================================================================
# Cell 5: Main Execution
# =========================================================================================

# Dataset locations (UPDATE THESE TO YOUR ACTUAL FOLDER PATHS)
TRAIN_PATH = 'Lego_dataset_2/training/'
TEST_PATH  = 'Lego_dataset_2/testing/'

print("========== STAGE 2: FEATURE ENGINEERING + SELECTION ==========")

try:
    # Step 1: fit and tune the model on the training folder
    stage2_model = training_function(TRAIN_PATH)

    # Step 2: score the fitted model on the testing folder
    y_test, y_pred = testing_function(TEST_PATH, stage2_model)
except Exception as err:
    # Best-effort run: report the failure instead of stopping the notebook
    print(f"Could not run Task (check paths or data): {err}")


Loading training data from: Lego_dataset_2/training/
Tuning Linear SVM...
Best Parameters: {'svm__C': 0.8}

--- Training Performance ---
Confusion Matrix:
[[27  0  0  0]
 [ 1 26  0  0]
 [ 0  0 27  0]
 [ 0  0  0 27]]
Accuracy: 0.9907

Loading testing data from: Lego_dataset_2/testing/

--- Testing Performance (Stage 2) ---
Confusion Matrix:
[[19  5  0  3]
 [ 0 26  0  1]
 [ 0  0 27  0]
 [ 1  0  0 26]]
Accuracy: 0.9074
