ENGR 418 - Group 33
Matthew Ofina - 84790435

In [1]:
# ============================================================
# this cell imports necessary libraries
# ============================================================

from sklearn.linear_model import LogisticRegression
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from PIL import Image


In [2]:
# ============================================================
# this cell defines functions for retrieving image data and labels from a folder
# ============================================================

# converts images from folder into usable training/testing data
def get_data(folder_path, im_width):
    """
    Load every file in a folder as a flattened, resized grayscale image.

    Args:
        folder_path (str): Folder holding the image files. A trailing path
            separator is optional.
        im_width (int): Side length in pixels that each image is resized to, so
            each sample has im_width**2 values.

    Returns:
        tuple: (x, y) where x has shape (n_files, im_width**2) with the pixel
        values and y has shape (n_files, 1) with the label of each file as
        returned by get_label.
    """
    # lists all files in the given folder (no filtering: every entry is read as an image)
    file_names = os.listdir(folder_path)
    # initializes arrays for data
    x = np.empty((len(file_names), im_width**2))
    y = np.empty((len(file_names), 1))
    # iterates through every file in the folder
    for i, file_name in enumerate(file_names):
        # os.path.join works with or without a trailing separator on folder_path,
        # unlike plain string concatenation
        file_path = os.path.join(folder_path, file_name)
        label = get_label(file_name)
        # the with-block releases the file handle as soon as the pixels are copied out
        with Image.open(file_path) as raw_im:
            # grayscale, then squash to im_width x im_width
            im = raw_im.convert('L').resize((im_width, im_width))
            # flatten the 2-D image into one row of the data array
            x[i, :] = np.asarray(im).reshape(1, -1)
        y[i, 0] = label
    return x, y

# returns the label of an image based on the filename
def get_label(file_name):
    """
    Map a file name to its integer class label using the first character.

    Mapping: '2' -> 0, 'c' -> 1, 'r' -> 2, 's' -> 3.

    Args:
        file_name (str): Name of the image file (not the full path).

    Returns:
        int: Class label in {0, 1, 2, 3}.

    Raises:
        ValueError: If the name is empty or starts with any other character.
            The previous version failed here with UnboundLocalError or
            IndexError, which hid the offending file name.
    """
    prefix_to_label = {'2': 0, 'c': 1, 'r': 2, 's': 3}
    # slicing (rather than indexing) keeps an empty name from raising IndexError
    prefix = file_name[:1]
    if prefix not in prefix_to_label:
        raise ValueError(
            f"Cannot derive a label from file name {file_name!r}: "
            f"expected it to start with one of {sorted(prefix_to_label)}"
        )
    return prefix_to_label[prefix]


In [3]:
# ============================================================
# this cell defines the training function (fits the model and reports training performance)
# ============================================================

def train_function(path, im_width=64):
    """
    Fit a logistic-regression classifier on the images in a folder and print
    its confusion matrix and accuracy on that same training data.

    Args:
        path (str): Folder with the training images, passed to get_data.
        im_width (int): Side length the images are resized to (default 64).

    Returns:
        LogisticRegression: The fitted model, to be handed to test_function.
    """
    # uses get_data function to retrieve data
    x_train, y_train = get_data(path, im_width)

    # get_data returns the labels as an (n, 1) column; scikit-learn expects a 1-D
    # target and otherwise emits a column_or_1d DataConversionWarning on fit
    y_train = np.ravel(y_train)

    # logistic regression with l2 penalty; C is the inverse regularization
    # strength, so C=0.1 corresponds to lambda=10
    # NOTE(review): the recorded run reports that lbfgs stopped at the default
    # max_iter=100; raising it or scaling the pixels would change the fitted model,
    # so that is left for a deliberate decision.
    model = LogisticRegression(penalty='l2', C=0.1)

    # trains the model with the selected training data
    model.fit(x_train, y_train)

    # generate predicted labels for the training set
    y_pred = model.predict(x_train)

    # get and print confusion matrix and accuracy score for train set
    print(f"Confusion matrix training:\n{confusion_matrix(y_train,y_pred)}\n")
    print(f"Accuracy score training:\n{accuracy_score(y_train,y_pred)}\n")

    # returns model to use for testing
    return model


In [4]:
# ============================================================
# Test function required for grading
# ============================================================

def test_function(path, model, im_width=64):
    """
    Score an already-fitted classifier on the images stored in a folder.

    Prints the confusion matrix and the accuracy for that folder.

    Args:
        path (str): Folder with the test images, passed to get_data.
        model: Fitted estimator exposing predict().
        im_width (int): Side length the images are resized to (default 64);
            it has to match the width the model was trained with.

    Returns:
        tuple: (y_test, y_pred) - the labels derived from the file names, in
        the shape get_data returns them, and the model's predictions.
    """
    # pixel matrix and true labels for the requested folder
    features, y_test = get_data(path, im_width)

    # predicted class for every test image
    y_pred = model.predict(features)

    # report: confusion matrix first, accuracy second
    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion matrix testing:\n {matrix}\n")
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy testing:\n {score}\n")

    return y_test, y_pred


In [5]:
# ============================================================
# Trains and tests model
# ============================================================

# Run configuration for the Stage 1 model. These are notebook-level names that the
# cells below reuse:
#   im_width          - side length (pixels) every image is resized to
#   train_folder_path - folder with the training images (written with a trailing '/')
#   test_folder_path  - folder with the testing images (written with a trailing '/')
im_width = 64
train_folder_path = 'Lego_dataset_2/training/'
test_folder_path = 'Lego_dataset_2/testing/'

# train: prints the training confusion matrix / accuracy and keeps the fitted model
model = train_function(train_folder_path, im_width)
# test: prints the testing confusion matrix / accuracy; the returned
# (y_test, y_pred) pair is unpacked into throwaway names because it is not used here
_,_ = test_function(test_folder_path, model, im_width)


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Confusion matrix training:
[[27  0  0  0]
 [ 0 27  0  0]
 [ 0  0 27  0]
 [ 0  0  0 27]]

Accuracy score training:
1.0

Confusion matrix testing:
 [[11 10  4  2]
 [ 4 11  6  6]
 [ 2  1 23  1]
 [ 1  9  8  9]]

Accuracy testing:
 0.5



In [None]:
# ====================================================================================
# Study the performance of training and testing as a function of the number of inputs
# ====================================================================================

# Image side lengths to try. Each image is flattened to size**2 inputs, i.e.
# 64, 121, 256, 484, 1024, 2025 and 4096 inputs respectively.
inputs = [8, 11, 16, 22, 32, 45, 64]

# Determine the confusion matrix and accuracy for varying input sizes
# (relies on train_folder_path / test_folder_path set in the previous run cell;
# `model`, `y_test` and `y_pred` are rebound on every pass, so after the loop they
# hold the results for the last size only)
for size in inputs:
    print(f"\n{size}x{size} image with {size**2} inputs\n")
    model = train_function(train_folder_path, size)
    y_test, y_pred = test_function(test_folder_path, model, size)


In [None]:
# ========================================================================================
# For the inputs of 64, 121, 256, 484, 1024, 2025, and 4096, the following accuracy
# results for the testing set were obtained to be 88.89%, 94.44%, 94.44%, 95.83%, 94.44%,
# 98.61%, and 95.83%. This shows that more data does not always necessarily mean more 
# useful information. The additional data could be introducing noise and irrelevant
# features. The fluctuation in accuracy could also be caused by underfitting or
# overfitting.
# ========================================================================================


The following code is the new feature engineering algorithm for stage 2 (major AI help)

In [7]:
# =========================================================================================
# Cell 1: Group Information and Imports
# =========================================================================================
# Group Number: 33
# Group Members:
#   - Matthew Ofina (ID: 84790435)
#   - [Name 2] (ID: [ID 2])
#   - [Name 3] (ID: [ID 3])
#
# Description:
# This notebook implements Stage 2 of the project.
# 1. Task 1: Evaluation of the Stage 1 (raw pixel) model on the new dataset.
# 2. Task 2: Development of a new model using Feature Engineering to handle 
#    rotation and translation.
#
# Feature Engineering Library: OpenCV (cv2)
# Features Used: Geometric properties (Area, Solidity, Aspect Ratio) and Hu Moments.
# =========================================================================================

import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

# Machine Learning Imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')


In [8]:
# =========================================================================================
# Cell 2: Helper Functions (Label Extraction & Feature Engineering with OpenCV)
# =========================================================================================

def get_label(file_name):
    """
    Translate an image file name into its integer class label.

    The label is decided by how the name starts:
    - '2' -> 0 (Rectangle 2x4)
    - 'c' -> 1 (Circle 2x2)
    - 'r' -> 2 (Small Rectangle 2x1)
    - 's' -> 3 (Square 2x2)

    Any other name (including an empty one) falls back to label 0, so an
    unexpected file is silently counted as class 0.

    Args:
        file_name (str): File name without directory.

    Returns:
        int: Label in {0, 1, 2, 3}.
    """
    prefix_labels = (('2', 0), ('c', 1), ('r', 2), ('s', 3))
    for prefix, label in prefix_labels:
        if file_name.startswith(prefix):
            return label
    # Default fallback for names that match no known prefix
    return 0

def extract_engineered_features_opencv(im_array):
    """
    Build a 13-value shape descriptor for the largest blob found in an image.

    Processing chain:
    1. Cast to uint8 if needed, invert, Otsu threshold, 3x3 morphological closing.
    2. External contour detection; the contour with the largest area is kept.
    3. Geometric measures of that contour.
    4. The 7 Hu moments of that contour.

    Feature order of the returned vector:
        [area, perimeter, eccentricity, solidity, extent, aspect_ratio,
         hu1, hu2, hu3, hu4, hu5, hu6, hu7]

    Not every entry is invariant to how the piece is placed: area and perimeter
    are raw pixel measures, and extent is taken against the upright bounding
    rectangle from cv2.boundingRect, so it varies with rotation.

    Args:
        im_array (np.ndarray): Image array.
            NOTE(review): assumed to be a single-channel grayscale image with a
            dark object on a light background - confirm against the dataset.

    Returns:
        np.ndarray: 1-D array with 13 values; all zeros when no contour is found.
    """
    n_features = 13

    # --- 1. Preprocessing ---
    # OpenCV thresholding below works on 8-bit data
    gray = im_array if im_array.dtype == np.uint8 else im_array.astype(np.uint8)

    # Invert so the (assumed dark) object becomes the bright foreground
    inverted = cv2.bitwise_not(gray)

    # With THRESH_OTSU the threshold is chosen automatically; the 0 passed in is ignored
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, mask = cv2.threshold(inverted, 0, 255, otsu_flags)

    # Closing fills small holes / noise inside the object
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))

    # --- 2. Contours (outermost only) ---
    outlines, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        # Blank image: nothing to measure
        return np.zeros(n_features)

    # The biggest contour is taken to be the Lego piece; smaller ones are treated as noise
    piece = max(outlines, key=cv2.contourArea)

    # --- 3. Geometric features ---
    area = cv2.contourArea(piece)
    perimeter = cv2.arcLength(piece, True)

    # Solidity: contour area over convex hull area
    hull_area = cv2.contourArea(cv2.convexHull(piece))
    solidity = 0
    if hull_area > 0:
        solidity = area / hull_area

    # Extent: contour area over upright bounding-rectangle area
    _, _, box_w, box_h = cv2.boundingRect(piece)
    box_area = box_w * box_h
    extent = 0
    if box_area > 0:
        extent = area / box_area

    # Aspect ratio and eccentricity from a fitted ellipse; both stay 0 when the
    # contour has fewer than the 5 points fitEllipse needs or the fit is degenerate
    aspect_ratio = 0
    eccentricity = 0
    if len(piece) >= 5:
        _, ellipse_axes, _ = cv2.fitEllipse(piece)
        # Sort so the smaller axis always comes first
        minor_axis, major_axis = sorted(ellipse_axes)
        if major_axis > 0:
            if minor_axis > 0:
                aspect_ratio = major_axis / minor_axis
            # Eccentricity = sqrt(1 - (b/a)^2)
            eccentricity = np.sqrt(1 - (minor_axis / major_axis) ** 2)

    # --- 4. Hu moments (7 values) ---
    hu = cv2.HuMoments(cv2.moments(piece)).flatten()

    return np.array([
        area, perimeter, eccentricity, solidity, extent,
        aspect_ratio,
        *hu
    ])

def get_data_stage2(folder_path):
    """
    Turn every image in a folder into an engineered feature vector plus label.

    Only files ending in .png, .jpg, .jpeg or .bmp (case-insensitive) are read.
    Each one is opened with PIL, converted to grayscale, and passed to
    extract_engineered_features_opencv; its label comes from get_label.

    Args:
        folder_path (str): Folder containing the image files.

    Returns:
        tuple: (features, labels) as NumPy arrays with one entry per image
        file, in os.listdir order.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp')

    feature_rows = []
    label_rows = []
    for entry in os.listdir(folder_path):
        # Skip anything that is not an image file
        if not entry.lower().endswith(image_suffixes):
            continue

        # PIL does the file handling; np.array gives OpenCV an array to work on
        gray = Image.open(os.path.join(folder_path, entry)).convert('L')
        feats = extract_engineered_features_opencv(np.array(gray))
        lbl = get_label(entry)

        feature_rows.append(feats)
        label_rows.append(lbl)

    return np.array(feature_rows), np.array(label_rows)


In [9]:
# =========================================================================================
# Cell 3: Training Function (Stage 2)
# =========================================================================================

def training_function(path, penalty='l2', C=1.0):
    """
    Fit a scaler + logistic-regression pipeline on engineered features.

    Loads the folder through get_data_stage2, fits the pipeline, and prints the
    feature matrix shape plus the confusion matrix and accuracy on the
    training data itself.

    Args:
        path (str): Path to training folder.
        penalty (str): Regularization type handed to LogisticRegression.
        C (float): Inverse regularization strength handed to LogisticRegression.

    Returns:
        Pipeline: Fitted scikit-learn pipeline with steps 'scaler' and 'clf'.
    """
    print(f"Loading training data from: {path}")
    features, labels = get_data_stage2(path)

    print(f"Extracted features shape: {features.shape}")

    # The features live on very different scales, so they are standardized
    # before reaching the classifier
    classifier = LogisticRegression(
        penalty=penalty, C=C, max_iter=2000, multi_class='multinomial'
    )
    model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('clf', classifier),
    ])
    model.fit(features, labels)

    # Performance on the data the model was fitted on
    predicted = model.predict(features)
    print("\n--- Training Performance (Stage 2) ---")
    print(f"Confusion Matrix:\n{confusion_matrix(labels, predicted)}")
    print(f"Accuracy: {accuracy_score(labels, predicted):.4f}")

    return model


In [10]:
# =========================================================================================
# Cell 4: Testing Function (Stage 2)
# =========================================================================================

def testing_function(path, model):
    """
    Evaluate a fitted pipeline on the images of a folder.

    Loads features and labels through get_data_stage2, predicts with the given
    model, and prints the confusion matrix and accuracy.

    Args:
        path (str): Path to testing folder.
        model: Fitted estimator / pipeline exposing predict().

    Returns:
        tuple: (y_test, y_pred) - true labels and predicted labels.
    """
    print(f"\nLoading testing data from: {path}")
    features, y_test = get_data_stage2(path)

    # The pipeline applies its own fitted preprocessing inside predict
    y_pred = model.predict(features)

    print("\n--- Testing Performance (Stage 2) ---")
    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{matrix}")
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {score:.4f}")

    return y_test, y_pred


In [12]:
# =========================================================================================
# Cell 5: Task 1 - Evaluate Stage 1 Code on New Dataset
# =========================================================================================
# This function acts as a baseline. It applies the Stage 1 method (raw pixels) to the new
# rotated/shifted data. Low accuracy is expected.

def get_data_stage1_legacy(folder_path, im_width):
    """
    Load raw-pixel samples the Stage 1 way (no feature engineering).

    Only files ending in .png or .jpg (case-insensitive) are read. Each image
    is converted to grayscale, resized to im_width x im_width and flattened.

    Args:
        folder_path (str): Folder containing the image files.
        im_width (int): Side length in pixels after resizing.

    Returns:
        tuple: (x, y) with x of shape (n_images, im_width**2) and y a 1-D
        array of n_images labels taken from get_label.
    """
    image_names = [
        name for name in os.listdir(folder_path)
        if name.lower().endswith(('.png', '.jpg'))
    ]
    n_images = len(image_names)

    pixels = np.empty((n_images, im_width**2))
    labels = np.empty((n_images, 1))
    for row in range(n_images):
        name = image_names[row]
        labels[row, 0] = get_label(name)

        # grayscale -> fixed size -> one flat row of pixel values
        gray = Image.open(os.path.join(folder_path, name)).convert('L')
        resized = gray.resize((im_width, im_width))
        pixels[row, :] = np.asarray(resized).reshape(1, -1)

    # Labels are handed back flattened to 1-D
    return pixels, labels.ravel()

print("========== TASK 1: STAGE 1 LEGACY MODEL EVALUATION ==========")
# Define Paths (UPDATE THESE TO YOUR ACTUAL FOLDER PATHS)
# TRAIN_PATH / TEST_PATH are notebook-level names that the Task 2 cell reuses.
TRAIN_PATH = 'Lego_dataset_2/training/'
TEST_PATH  = 'Lego_dataset_2/testing/'

# The whole task runs inside try/except: any failure is reported as a single
# printed line instead of a traceback, and the notebook keeps running.
try:
    # 1. Train Stage 1 Model on New Data (Raw Pixels)
    # 64x64 images -> 4096 raw-pixel inputs per sample
    im_width = 64
    X_train_raw, y_train_raw = get_data_stage1_legacy(TRAIN_PATH, im_width)
    
    # Same penalty and C as the Stage 1 train_function, but with max_iter raised to 2000
    model_s1 = LogisticRegression(penalty='l2', C=0.1, max_iter=2000)
    model_s1.fit(X_train_raw, y_train_raw)
    
    # 2. Test Stage 1 Model
    X_test_raw, y_test_raw = get_data_stage1_legacy(TEST_PATH, im_width)
    y_pred_s1 = model_s1.predict(X_test_raw)
    
    print("Stage 1 Model (Raw Pixels) Results on New Dataset:")
    print(f"Accuracy: {accuracy_score(y_test_raw, y_pred_s1):.4f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test_raw, y_pred_s1)}")
    # Fixed text: printed whatever the accuracy above turns out to be
    print("Observation: Raw pixels are not rotation invariant.")
except Exception as e:
    # Best-effort reporting; the original exception type and traceback are not shown
    print(f"Could not run Task 1 (check paths or data): {e}")


Stage 1 Model (Raw Pixels) Results on New Dataset:
Accuracy: 0.5463
Confusion Matrix:
[[11 12  3  1]
 [ 4 15  6  2]
 [ 2  1 22  2]
 [ 2  8  6 11]]
Observation: Raw pixels are not rotation invariant.


In [13]:
# =========================================================================================
# Cell 6: Task 2 - Execute Stage 2 Training and Testing (OpenCV Feature Engineering)
# =========================================================================================

print("\n========== TASK 2: STAGE 2 FEATURE ENGINEERING MODEL (OpenCV) ==========")

# Uses TRAIN_PATH / TEST_PATH defined in the Task 1 cell. Any failure is caught
# below and reported as one printed line instead of a traceback.
try:
    # 1. Train the new model
    # training_function handles data loading, feature extraction (OpenCV), 
    # scaling, and model fitting. It also prints the training confusion matrix
    # and accuracy.
    stage2_model = training_function(TRAIN_PATH)

    # 2. Test the new model
    # Prints the testing confusion matrix and accuracy; the true and predicted
    # labels are kept in the notebook-level names y_test / y_pred.
    y_test, y_pred = testing_function(TEST_PATH, stage2_model)
    
except Exception as e:
     # Best-effort reporting; the original exception type and traceback are not shown
     print(f"Could not run Task 2 (check paths or data): {e}")



Loading training data from: Lego_dataset_2/training/
Extracted features shape: (108, 13)

--- Training Performance (Stage 2) ---
Confusion Matrix:
[[27  0  0  0]
 [ 0 25  0  2]
 [ 0  0 26  1]
 [ 0  7  1 19]]
Accuracy: 0.8981

Loading testing data from: Lego_dataset_2/testing/

--- Testing Performance (Stage 2) ---
Confusion Matrix:
[[17  8  2  0]
 [ 0 24  0  3]
 [ 1  0 24  2]
 [ 0  9  1 17]]
Accuracy: 0.7593


In [None]:


import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

# Machine Learning & Feature Selection Imports
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# =========================================================================================
# Cell 2: Robust Feature Engineering (Stage 2 - Final)
# =========================================================================================

def get_label(file_name):
    """
    Return the integer class label encoded in the start of a file name.

    '2' -> 0, 'c' -> 1, 'r' -> 2, 's' -> 3; every other name (including an
    empty one) falls back to 0.
    """
    prefix_labels = {'2': 0, 'c': 1, 'r': 2, 's': 3}
    matches = (
        label for prefix, label in prefix_labels.items()
        if file_name.startswith(prefix)
    )
    # First matching prefix wins; 0 is the fallback when nothing matches
    return next(matches, 0)

def extract_engineered_features_opencv(im_array):
    """
    Describe the largest blob in an image with 13 shape features.

    The image is cast to uint8 if needed, inverted, Otsu-thresholded and closed
    with a 3x3 kernel; the external contour with the largest area is measured.
    Aspect ratio and rectangularity come from cv2.minAreaRect (the rotated
    minimum-area box), so they do not depend on the piece's orientation.

    Feature order of the returned vector:
        [area, perimeter, compactness, solidity, rectangularity, aspect_ratio,
         hu_log1 ... hu_log7]

    Args:
        im_array (np.ndarray): Image array.
            NOTE(review): assumed to be single-channel grayscale with a dark
            object on a light background - confirm against the dataset.

    Returns:
        np.ndarray: 1-D array with 13 values; all zeros when no contour is found.
    """
    def ratio(numerator, denominator):
        # Guarded division: 0 when the denominator is not positive
        return numerator / denominator if denominator > 0 else 0

    # --- Preprocessing ---
    gray = im_array
    if gray.dtype != np.uint8:
        gray = gray.astype(np.uint8)

    inverted = cv2.bitwise_not(gray)
    _, mask = cv2.threshold(inverted, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))

    outlines, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return np.zeros(13)

    # Largest contour is taken as the piece
    blob = max(outlines, key=cv2.contourArea)

    # --- Geometric features ---
    area = cv2.contourArea(blob)
    perimeter = cv2.arcLength(blob, True)

    # Compactness (P^2 / A)
    compactness = ratio(perimeter ** 2, area)

    # Solidity (Area / Convex Hull Area)
    solidity = ratio(area, cv2.contourArea(cv2.convexHull(blob)))

    # Rotated minimum-area box: only its two side lengths are used
    _, (side_a, side_b), _ = cv2.minAreaRect(blob)

    # Aspect Ratio (Long / Short)
    short_side, long_side = sorted([side_a, side_b])
    aspect_ratio = ratio(long_side, short_side)

    # Rectangularity (Area / Rotated Box Area)
    rectangularity = ratio(area, side_a * side_b)

    # --- Hu moments on a signed log scale ---
    # NOTE(review): because of the 1e-7 offset, moments much smaller than 1e-7 in
    # magnitude all map to roughly +/-7 - confirm that this is intended.
    hu = cv2.HuMoments(cv2.moments(blob)).flatten()
    hu_log = []
    for value in hu:
        hu_log.append(-1 * np.sign(value) * np.log10(np.abs(value) + 1e-7))

    geometric = [area, perimeter, compactness, solidity, rectangularity, aspect_ratio]
    return np.array(geometric + hu_log)

def get_data_stage2(folder_path):
    """
    Load the .png / .jpg images of a folder as engineered features and labels.

    Each image is opened with PIL, converted to grayscale and passed to
    extract_engineered_features_opencv; the label comes from get_label.

    Args:
        folder_path (str): Folder containing the image files.

    Returns:
        tuple: (features, labels) as NumPy arrays, one entry per image file,
        in os.listdir order.
    """
    samples = []
    for entry in os.listdir(folder_path):
        # Extension check is case-insensitive
        if not entry.lower().endswith(('.png', '.jpg')):
            continue
        gray = np.array(Image.open(os.path.join(folder_path, entry)).convert('L'))
        feats = extract_engineered_features_opencv(gray)
        samples.append((feats, get_label(entry)))

    features_list = [feats for feats, _ in samples]
    labels_list = [lbl for _, lbl in samples]
    return np.array(features_list), np.array(labels_list)


In [66]:
# =========================================================================================
# Cell 3: Training Function (L1 Lasso Regularization)
# =========================================================================================
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegressionCV

def training_function(path):
    """
    Fit the L1-regularized Stage 2 pipeline on a training folder.

    Pipeline steps:
    1. 'scaler' - StandardScaler
    2. 'poly'   - degree-2 PolynomialFeatures without the bias column
    3. 'clf'    - LogisticRegressionCV (L1 penalty, saga solver, 5-fold CV over
                  10 candidate regularization strengths)

    Prints the first entry of the classifier's fitted C_ and the confusion
    matrix / accuracy on the training data.

    Args:
        path (str): Path to training folder, loaded through get_data_stage2.

    Returns:
        Pipeline: The fitted pipeline.
    """
    print(f"Loading training data from: {path}")
    features, labels = get_data_stage2(path)

    # The L1 penalty shrinks coefficients of uninformative (polynomial) features
    # to zero, which acts as a built-in feature selection
    lasso_cv = LogisticRegressionCV(
        cv=5,
        penalty='l1',
        solver='saga',
        max_iter=10000,
        multi_class='multinomial',
        Cs=10,
    )
    model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('clf', lasso_cv),
    ])

    print("Training with L1 Regularization (this may take 30-60 seconds)...")
    model.fit(features, labels)

    # Regularization strength picked by cross-validation
    chosen_c = model.named_steps['clf'].C_[0]
    print(f"Best C (Inverse Regularization): {chosen_c:.4f}")

    # Performance on the data the model was fitted on
    predicted = model.predict(features)
    print("\n--- Training Performance (L1 Optimized) ---")
    print(f"Confusion Matrix:\n{confusion_matrix(labels, predicted)}")
    print(f"Accuracy: {accuracy_score(labels, predicted):.4f}")

    return model


In [67]:
# =========================================================================================
# Cell 4: Testing Function
# =========================================================================================

def testing_function(path, model):
    """
    Evaluate a fitted pipeline on the images of a folder.

    Features and labels are loaded through get_data_stage2; the pipeline
    applies its own fitted preprocessing steps inside predict. Prints the
    confusion matrix and accuracy.

    Args:
        path (str): Path to testing folder.
        model: Fitted estimator / pipeline exposing predict().

    Returns:
        tuple: (y_test, y_pred) - true labels and predicted labels.
    """
    print(f"\nLoading testing data from: {path}")
    features, y_test = get_data_stage2(path)

    y_pred = model.predict(features)

    print("\n--- Testing Performance (Stage 2) ---")
    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{matrix}")
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {score:.4f}")

    return y_test, y_pred


In [68]:
# =========================================================================================
# Cell 5: Main Execution
# =========================================================================================

# Define Paths (UPDATE THESE TO YOUR ACTUAL FOLDER PATHS)
# Notebook-level names; both are written with a trailing '/'.
TRAIN_PATH = 'Lego_dataset_2/training/'
TEST_PATH  = 'Lego_dataset_2/testing/'

print("========== STAGE 2: FEATURE ENGINEERING + SELECTION ==========")

# Any failure below is caught and reported as one printed line instead of a traceback.
try:
    # 1. Train Model
    # NOTE(review): the training_function defined above this cell does not fit a
    # SequentialFeatureSelector; its feature selection comes from the L1 penalty.
    # Which training_function actually runs depends on the cell executed last.
    stage2_model = training_function(TRAIN_PATH)

    # 2. Test Model
    # True and predicted test labels are kept in the notebook-level names y_test / y_pred.
    y_test, y_pred = testing_function(TEST_PATH, stage2_model)
    
except Exception as e:
     # Best-effort reporting; the original exception type and traceback are not shown
     print(f"Could not run Task (check paths or data): {e}")


Loading training data from: Lego_dataset_2/training/
Training with L1 Regularization (this may take 30-60 seconds)...
Best C (Inverse Regularization): 21.5443

--- Training Performance (L1 Optimized) ---
Confusion Matrix:
[[27  0  0  0]
 [ 0 27  0  0]
 [ 0  0 27  0]
 [ 0  0  0 27]]
Accuracy: 1.0000

Loading testing data from: Lego_dataset_2/testing/

--- Testing Performance (Stage 2) ---
Confusion Matrix:
[[18  6  0  3]
 [ 0 22  0  5]
 [ 1  1 23  2]
 [ 0  4  0 23]]
Accuracy: 0.7963


Different try

In [None]:
# =========================================================================================
# Cell 1: Group Information and Imports
# =========================================================================================

import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

# Machine Learning Imports
from sklearn.svm import SVC                   # Support Vector Classifier
from sklearn.model_selection import GridSearchCV # Automated Parameter Tuning
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')


In [86]:
# =========================================================================================
# Cell 2: Physics-Based Feature Engineering (5 Robust Features)
# =========================================================================================

def get_label(file_name):
    """
    Return the integer class label encoded in the start of a file name.

    '2' -> 0, 'c' -> 1, 'r' -> 2, 's' -> 3; every other name (including an
    empty one) falls back to 0.
    """
    if file_name.startswith('2'):
        return 0
    if file_name.startswith('c'):
        return 1
    if file_name.startswith('r'):
        return 2
    if file_name.startswith('s'):
        return 3
    # No known prefix: fall back to class 0
    return 0

def extract_engineered_features_opencv(im_array):
    """
    Describe the largest blob in an image with 5 shape features.

    The image is cast to uint8 if needed, inverted, Otsu-thresholded and closed
    with a 3x3 kernel; the external contour with the largest area is measured.

    Feature order of the returned vector:
        [area, compactness, solidity, aspect_ratio, rectangularity]

    Args:
        im_array (np.ndarray): Image array.
            NOTE(review): assumed to be single-channel grayscale with a dark
            object on a light background - confirm against the dataset.

    Returns:
        np.ndarray: 1-D array with 5 values; all zeros when no contour is found.
    """
    n_features = 5

    # --- Preprocessing ---
    gray = im_array if im_array.dtype == np.uint8 else im_array.astype(np.uint8)
    inverted = cv2.bitwise_not(gray)
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, mask = cv2.threshold(inverted, 0, 255, otsu_flags)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8))

    outlines, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not outlines:
        return np.zeros(n_features)

    # Largest contour is taken as the piece
    piece = max(outlines, key=cv2.contourArea)

    # 1. Area (pixels enclosed by the contour)
    area = cv2.contourArea(piece)

    # 2. Compactness: perimeter squared over area
    perimeter = cv2.arcLength(piece, True)
    compactness = 0
    if area > 0:
        compactness = (perimeter ** 2) / area

    # 3. Solidity: area over convex-hull area
    hull_area = cv2.contourArea(cv2.convexHull(piece))
    solidity = 0
    if hull_area > 0:
        solidity = area / hull_area

    # Rotated minimum-area box: only its two side lengths are used
    _, (side_a, side_b), _ = cv2.minAreaRect(piece)

    # 4. Aspect ratio: longer box side over shorter box side
    short_side, long_side = sorted([side_a, side_b])
    aspect_ratio = 0
    if short_side > 0:
        aspect_ratio = long_side / short_side

    # 5. Rectangularity: area over rotated-box area
    box_area = side_a * side_b
    rectangularity = 0
    if box_area > 0:
        rectangularity = area / box_area

    return np.array([area, compactness, solidity, aspect_ratio, rectangularity])

def get_data_stage2(folder_path):
    """
    Load the .png / .jpg images of a folder as engineered features and labels.

    Each image is opened with PIL, converted to grayscale and passed to
    extract_engineered_features_opencv; the label comes from get_label.

    Args:
        folder_path (str): Folder containing the image files.

    Returns:
        tuple: (features, labels) as NumPy arrays, one entry per image file,
        in os.listdir order.
    """
    wanted = ('.png', '.jpg')
    image_names = filter(lambda name: name.lower().endswith(wanted), os.listdir(folder_path))

    features_list, labels_list = [], []
    for name in image_names:
        with_folder = os.path.join(folder_path, name)
        # PIL handles decoding; the array form is what the OpenCV code expects
        pixels = np.array(Image.open(with_folder).convert('L'))
        feats = extract_engineered_features_opencv(pixels)
        lbl = get_label(name)
        features_list.append(feats)
        labels_list.append(lbl)

    return np.array(features_list), np.array(labels_list)


In [None]:
# =========================================================================================
# Cell 3: Training Function (Linear SVM for Generalization)
# =========================================================================================

def training_function(path):
    """
    Tune and fit a scaled linear SVM on a training folder.

    Pipeline steps: 'scaler' (StandardScaler) followed by 'svm' (SVC with a
    linear kernel and probability estimates enabled). The SVM's C is chosen by
    a 5-fold grid search over [0.001, 0.01, 0.1, 1, 5, 10].

    Prints the winning parameters and the confusion matrix / accuracy of the
    best pipeline on the training data.

    Args:
        path (str): Path to training folder, loaded through get_data_stage2.

    Returns:
        Pipeline: The best estimator found by the grid search.
    """
    print(f"Loading training data from: {path}")
    features, labels = get_data_stage2(path)

    svm_pipeline = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('svm', SVC(kernel='linear', probability=True)),
    ])

    # Candidate regularization strengths; smaller C means a wider, softer margin
    candidate_c = {'svm__C': [0.001, 0.01, 0.1, 1, 5, 10]}

    # n_jobs=-1: run the cross-validation fits on all available cores
    search = GridSearchCV(svm_pipeline, candidate_c, cv=5, n_jobs=-1)

    print("Tuning Linear SVM...")
    search.fit(features, labels)

    print(f"Best Parameters: {search.best_params_}")

    # Performance of the winning pipeline on the data it was fitted on
    best_model = search.best_estimator_
    predicted = best_model.predict(features)

    print("\n--- Training Performance ---")
    print(f"Confusion Matrix:\n{confusion_matrix(labels, predicted)}")
    print(f"Accuracy: {accuracy_score(labels, predicted):.4f}")

    return best_model


In [88]:
# =========================================================================================
# Cell 4: Testing Function
# =========================================================================================

def testing_function(path, model):
    """
    Evaluate a fitted pipeline on the images of a folder.

    Features and labels are loaded through get_data_stage2; the pipeline
    applies its own fitted scaling inside predict. Prints the confusion matrix
    and accuracy.

    Args:
        path (str): Path to testing folder.
        model: Fitted estimator / pipeline exposing predict().

    Returns:
        tuple: (y_test, y_pred) - true labels and predicted labels.
    """
    print(f"\nLoading testing data from: {path}")
    features, y_test = get_data_stage2(path)

    y_pred = model.predict(features)

    print("\n--- Testing Performance (Stage 2) ---")
    matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix:\n{matrix}")
    score = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {score:.4f}")

    return y_test, y_pred


In [89]:
# =========================================================================================
# Cell 5: Main Execution
# =========================================================================================

# Define Paths (UPDATE THESE TO YOUR ACTUAL FOLDER PATHS)
# Notebook-level names; both are written with a trailing '/'.
TRAIN_PATH = 'Lego_dataset_2/training/'
TEST_PATH  = 'Lego_dataset_2/testing/'

print("========== STAGE 2: FEATURE ENGINEERING + SELECTION ==========")

# Any failure below is caught and reported as one printed line instead of a traceback.
try:
    # 1. Train Model
    # NOTE(review): the training_function defined above this cell grid-searches a
    # linear SVM and does not fit a SequentialFeatureSelector. Which
    # training_function actually runs depends on the cell executed last.
    stage2_model = training_function(TRAIN_PATH)

    # 2. Test Model
    # True and predicted test labels are kept in the notebook-level names y_test / y_pred.
    y_test, y_pred = testing_function(TEST_PATH, stage2_model)
    
except Exception as e:
     # Best-effort reporting; the original exception type and traceback are not shown
     print(f"Could not run Task (check paths or data): {e}")


Loading training data from: Lego_dataset_2/training/
Tuning Linear SVM...
Best Parameters: {'svm__C': 10}

--- Training Performance ---
Confusion Matrix:
[[27  0  0  0]
 [ 0 25  0  2]
 [ 0  0 27  0]
 [ 0  0  0 27]]
Accuracy: 0.9815

Loading testing data from: Lego_dataset_2/testing/

--- Testing Performance (Stage 2) ---
Confusion Matrix:
[[19  3  0  5]
 [ 0 23  0  4]
 [ 1  1 25  0]
 [ 2  0  0 25]]
Accuracy: 0.8519
