In [9]:
import numpy as np
import sys
import os
import joblib
import time
from sklearn.ensemble import RandomForestClassifier # First model to try out
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,  classification_report

# Expose ../src on the import path so the landmark_extractor module
# (Gabe's part of the project) can be imported from this notebook.
sys.path.append(os.path.abspath('../src'))

# Start from None so later cells can test whether the extractor is available;
# a failed `from ... import` leaves this binding untouched.
extract_normalized_landmarks = None
try:
    from landmark_extractor import extract_normalized_landmarks
except ImportError:
    print("Error: Could not import from source. Make sure file exists or correct directory")
else:
    print("Successfully imported landmark extractor function")

print("Libraries imported.")


Successfully imported landmark extractor function
Libraries imported.


In [None]:
# --- Paths -----------------------------------------------------------------
# Root of the raw dataset; the processing cell treats each sub-directory as one class.
DATASET_PATH = '../data/raw/ASL_Alphabet_dataset/'
# Cache location for the extracted landmark features, labels and class names.
PROCESSED_DATA_DIR = '../data/processed/'
X_SAVE_PATH = os.path.join(PROCESSED_DATA_DIR, 'X_landmarks.npy')
Y_SAVE_PATH = os.path.join(PROCESSED_DATA_DIR, 'y_labels.npy')
CLASS_NAMES_PATH = os.path.join(PROCESSED_DATA_DIR, 'class_names.npy')
# Destination of the trained classifier.
MODEL_SAVE_PATH = '../models/asl_classifier.pkl'

# Create the output directories up front so later save calls cannot fail on a missing folder.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

# --- Reproducibility / split -------------------------------------------------
# Single seed shared by the train/validation split and the model, so the two
# cannot silently drift apart when one of them is edited.
RANDOM_STATE = 42
# Fraction of the samples held out for validation.
TEST_SPLIT_SIZE = 0.2

# --- Model -------------------------------------------------------------------
# Our model choice
MODEL_CHOICE = RandomForestClassifier
MODEL_PARAMS = {'n_estimators': 100, 'random_state': RANDOM_STATE, 'n_jobs': -1}

# Set to True to ignore the cached .npy files and re-extract landmarks from the raw images.
FORCE_REPROCESS_DATA = False

print("Constants defined.")

Constants defined.


In [None]:
# Load the cached landmark dataset if it exists; otherwise (or when forced)
# rebuild it from the raw images and cache the result.
# On success X holds the landmark vectors, y the integer class indices and
# class_names the class-directory names; on failure all three stay None.
X = None
y = None
class_names = None

# Local flag: a failed cache load must trigger reprocessing for THIS run only.
# Overwriting the FORCE_REPROCESS_DATA config constant instead would make every
# later re-run of this cell ignore a perfectly valid cache.
needs_reprocessing = FORCE_REPROCESS_DATA

cache_files = (X_SAVE_PATH, Y_SAVE_PATH, CLASS_NAMES_PATH)
if not needs_reprocessing and all(os.path.exists(path) for path in cache_files):
    print(f"Loading pre-processed data from {PROCESSED_DATA_DIR}...")
    try:
        X = np.load(X_SAVE_PATH)
        y = np.load(Y_SAVE_PATH)
        class_names = np.load(CLASS_NAMES_PATH)
        # A cache whose feature and label counts disagree is unusable for training.
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"cached X has {X.shape[0]} rows but y has {y.shape[0]}")
        print(f"Loaded X shape: {X.shape}, y shape: {y.shape}")
        print(f"Class names: {class_names}")
    except Exception as e:
        print(f"Error loading .npy files: {e}. Will reprocess data.")
        # Drop anything that was partially loaded so no stale array survives.
        X = None
        y = None
        class_names = None
        needs_reprocessing = True

# If data wasn't loaded, process it
if X is None or y is None or class_names is None or needs_reprocessing:
    print(f"Processing dataset from: {DATASET_PATH}")
    if extract_normalized_landmarks is None:
        print("ERROR: Landmark extractor function not available. Cannot process data.")
    else:
        X_data = []
        y_data = []

        try:
            # Each sub-directory of DATASET_PATH is one class; sorting fixes the label indices.
            found_classes = sorted(
                d for d in os.listdir(DATASET_PATH)
                if os.path.isdir(os.path.join(DATASET_PATH, d))
            )
            print(f"Found classes: {found_classes}")
            start_time = time.time()
            processed_count = 0
            skipped_count = 0

            for label_index, class_name in enumerate(found_classes):
                class_path = os.path.join(DATASET_PATH, class_name)
                print(f"Processing class: {class_name} ({label_index+1}/{len(found_classes)})...")
                # os.listdir order is arbitrary; sort so the sample order (and therefore
                # the seeded train/validation split) is reproducible across machines.
                image_files = sorted(
                    f for f in os.listdir(class_path)
                    if os.path.isfile(os.path.join(class_path, f))
                    and f.lower().endswith(('.png', '.jpg', '.jpeg'))
                )

                for image_name in image_files:
                    image_path = os.path.join(class_path, image_name)
                    # Call Gabe's function
                    landmarks_vector = extract_normalized_landmarks(image_path)

                    # The extractor signals "no usable landmarks" with None; skip those images.
                    if landmarks_vector is None:
                        skipped_count += 1
                        continue

                    X_data.append(landmarks_vector)
                    y_data.append(label_index)
                    processed_count += 1
                    if processed_count % 1000 == 0:
                        print(f"  Processed {processed_count} images...")

            end_time = time.time()
            print(f"\nDataset processing complete. Extracted landmarks from {processed_count} images in {end_time - start_time:.2f} seconds.")
            print(f"Skipped {skipped_count} images without landmarks.")

            if processed_count == 0:
                # Never cache an empty dataset: it would be loaded on the next run
                # and break training instead of triggering a fresh extraction.
                print("ERROR: No landmarks were extracted. Nothing was saved.")
            else:
                # Convert lists to NumPy arrays
                X = np.array(X_data)
                y = np.array(y_data)
                class_names = np.array(found_classes)

                # Save the processed data
                print(f"Saving processed data to {PROCESSED_DATA_DIR}...")
                np.save(X_SAVE_PATH, X)
                np.save(Y_SAVE_PATH, y)
                np.save(CLASS_NAMES_PATH, class_names)
                print("Processed data saved.")
                print(f"Final X shape: {X.shape}")
                print(f"Final y shape: {y.shape}")

        except FileNotFoundError:
            print(f"ERROR: Dataset path not found during processing: {DATASET_PATH}")
        except Exception as e:
            print(f"An error occurred during data processing: {e}")

Loading pre-processed data from ../data/processed/...
Loaded X shape: (1245, 63), y shape: (1245,)
Class names: ['A' 'B' 'C' 'D' 'E' 'F' 'G' 'H' 'I' 'J' 'K' 'L' 'M' 'N' 'O' 'P' 'Q' 'R'
 'S' 'T' 'U' 'V' 'W' 'X' 'Y' 'Z']


In [None]:
# Split the landmark dataset, fit the configured classifier and report
# validation accuracy plus a per-class classification report.
if X is not None and y is not None:
    print("\nSplitting real data...")
    # A stratified split needs at least two samples of every class; with a rarer
    # class train_test_split raises, so fall back to a plain random split.
    _, class_counts = np.unique(y, return_counts=True)
    can_stratify = class_counts.size > 0 and class_counts.min() >= 2
    if not can_stratify:
        print("Warning: at least one class has fewer than 2 samples; splitting without stratification.")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y,
        test_size=TEST_SPLIT_SIZE,
        random_state=RANDOM_STATE,
        stratify=y if can_stratify else None
    )
    print(f"Real Train set size: {X_train.shape[0]} samples")
    print(f"Real Validation set size: {X_val.shape[0]} samples")

    print("\nInitializing and training model on real data...")
    model = MODEL_CHOICE(**MODEL_PARAMS)
    print(f"Using model: {model}")
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    print(f"Model training finished in {end_time - start_time:.2f} seconds.")

    print("\nEvaluating model on real validation data...")
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"Validation Accuracy: {accuracy:.4f}")

    print("Classification Report:")
    if class_names is not None:
        # Labels are the indices into class_names. Passing them explicitly keeps
        # target_names aligned even when a class has no validation samples (or no
        # samples at all); without `labels`, classification_report raises when the
        # number of names differs from the number of labels actually present.
        report = classification_report(
            y_val, y_pred,
            labels=np.arange(len(class_names)),
            target_names=class_names,
            zero_division=0
        )
    else:
        report = classification_report(y_val, y_pred, zero_division=0)
    print(report)

    # Could add a confusion matrix if there's enough time

else:
    print("\nSkipping training and evaluation as real data was not loaded/processed.")


Splitting real data...
Real Train set size: 996 samples
Real Validation set size: 249 samples

Initializing and training model on real data...
Using model: RandomForestClassifier(n_jobs=-1, random_state=42)
Model training finished in 0.47 seconds.

Evaluating model on real validation data...
Validation Accuracy: 0.9960
Classification Report:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00        10
           B       1.00      1.00      1.00        10
           C       1.00      1.00      1.00        10
           D       1.00      1.00      1.00        10
           E       1.00      1.00      1.00        10
           F       1.00      1.00      1.00        10
           G       1.00      1.00      1.00        10
           H       1.00      1.00      1.00        10
           I       1.00      1.00      1.00        10
           J       1.00      1.00      1.00         9
           K       1.00      1.00      1.00        10
      

In [None]:
# Write the trained model to MODEL_SAVE_PATH with joblib, but only when a
# `model` variable exists and the dataset was available.
if 'model' not in locals() or X is None:
    print("\nModel not trained/saved as data was not available.")
else:
    print(f"\nSaving the trained model to: {MODEL_SAVE_PATH}")
    try:
        joblib.dump(model, MODEL_SAVE_PATH)
    except Exception as e:
        print(f"Error saving model: {e}")
    else:
        print("Model saved successfully.")


Saving the trained model to: ../models/asl_classifier.pkl
Model saved successfully.
