Import Dependencies

In [27]:
import cv2
import numpy as np
import os
from matplotlib import pyplot as plt
import time
import mediapipe as mp

In [28]:
mp_hands = mp.solutions.hands           # Hands model
mp_drawing = mp.solutions.drawing_utils # Drawing utilities

In [29]:
def mediapipe_detection(image, model):
    """Processes a frame using the MediaPipe model (Hands)."""
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # BGR -> RGB
    image.flags.writeable = False                  
    results = model.process(image)                 # Make prediction
    image.flags.writeable = True                   
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR) # RGB -> BGR
    return image, results

In [None]:
def draw_styled_landmarks(image, results):
    """Draws detected hand landmarks and connections."""
    
    # Check if any hands were detected
    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            # Draw landmarks and connections for the current hand
            mp_drawing.draw_landmarks(
                image, 
                hand_landmarks, 
                mp_hands.HAND_CONNECTIONS, # Uses mp_hands connections
                mp_drawing.DrawingSpec(color=(245, 117, 66), thickness=2, circle_radius=4), 
                mp_drawing.DrawingSpec(color=(245, 66, 230), thickness=2, circle_radius=2)
            )

Extract Keypoints Values

In [None]:
def extract_keypoints(results):
    """
    Extracts x, y, z coordinates for the 21 landmarks of the FIRST detected hand 
    and flattens them into a 63-value NumPy array (21 * 3).
    
    Returns an array of zeros (63,) if no hand is detected.
    """
    
    #placeholder
    keypoints = np.zeros(21 * 3) 
    
    # The Hands model returns results in 'results.multi_hand_landmarks'.
    if results.multi_hand_landmarks:
        # Focus on the FIRST detected hand
        hand = results.multi_hand_landmarks[0]
        
        # Extract x, y, z for all 21 landmarks and flatten into a (63,) array.
        keypoints = np.array([[res.x, res.y, res.z] for res in hand.landmark]).flatten()
        
    return keypoints

# keypoint_vector = extract_keypoints(results)
# print(keypoint_vector.shape) # Output should be (63,)

In [None]:
cap = cv2.VideoCapture(0)

# max_num_hands=2 allowed
with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5, max_num_hands=2) as hands:
    while cap.isOpened():

        # Read feed
        ret, frame = cap.read()
        if not ret:
            break

        # Make detections (passing the 'hands' model)
        image, results = mediapipe_detection(frame, hands)
        
        # Draw landmarks (Hand-specific drawing)
        draw_styled_landmarks(image, results)

        #Extract and get the feature vector
        keypoint_vector = extract_keypoints(results)

        # if not np.all(keypoint_vector == 0):
        #     print("Hand Detected! Vector length:", keypoint_vector.shape)
        #     print("Sample Values:", keypoint_vector[0:6]) # Print the first 6 values
        # else:
        #     print("No Hand Detected (Vector is all zeros)")

        cv2.putText(image, f'Vector Size: {keypoint_vector.shape}', (10, 30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2, cv2.LINE_AA)

        # Show to screen
        cv2.imshow('SignBridge Hand Detection Feed', image)

        # Break gracefully
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

    # Clean up
    cap.release()
    cv2.destroyAllWindows()

In [33]:
draw_styled_landmarks(frame, results)

In [34]:
# plt.imshow(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

Setup Folder for Collection

In [None]:
# # Path for exported data, numpy arrays
DATA_PATH = os.path.join('MP_Data')

# 29 Common Indian Standard Hand Signs (ISL)
actions = np.array([
    'hello', 'thanks', 'sorry', 'please', 'yes', 'no',    # Greetings/Manners
    'I', 'you', 'name', 'time', 'what', 'where', 'how',  # Questions/Pronouns
    'help', 'learn', 'work', 'eat', 'drink', 'home',     # Actions/Places
    'good', 'bad', 'happy', 'sad', 'tired',              # Feelings
    'one', 'two', 'three', 'four', 'five'                # Numbers
])

# # Sixty sequences worth of data 
no_sequences = 60

# Videos are going to be 30 frames in length
sequence_length = 30

#Folder start
start_folder = 1

In [None]:
for action in actions: 
    for sequence in range(1, no_sequences + 1):
        try: 
            # This creates paths like: 'MP_Data/hello/1', 'MP_Data/thanks/2', etc.
            os.makedirs(os.path.join(DATA_PATH, action, str(sequence)))
        except:
            pass

Collect Keypoint Values for Training and Testing

In [None]:
break_flag = False

os.makedirs(DATA_PATH, exist_ok=True) 

cap = cv2.VideoCapture(0)

mp_hands = mp.solutions.hands 
with mp_hands.Hands(min_detection_confidence=0.5, min_tracking_confidence=0.5, max_num_hands=2) as hands:
     
    # Loop through actions (30 ISL Signs)
    for action in actions:
        # CHECK 1: Exit action loop if flag is set
        if break_flag:
            break
            
        # Loop through sequences aka videos (60 samples per sign)
        for sequence in range(start_folder, start_folder + no_sequences):
            # CHECK 2: Exit sequence loop if flag is set
            if break_flag:
                break
            
            # Loop through video length aka sequence length (30 frames per sample)
            for frame_num in range(sequence_length):

                # Read feed
                ret, frame = cap.read()
                if not ret: break

                # Make detections (Using the 'hands' model)
                image, results = mediapipe_detection(frame, hands) 
                draw_styled_landmarks(image, results)
                
                if frame_num == 0: 
                    # Display instruction
                    cv2.putText(image, 'SIGN: ' + action.upper(), (30, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 255), 5, cv2.LINE_AA)
                    cv2.putText(image, 'PRESS SPACE TO RECORD', (30, 400), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 3, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15, 12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    
                    cv2.imshow('SignBridge Data Collection Feed', image)
                    
                    while True:
                        key = cv2.waitKey(1)
                        if key & 0xFF == 32: # Spacebar pressed
                            break
                        if key & 0xFF == ord('q'):
                            # SET THE BREAK FLAG HERE (no 'global' needed as it's top-level)
                            break_flag = True
                            break # Break out of the inner while loop
                
                else: 
                    # Display recording status
                    cv2.putText(image, 'SIGN: ' + action.upper(), (30, 80), cv2.FONT_HERSHEY_SIMPLEX, 2, (0, 255, 255), 5, cv2.LINE_AA)
                    cv2.putText(image, 'RECORDING... Frame: {}'.format(frame_num), (30, 400), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, cv2.LINE_AA)
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15, 12), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    cv2.imshow('SignBridge Data Collection Feed', image)
                
                # Export keypoints
                keypoints = extract_keypoints(results)
                npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                np.save(npy_path, keypoints)

                
                if cv2.waitKey(10) & 0xFF == ord('q'):
                    break_flag = True
                    break 
            
    cap.release()
    cv2.destroyAllWindows()

if break_flag:
    print("\nData collection manually stopped by user ('q'). Clean exit from all loops.")
else:
    print("\nData collection complete.")


Data collection complete.


In [38]:
print(np.load('MP_Data/five/60/29.npy'))

[ 8.20976734e-01  5.06322384e-01  3.54597745e-07  8.64707053e-01
  4.61435795e-01  7.49870948e-03  8.83628249e-01  4.18436944e-01
  7.98900519e-03  8.97339821e-01  3.80015910e-01  8.63042381e-03
  9.19853449e-01  3.61854196e-01  8.57773237e-03  8.40409338e-01
  3.32743913e-01 -1.69777423e-02  8.60230863e-01  2.54025221e-01
 -2.75644213e-02  8.70371759e-01  2.04063684e-01 -3.19840387e-02
  8.77809882e-01  1.68231025e-01 -3.44632827e-02  8.04935217e-01
  3.32628727e-01 -2.34557949e-02  8.05117965e-01  2.36434609e-01
 -3.58894616e-02  8.03960145e-01  1.79003596e-01 -3.99411842e-02
  8.02050591e-01  1.36247873e-01 -4.21160460e-02  7.74348438e-01
  3.48875552e-01 -2.80257612e-02  7.58534431e-01  2.66039610e-01
 -4.09392491e-02  7.51441061e-01  2.13719666e-01 -5.12330979e-02
  7.47850955e-01  1.73928082e-01 -5.71116917e-02  7.50176966e-01
  3.76037627e-01 -3.02906446e-02  7.17127979e-01  3.24305683e-01
 -4.55108434e-02  6.96071565e-01  2.91274041e-01 -5.59962019e-02
  6.80379152e-01  2.61501

Preprocess Data and Create Labels and Features

In [39]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [40]:
label_map = {label:num for num, label in enumerate(actions)}

In [41]:
label_map

{'hello': 0,
 'thanks': 1,
 'sorry': 2,
 'please': 3,
 'yes': 4,
 'no': 5,
 'I': 6,
 'you': 7,
 'name': 8,
 'time': 9,
 'what': 10,
 'where': 11,
 'how': 12,
 'help': 13,
 'learn': 14,
 'work': 15,
 'eat': 16,
 'drink': 17,
 'home': 18,
 'good': 19,
 'bad': 20,
 'happy': 21,
 'sad': 22,
 'tired': 23,
 'one': 24,
 'two': 25,
 'three': 26,
 'four': 27,
 'five': 28}

In [None]:
sequences, labels = [], []
for action in actions:
    for sequence in np.array(os.listdir(os.path.join(DATA_PATH, action))).astype(int):
        window = []
        is_valid_sequence = True # New flag to check for bad data
        for frame_num in range(sequence_length):
            try:
                res = np.load(os.path.join(DATA_PATH, action, str(sequence), "{}.npy".format(frame_num)))
                
                # If the entire frame is all zeros, skip the whole sequence
                if not np.any(res):
                    is_valid_sequence = False
                    break 
                    
                window.append(res)
            except FileNotFoundError:
                print(f"File not found: {action}/{sequence}/{frame_num}. Skipping sequence.")
                is_valid_sequence = False
                break
        
        # Only append the window and label if all 30 frames were loaded and valid
        if is_valid_sequence:
            sequences.append(window)
            labels.append(label_map[action])
        else:
            pass

In [43]:
np.array(sequences).shape

(1638, 30, 63)

In [44]:
X = np.array(sequences)

In [45]:
X.shape

(1638, 30, 63)

In [46]:
y = to_categorical(labels).astype(int)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Added random_state for reproducibility

print("--- Data Structure Summary ---")
print(f"Total Sequences Loaded: {len(X)}")
print(f"X_train Shape (Features): {X_train.shape}")
print(f"y_train Shape (Labels): {y_train.shape}")
print(f"X_test Shape (Features): {X_test.shape}")
print(f"y_test Shape (Labels): {y_test.shape}")

--- Data Structure Summary ---
Total Sequences Loaded: 1638
X_train Shape (Features): (1310, 30, 63)
y_train Shape (Labels): (1310, 29)
X_test Shape (Features): (328, 30, 63)
y_test Shape (Labels): (328, 29)


Build and Train LSTM Neural Network

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard

In [49]:
sequence_length = X_train.shape[1]   # 30 (time steps)
n_features = X_train.shape[2]        # 63 (keypoints features)
n_classes = y_train.shape[1]         # 29 (number of signs)

In [None]:
log_dir = os.path.join('Logs')
tb_callback = TensorBoard(log_dir=log_dir)

model = Sequential()

# Layer 1: LSTM Input
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(sequence_length, n_features)))
model.add(Dropout(0.2)) # Added Dropout for robustness

# Layer 2: Second LSTM Layer
model.add(LSTM(128, return_sequences=True, activation='relu'))

# Layer 3: Final LSTM Layer (return_sequences=False to output a single vector)
model.add(LSTM(64, return_sequences=False, activation='relu'))
model.add(Dropout(0.2)) # Added Dropout for robustness

# Layer 4: Dense Classification Head
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

# Layer 5: Output Layer (CRITICAL CHANGE: Output units must be n_classes = 29)
model.add(Dense(n_classes, activation='softmax'))

In [51]:
model.compile(optimizer='Adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

In [None]:
print("\n--- Starting Model Training (200 Epochs) ---")
history = model.fit(X_train, y_train, epochs=200, callbacks=[tb_callback])


--- Starting Model Training (200 Epochs) ---
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
E

In [53]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_3 (LSTM)               (None, 30, 64)            32768     
                                                                 
 dropout_2 (Dropout)         (None, 30, 64)            0         
                                                                 
 lstm_4 (LSTM)               (None, 30, 128)           98816     
                                                                 
 lstm_5 (LSTM)               (None, 64)                49408     
                                                                 
 dropout_3 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 64)                4160      
                                                                 
 dense_4 (Dense)             (None, 32)               

In [54]:
res = model.predict(X_test)



In [55]:
sample_index = 4

#Print the Model's Prediction
model_prediction_index = np.argmax(res[sample_index])
predicted_sign = actions[model_prediction_index]

print(f"Model Predicted Sign: **{predicted_sign}**")

Model Predicted Sign: **one**


In [56]:
#Print the True Label
true_label_index = np.argmax(y_test[sample_index])
true_sign = actions[true_label_index]

print(f"True Correct Sign:    **{true_sign}**")

True Correct Sign:    **one**


Evaluation using Confusion Matrix and Accuracy

In [57]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
import numpy as np
import pandas as pd

In [None]:
yhat_probs = model.predict(X_test, verbose=0)

In [None]:
ytrue = np.argmax(y_test, axis=1)

In [None]:
yhat = np.argmax(yhat_probs, axis=1)

In [None]:
overall_accuracy = accuracy_score(ytrue, yhat)

In [62]:
# This returns 29 separate 2x2 matrices
cm = multilabel_confusion_matrix(ytrue, yhat)

In [63]:
# 5. Display Results Clearly
print('\n--- Final Model Evaluation ---')
print(f"Overall Test Accuracy: {overall_accuracy * 100:.2f}%")

print('\n--- Confusion Matrix Summary (First 5 Signs) ---')
# We print the first 5 matrices for inspection.
# Each matrix is [TN, FP] and [FN, TP] for that class vs. all others.
for i in range(min(5, len(actions))):
    print(f"Sign: {actions[i].upper()}")
    # Using pandas DataFrame for clean display
    df_cm = pd.DataFrame(cm[i], index=['Actual NEG', 'Actual POS'], columns=['Pred NEG', 'Pred POS'])
    print(df_cm)
    print("-" * 20)


--- Final Model Evaluation ---
Overall Test Accuracy: 97.87%

--- Confusion Matrix Summary (First 5 Signs) ---
Sign: HELLO
            Pred NEG  Pred POS
Actual NEG       318         0
Actual POS         0        10
--------------------
Sign: THANKS
            Pred NEG  Pred POS
Actual NEG       318         0
Actual POS         0        10
--------------------
Sign: SORRY
            Pred NEG  Pred POS
Actual NEG       318         0
Actual POS         0        10
--------------------
Sign: PLEASE
            Pred NEG  Pred POS
Actual NEG       315         0
Actual POS         0        13
--------------------
Sign: YES
            Pred NEG  Pred POS
Actual NEG       312         0
Actual POS         0        16
--------------------


Save the Model

In [64]:
model.save('signbridge_isl_29_signs.h5')

  saving_api.save_model(
