# 1. Import and Install Dependencies
Python version 3.12.4 has been used in the testing environment on macOS.

In [1]:
!pip install numpy scikit-learn opencv-python tensorflow keras mediapipe


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import os
import cv2
import numpy as np
import mediapipe as mp

# 2. Keypoints using MP Holistic

Initialize MediaPipe Holistic model and drawing utilities

In [3]:
# 'mp_holistic' provides the holistic model which includes face, pose, and hand landmarks detection.
mp_holistic = mp.solutions.holistic

# 'mp_drawing' provides utility functions for drawing the detected landmarks on images.
mp_drawing = mp.solutions.drawing_utils

In [4]:
# Function to perform MediaPipe detection on an image
def mediapipe_detection(image, model):
    # Convert the image from BGR to RGB color space.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
    # Mark the image as not writable to improve performance by preventing unnecessary data copying.
    image.flags.writeable = False 
    
    # Process the image using the provided model to detect landmarks.
    results = model.process(image)
    
    # Mark the image as writable again for further operations.
    image.flags.writeable = True
    
    # Convert the image back from RGB to BGR color space.
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    
    # Return the processed image and the detection results.
    return image, results

In [5]:
# Function to draw landmarks on an image based on the detection results
def draw_landmarks(image, results):
    # If face landmarks are detected, draw them on the image.
    if results.face_landmarks:
        mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS)
    
    # If pose landmarks are detected, draw them on the image.
    if results.pose_landmarks:
        mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS)
    
    # If left hand landmarks are detected, draw them on the image.
    if results.left_hand_landmarks:
        mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS)
    
    # If right hand landmarks are detected, draw them on the image.
    if results.right_hand_landmarks:
        mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS)


# 3. Extract Keypoint Values

In [6]:
# Function to extract keypoint values from the MediaPipe detection results
def extract_keypoints(results):
    # Extract pose landmarks (if available) and flatten the list.
    # Each landmark includes x, y, z coordinates and visibility.
    # If pose landmarks are not available, return an array of zeros with the same length.
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    
    # Extract face landmarks (if available) and flatten the list.
    # Each landmark includes x, y, z coordinates.
    # If face landmarks are not available, return an array of zeros with the same length.
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    
    # Extract left hand landmarks (if available) and flatten the list.
    # Each landmark includes x, y, z coordinates.
    # If left hand landmarks are not available, return an array of zeros with the same length.
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    
    # Extract right hand landmarks (if available) and flatten the list.
    # Each landmark includes x, y, z coordinates.
    # If right hand landmarks are not available, return an array of zeros with the same length.
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    
    # Concatenate all the extracted keypoints (pose, face, left hand, right hand) into a single array.
    return np.concatenate([pose, face, lh, rh])

# 4. Define Actions

In [7]:
# Define the list of actions to be collected (e.g., 'one', 'two', 'three')
actions = np.array(['one', 'two', 'three'])

# 5. Define and Load the Model

In [8]:
# Import necessary modules from Keras.
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import TensorBoard

In [9]:
# Define the directory to store the TensorBoard logs.
log_dir = os.path.join('logs')

# Create a TensorBoard callback object to log training information.
tb_callback = TensorBoard(log_dir=log_dir)

# Initialize a sequential model. This type of model is appropriate for a plain stack of layers where each layer has exactly one input tensor and one output tensor.
model = Sequential()

# Add an LSTM (Long Short-Term Memory) layer with 64 units. 
# return_sequences=True ensures that the output of this layer is a sequence.
# activation='relu' specifies the ReLU activation function.
# input_shape=(30,1662) defines the shape of the input data (30 time steps and 1662 features).
model.add(LSTM(64, return_sequences=True, activation='relu', input_shape=(30, 1662)))

# Add another LSTM layer with 128 units, also returning sequences.
model.add(LSTM(128, return_sequences=True, activation='relu'))

# Add a third LSTM layer with 64 units, not returning sequences (only the output of the last time step).
model.add(LSTM(64, return_sequences=False, activation='relu'))

# Add a Dense (fully connected) layer with 64 units and ReLU activation.
model.add(Dense(64, activation='relu'))

# Add another Dense layer with 32 units and ReLU activation.
model.add(Dense(32, activation='relu'))

# Add the output Dense layer with a number of units equal to the number of actions (3 in this case: 'hello', 'thanks', 'iloveyou').
# The softmax activation function is used to get a probability distribution over the actions.
model.add(Dense(actions.shape[0], activation='softmax'))

  super().__init__(**kwargs)


In [10]:
# Load pre-trained weights into the model from a file named 'collection.h5'.
model.load_weights('model.h5')

# 6. Test in Real Time

In [11]:
# Defining colors for the probability visualization.
colors = [(245,117,16), (117,245,16), (16,117,245)]

# Function to visualize the probabilities of each action.
def prob_viz(res, actions, input_frame, colors):
    output_frame = input_frame.copy()
    for num, prob in enumerate(res):
        cv2.rectangle(output_frame, (0,60+num*40), (int(prob*100), 90+num*40), colors[num], -1)
        cv2.putText(output_frame, actions[num], (0, 85+num*40), cv2.FONT_HERSHEY_SIMPLEX, 1, (255,255,255), 2, cv2.LINE_AA)
    return output_frame

In [12]:
# Initialize variables for storing the sequence of keypoints, the predicted sentence, and model predictions.
sequence = []
sentence = []
predictions = []
threshold = 0.5  # Confidence threshold for action detection.

In [13]:
# Open a connection to the webcam.
# For macos device index must be 1, for windows device index must be 0
cap = cv2.VideoCapture(1)

# Using the holistic model from MediaPipe with specified confidence thresholds for detection and tracking.
with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
    while cap.isOpened():
        # Read a frame from the webcam.
        ret, frame = cap.read()

        # Perform MediaPipe detection on the frame to get landmarks.
        image, results = mediapipe_detection(frame, holistic)
        print(results)  # Print the results for debugging.

        # Draw the landmarks on the image.
        draw_landmarks(image, results)

        # Extract keypoints from the results and append to the sequence.
        keypoints = extract_keypoints(results)
        sequence.append(keypoints)
        sequence = sequence[-30:]  # Keep only the last 30 frames in the sequence.

        # Once we have a sequence of 30 frames, make a prediction.
        if len(sequence) == 30:
            res = model.predict(np.expand_dims(sequence, axis=0))[0]
            print(actions[np.argmax(res)])  # Print the predicted action for debugging.
            predictions.append(np.argmax(res))

            # Check if the prediction is consistent over the last 10 frames and above the confidence threshold.
            if np.unique(predictions[-10:])[0] == np.argmax(res):
                if res[np.argmax(res)] > threshold:
                    if len(sentence) > 0:
                        if actions[np.argmax(res)] != sentence[-1]:
                            sentence.append(actions[np.argmax(res)])
                    else:
                        sentence.append(actions[np.argmax(res)])

            # Limit the sentence length to the last 5 actions.
            if len(sentence) > 5:
                sentence = sentence[-5:]

            # Visualize the probabilities of the predictions.
            image = prob_viz(res, actions, image, colors)

        # Draw a rectangle and the current sentence on the image.
        cv2.rectangle(image, (0, 0), (640, 40), (245, 117, 16), -1)
        cv2.putText(image, ' '.join(sentence), (3, 30), 
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

        # Display the image.
        cv2.imshow('OpenCV Feed', image)

        # Break the loop if 'q' is pressed.
        if cv2.waitKey(10) & 0xFF == ord('q'):
            break

    # Release the webcam and close all OpenCV windows.
    cap.release()
    cv2.destroyAllWindows()


I0000 00:00:1723657287.913009  108290 gl_context.cc:357] GL version: 2.1 (2.1 Metal - 88.1), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1723657287.995496  108498 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1723657288.002672  108500 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1723657288.003977  108500 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1723657288.004043  108497 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1723657288.004600  108495 inference_feedback_manager.cc:114] Feedback manager requires a mod

<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.solution_base.SolutionOutputs'>
<class 'mediapipe.python.soluti

In [None]:
cv2.waitKey(1)
cv2.waitKey(1)
cap.release()
cv2.destroyAllWindows()