# Import necessary libraries

In [3]:
import os
import cv2
import time
import numpy as np
import mediapipe as mp
from matplotlib import pyplot as plt

# Initialize MediaPipe holistic model and drawing utilities

In [4]:
mp_holistic = mp.solutions.holistic # Holistic solution module - provides the Holistic model class and the FACEMESH/POSE/HAND connection constants used below
mp_drawing = mp.solutions.drawing_utils # Drawing utilities - provides draw_landmarks and DrawingSpec for visualizing detected landmarks on frames

# Section 1: Utility Functions
-----------------------------------------------------------------------------------

In [5]:
def draw_styled_landmarks(image, results):
    """
    Draws styled landmarks on the input image for face, pose, and hands using MediaPipe.

    The image is modified in place; nothing is returned. Each landmark group is
    passed straight to ``mp_drawing.draw_landmarks`` even when it was not detected
    (the attribute is then expected to be None and MediaPipe is relied on to skip
    it - NOTE(review): confirm for the installed MediaPipe version).

    Parameters:
        image (ndarray): The image on which landmarks will be drawn.
        results (object): MediaPipe holistic model predictions exposing
            ``face_landmarks``, ``pose_landmarks``, ``left_hand_landmarks`` and
            ``right_hand_landmarks``.
    """
    # In every call below the first DrawingSpec styles the landmark points and the
    # second styles the connections between them. Colour channels must stay within
    # 0-255 (8-bit image).

    # Draw face connections
    mp_drawing.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_CONTOURS,
                             mp_drawing.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1),
                             mp_drawing.DrawingSpec(color=(80,255,121), thickness=1, circle_radius=1)  # was 256: out of 8-bit range
                             )
    # Draw pose connections
    mp_drawing.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(80,22,10), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(80,44,121), thickness=2, circle_radius=2)
                             )
    # Draw left hand connections
    mp_drawing.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(121,22,76), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(121,44,250), thickness=2, circle_radius=2)
                             )
    # Draw right hand connections
    mp_drawing.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                             mp_drawing.DrawingSpec(color=(245,117,66), thickness=2, circle_radius=4),
                             mp_drawing.DrawingSpec(color=(245,66,230), thickness=2, circle_radius=2)
                             )

In [6]:
def extract_keypoints(results):
    """
    Flattens the landmarks of a MediaPipe holistic prediction into one feature vector.

    Parameters:
        results (object): MediaPipe holistic model predictions.

    Returns:
        ndarray: 1-D array made of, in this order, pose (33 landmarks x 4 values:
                 x, y, z, visibility), face (468 x 3), left hand (21 x 3) and
                 right hand (21 x 3). Any group that was not detected is filled
                 with zeros of the same length.
    """
    xyz = ('x', 'y', 'z')

    def _flatten(landmark_group, fields, expected_count):
        # Missing group -> zero placeholder sized expected_count * len(fields)
        if not landmark_group:
            return np.zeros(expected_count * len(fields))
        rows = [[getattr(point, name) for name in fields] for point in landmark_group.landmark]
        return np.array(rows).flatten()

    # (landmark group, values read per landmark, landmark count used for the zero fill)
    groups = (
        (results.pose_landmarks, xyz + ('visibility',), 33),
        (results.face_landmarks, xyz, 468),
        (results.left_hand_landmarks, xyz, 21),
        (results.right_hand_landmarks, xyz, 21),
    )
    return np.concatenate([_flatten(group, fields, count) for group, fields, count in groups])

In [7]:
def mediapipe_detection(image, model):
    """
    Runs a MediaPipe model on a single BGR frame.

    Parameters:
        image (ndarray): The input frame in OpenCV's BGR channel order.
        model (object): MediaPipe holistic model (anything exposing ``process``).

    Returns:
        tuple: (frame converted back to BGR, holistic model predictions).
    """
    # OpenCV captures frames as BGR, while MediaPipe expects RGB input
    rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Mark the frame read-only for the duration of the prediction: MediaPipe does
    # not need to modify it, and this guards against accidental changes
    rgb_frame.setflags(write=False)
    results = model.process(rgb_frame)   # landmarks for face, pose and hands
    rgb_frame.setflags(write=True)

    # Color conversion RGB -> BGR so the frame can be drawn on and shown with OpenCV
    bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
    return bgr_frame, results

# Section 2: Data Collection Setup
----------------------------------------------

In [8]:
# Root folder (relative to the working directory) for the collected keypoint arrays
DATA_PATH = 'MP_Data'

# Actions to record; each one gets its own sub-folder under DATA_PATH
actions = np.array(['hello', 'yes', 'no', 'thank-you'])

# 30 sequences (videos) are recorded per action, each 30 frames long
no_sequences, sequence_length = 30, 30

In [9]:
# Create directories to store data for each action and sequence

# The folder structure will look like:
# MP_Data/
#   ├── hello/
#   │    ├── 0/
#   │    │    ├── 0.npy
#   │    │    ├── 1.npy
#   │    │    └── ...
#   │    └── ...
#   ├── no/
#   ├── thank-you/
#   └── yes/

for action in actions:
    for sequence in range(no_sequences):
        # exist_ok=True makes the cell safe to re-run on existing folders while
        # still surfacing real failures (permissions, invalid path, ...) instead
        # of silently swallowing them.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

# Section 3: Data Collection Loop
------------------------------

In [53]:
# Connection to the webcam
cap = cv2.VideoCapture(0)

# Set when the user presses 'q' or the camera stops delivering frames, so that all
# three nested loops unwind (a bare `break` would only leave the innermost one).
stop_collection = False

try:
    if not cap.isOpened():
        raise RuntimeError('Could not open the webcam (device index 0).')

    # Initialize MediaPipe Holistic model
    # The holistic model detects landmarks for the face, pose, and hands.
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:

        # Loop through each action in the list
        for action in actions:
            # Loop through the number of sequences (videos) for the current action
            for sequence in range(no_sequences):
                # Loop through the frames within each sequence
                for frame_num in range(sequence_length):

                    # Capture a frame from the webcam
                    ret, frame = cap.read()
                    if not ret:
                        # No frame was delivered (camera unplugged/busy); `frame` is
                        # unusable, so stop rather than crash inside cv2.cvtColor.
                        print('Failed to read a frame from the webcam; stopping collection.')
                        stop_collection = True
                        break

                    # Use MediaPipe to detect landmarks in the frame
                    image, results = mediapipe_detection(frame, holistic)

                    # Draw detected landmarks (face, pose, hands) on the frame
                    draw_styled_landmarks(image, results)

                    is_first_frame = frame_num == 0

                    # Announce the start of a new sequence on its first frame
                    if is_first_frame:
                        cv2.putText(image, 'STARTING COLLECTION', (120,200),
                                   cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255, 0), 4, cv2.LINE_AA)

                    # Display the collection status on every frame
                    cv2.putText(image, 'Collecting frames for {} Video Number {}'.format(action, sequence), (15,12),
                               cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1, cv2.LINE_AA)
                    cv2.imshow('OpenCV Feed', image)

                    # Pause for 2 seconds before a sequence starts so the user can get ready
                    if is_first_frame:
                        cv2.waitKey(2000)

                    # Extract keypoints from the detected landmarks
                    keypoints = extract_keypoints(results)
                    # Define the save path for the keypoints (np.save appends '.npy')
                    npy_path = os.path.join(DATA_PATH, action, str(sequence), str(frame_num))
                    # Save the keypoints as a .npy file
                    np.save(npy_path, keypoints)

                    # Allow user to gracefully exit the whole collection by pressing 'q'.
                    # Note: the sequence in progress is left with fewer than
                    # sequence_length frames on disk.
                    if cv2.waitKey(10) & 0xFF == ord('q'):
                        stop_collection = True
                        break

                if stop_collection:
                    break
            if stop_collection:
                break
finally:
    # Release the webcam and close all OpenCV windows, even if an error occurred,
    # so the camera is not left locked by this kernel.
    cap.release()
    cv2.destroyAllWindows()