### Key Points 

### Importing Dependencies 

In [1]:
import cv2 
from matplotlib import pyplot as plt
import numpy as np 
import os 
import time 
import mediapipe as mp 


from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

Matplotlib is building the font cache; this may take a moment.


### Extracting Key Points

In [2]:
# Short aliases for the two MediaPipe modules the rest of the notebook uses.
# mp_holistic supplies the Holistic class plus the FACEMESH_TESSELATION / POSE_CONNECTIONS /
# HAND_CONNECTIONS constants; mp_drawing supplies draw_landmarks and DrawingSpec.
mp_holistic = mp.solutions.holistic                       # Holistic model
mp_drawing = mp.solutions.drawing_utils                  # Drawing utilities

In [3]:
def mediapipe_detection(img, model):
    """
        Runs a MediaPipe model on a single OpenCV frame.

        Parameters:
        - img: The input frame in BGR channel order (as read by OpenCV). It is not modified.
        - model: A MediaPipe solution instance exposing `process()` (e.g., mp_holistic.Holistic).

        Returns:
        - image_bgr: A new array holding the frame converted to RGB and back to BGR,
                     so callers can draw on it without touching `img`.
        - results: Whatever `model.process()` returned for the RGB version of the frame.
                   For a Holistic model the rest of this notebook reads its face_landmarks,
                   pose_landmarks, left_hand_landmarks and right_hand_landmarks attributes.
    """
    # OpenCV delivers BGR; the model is fed RGB.
    image_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # cvtColor already returned a fresh array, so it is passed straight to the model
    # (no additional per-frame copy).
    results = model.process(image_rgb)
    # Back to BGR so the frame can be annotated and shown with cv2.imshow.
    image_bgr = cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR)
    return image_bgr, results


def annotate_with_landmarks(img, res):
    """
        Overlays the face mesh, body pose and both hands on a frame.

        Parameters:
        - img: The frame to draw on, expected in BGR format.
        - res: Detection results exposing face_landmarks, pose_landmarks,
               right_hand_landmarks and left_hand_landmarks.

        Returns:
        - None. Everything is drawn onto `img` itself, each layer with its own
          landmark and connection styling.
    """
    style = mp_drawing.DrawingSpec

    # One row per layer, in drawing order:
    # (attribute on `res`, connection set, landmark style, connection style)
    layers = (
        ('face_landmarks', mp_holistic.FACEMESH_TESSELATION,
         style(color=(247, 198, 246), thickness=1, circle_radius=1),
         style(color=(115, 61, 191), thickness=1, circle_radius=1)),
        ('pose_landmarks', mp_holistic.POSE_CONNECTIONS,
         style(color=(250, 249, 187), thickness=2, circle_radius=4),
         style(color=(158, 207, 255), thickness=2, circle_radius=2)),
        ('right_hand_landmarks', mp_holistic.HAND_CONNECTIONS,
         style(color=(224, 224, 164), thickness=2, circle_radius=4),
         style(color=(167, 204, 169), thickness=2, circle_radius=2)),
        ('left_hand_landmarks', mp_holistic.HAND_CONNECTIONS,
         style(color=(245, 117, 66), thickness=2, circle_radius=4),
         style(color=(151, 199, 154), thickness=2, circle_radius=2)),
    )

    for attr_name, connections, landmark_style, connection_style in layers:
        mp_drawing.draw_landmarks(
            img, getattr(res, attr_name), connections,
            landmark_style, connection_style)



    

In [4]:
# Live preview: show the webcam feed with the detected landmarks drawn on top.
# Press "q" in the preview window to stop.
webcam = cv2.VideoCapture(0)                               # device number 0
try:
    with mp_holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as holistic:
        while webcam.isOpened():                           # while the webcam is turned on
            ok, frame = webcam.read()                      # (success flag, frame)
            if not ok:                                     # no frame delivered: stop the preview
                break
            # run detection; `img` is the frame handed back for drawing
            # (`res` stays at module level - the shape check in a later cell reads it)
            img, res = mediapipe_detection(frame, holistic)
            # annotate the frame with the detected landmarks
            annotate_with_landmarks(img, res)

            cv2.imshow('Camera Feed', img)                 # display the image feed

            if cv2.waitKey(10) & 0xFF == ord('q'):         # close the feed using key "q"
                break
finally:
    # Runs on normal exit, on errors and on a kernel interrupt, so the camera
    # is never left open by this cell.
    webcam.release()
    cv2.destroyAllWindows()
    # NOTE(review): the extra waitKey/sleep presumably give the GUI time to really
    # close the window - kept exactly as in the original teardown.
    cv2.waitKey(1)
    time.sleep(1)

I0000 00:00:1711671358.808760       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.


### Getting Essential Key Point Values 

In [5]:
def extract_landmarks(landmarks, dimensions, default_value=0):
    """Flatten a landmark list into a 1-D array, or return zeros when it is missing.

    Parameters:
    - landmarks: Object exposing a `.landmark` iterable (e.g. a MediaPipe landmark list),
                 or None / any falsy value when nothing was detected.
    - dimensions: Attribute names read from every landmark, in output order
                  (e.g. ['x', 'y', 'z']).
    - default_value: LENGTH of the zero vector returned when `landmarks` is missing
                     (despite the name, it is not a fill value).

    Returns:
    - 1-D numpy array. With landmarks present it has
      len(landmarks.landmark) * len(dimensions) entries, ordered landmark by landmark;
      otherwise it is np.zeros(default_value).
    """
    # Nothing detected: the caller supplies the expected length so that the
    # concatenated feature vector keeps a fixed size.
    if not landmarks:
        return np.zeros(default_value)

    per_landmark = [
        [getattr(point, dim) for dim in dimensions]
        for point in landmarks.landmark
    ]
    return np.array(per_landmark).flatten()

def extract_keypoints(results):
    """Build one flat feature vector from a detection result.

    Reads pose_landmarks, face_landmarks, left_hand_landmarks and
    right_hand_landmarks from `results` (in that order), flattens each with
    extract_landmarks and joins them. Pose landmarks contribute
    x, y, z and visibility; face and hands contribute x, y, z only.
    A missing part is replaced by a zero vector of the listed length
    (presumably landmark count * fields per landmark - TODO confirm counts).

    Returns:
    - 1-D numpy array of all parts concatenated.
    """
    xyz = ['x', 'y', 'z']

    # (attribute on `results`, fields read per landmark, zero-vector length if missing)
    parts = (
        ('pose_landmarks', ['x', 'y', 'z', 'visibility'], 33*4),
        ('face_landmarks', xyz, 468*3),
        ('left_hand_landmarks', xyz, 21*3),
        ('right_hand_landmarks', xyz, 21*3),
    )

    return np.concatenate([
        extract_landmarks(getattr(results, attr_name), fields, fallback_len)
        for attr_name, fields, fallback_len in parts
    ])

In [6]:
# Sanity check on `res`, the detection left over from the last frame of the camera-feed cell.
# Expected length: 33*4 + 468*3 + 21*3 + 21*3 = 1662 values per frame.
extract_keypoints(res).shape

(1662,)

### Setting Up Folders for the Collected Keypoints

In [16]:
# Root folder under which the keypoint data is organised
DATA_PATH = 'media-pipe-data'
# Actions we would want to do on the TV
actions = np.array(['on', 'off', 'vol_up', 'vol_down', 'netflix', 'amazon_prime'])
# Number of sequences recorded per action
num_sequences = 30
# Length of each sequence, in frames
seq_length = 30


# One folder per (action, sequence): <DATA_PATH>/<action>/<sequence index>
for action in actions:
    for sequence in range(num_sequences):
        # exist_ok=True makes re-running the cell harmless, while real failures
        # (e.g. permissions, a file in the way) are still raised instead of ignored.
        os.makedirs(os.path.join(DATA_PATH, action, str(sequence)), exist_ok=True)

### Collect Keypoint Values for Training and Testing

In [17]:
# Record `seq_length` frames for every (action, sequence) pair and store each frame's
# keypoint vector as <DATA_PATH>/<action>/<sequence>/<frame>.npy.
# Press "q" in the preview window to abort the whole collection run.
webcam = cv2.VideoCapture(0)
stop_collection = False                 # set when the camera stops delivering frames or "q" is pressed
try:
    with mp_holistic.Holistic(
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as holistic:

        for curr_action in actions:
            for seq in range(num_sequences):
                # Make sure the target folder exists even if the setup cell was not run
                seq_dir = os.path.join(DATA_PATH, curr_action, str(seq))
                os.makedirs(seq_dir, exist_ok=True)

                for f in range(seq_length):
                    ok, frame = webcam.read()
                    if not ok:
                        stop_collection = True
                        break
                    image, results = mediapipe_detection(frame, holistic)
                    annotate_with_landmarks(image, results)

                    if f == 0:
                        cv2.putText(image, 'STARTING DATA COLLECTION', (120, 200),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 4, cv2.LINE_AA)
                    cv2.putText(image, f'Collecting frames for {curr_action} Video Number {seq}', (15, 12),
                                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1, cv2.LINE_AA)
                    cv2.imshow('Camera Feed', image)
                    if f == 0:
                        cv2.waitKey(2000)  # Wait 2 seconds on the first frame of each sequence

                    # Persist this frame's feature vector (np.save appends ".npy" to the path)
                    keypoints = extract_keypoints(results)
                    np_path = os.path.join(seq_dir, str(f))
                    np.save(np_path, keypoints)

                    if cv2.waitKey(1) & 0xFF == ord('q'):  # Allow quick exit with 'q'
                        stop_collection = True
                        break

                # A plain `break` only leaves the frame loop; propagate the stop outwards
                if stop_collection:
                    break
            if stop_collection:
                break
finally:
    # Runs on normal completion, on errors and on a kernel interrupt,
    # so the camera is always released by this cell.
    webcam.release()
    cv2.destroyAllWindows()
    # NOTE(review): presumably lets the GUI process the window close, as the
    # other teardown code in this notebook does - confirm.
    cv2.waitKey(1)


I0000 00:00:1711671829.228980       1 gl_context.cc:344] GL version: 2.1 (2.1 Metal - 88), renderer: Apple M1


KeyboardInterrupt: 

In [18]:
# Manual cleanup: releases the camera and closes the OpenCV window. The recorded output
# of the collection cell shows it ended with KeyboardInterrupt, i.e. before its own
# release/destroy calls were reached.
webcam.release()
cv2.destroyAllWindows()
# NOTE(review): the extra waitKey/sleep presumably give the GUI time to actually
# close the window - confirm. Keep the statement order as is.
cv2.waitKey(1)
time.sleep(1)

### Preprocessing Data and Labelling Features 

In [None]:
# Map every action name to its integer class index (its position in `actions`).
label_map = dict(zip(actions, range(len(actions))))
label_map