In [1]:
import mediapipe as mp
import cv2
import numpy as np
from datetime import datetime

In [10]:
class DetectedGesture:
    def __init__(self) -> None:
        self.category_name = None
        self.start_time = None
        self.end_time = None

BaseOptions = mp.tasks.BaseOptions
GestureRecognizer = mp.tasks.vision.GestureRecognizer
GestureRecognizerOptions = mp.tasks.vision.GestureRecognizerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Create a gesture recognizer instance with the video mode:
options = GestureRecognizerOptions(
    base_options=BaseOptions(model_asset_path='../assets/gesture_recognizer.task'),
    running_mode=VisionRunningMode.VIDEO)

# Use OpenCV’s VideoCapture to load the input video.
cap = cv2.VideoCapture('../assets/hand_gesture.mp4')

# Load the frame rate of the video using OpenCV’s CV_CAP_PROP_FPS
# You’ll need it to calculate the timestamp for each frame.
fps = cap.get(cv2.CAP_PROP_FPS)

time_stamp_ms = 0
recognizer = GestureRecognizer.create_from_options(options)
current_gesture = DetectedGesture()
all_gestures = []

start_time = datetime.now()
while cap.isOpened():
    # Read the frame from OpenCV’s VideoCapture.
    success, frame = cap.read()

    # Break out of the loop if there are no more frames.
    if not success:
        break

    # Convert the frame to RGB using OpenCV’s cvtColor().
    # rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    # Convert the frame to a numpy array using OpenCV’s asarray().
    numpy_frame_from_opencv = np.asarray(frame)

    # Convert the frame received from OpenCV to a MediaPipe’s Image object.
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=numpy_frame_from_opencv)

    # Process the image using the gesture recognizer.
    results = recognizer.recognize_for_video(mp_image, time_stamp_ms)

    # Print the gesture label and the timestamp of the frame.
    if results.gestures:
        gesture_name = results.gestures[0][0].category_name
        
        if not current_gesture.start_time:
            current_gesture.start_time = time_stamp_ms
            current_gesture.category_name = gesture_name
        
        if current_gesture.category_name != gesture_name:
            all_gestures.append(current_gesture)
            current_gesture = DetectedGesture()
        
        current_gesture.end_time = time_stamp_ms
        print(f'{gesture_name} at {time_stamp_ms / 1000} s')
    
    time_stamp_ms += round(1000 / fps)

all_gestures.append(current_gesture)

print(f'Elapsed time: {datetime.now() - start_time}')

Open_Palm at 0.924 s
Open_Palm at 0.957 s
Open_Palm at 0.99 s
Open_Palm at 1.023 s
Open_Palm at 1.056 s
Open_Palm at 1.089 s
Open_Palm at 1.122 s
Open_Palm at 1.155 s
Open_Palm at 1.188 s
Open_Palm at 1.221 s
Open_Palm at 1.254 s
Open_Palm at 1.287 s
Open_Palm at 1.32 s
Open_Palm at 1.353 s
Open_Palm at 1.386 s
Open_Palm at 1.419 s
Open_Palm at 1.452 s
Open_Palm at 1.485 s
Open_Palm at 1.518 s
Open_Palm at 1.551 s
Open_Palm at 1.584 s
Open_Palm at 1.617 s
Open_Palm at 1.65 s
Open_Palm at 1.683 s
Open_Palm at 1.716 s
Open_Palm at 1.749 s
Open_Palm at 1.782 s
Open_Palm at 1.815 s
Open_Palm at 1.848 s
Open_Palm at 1.881 s
Open_Palm at 1.914 s
Open_Palm at 1.947 s
Open_Palm at 1.98 s
Open_Palm at 2.013 s
Open_Palm at 2.046 s
Open_Palm at 2.079 s
Open_Palm at 2.112 s
Open_Palm at 2.145 s
Open_Palm at 2.178 s
None at 2.211 s
None at 2.244 s
Closed_Fist at 2.277 s
Closed_Fist at 2.31 s
Closed_Fist at 2.343 s
Closed_Fist at 2.376 s
Closed_Fist at 2.409 s
Closed_Fist at 2.442 s
Closed_Fist at 2

In [11]:
for gesture in all_gestures:
    print(f'{gesture.category_name} from {gesture.start_time / 1000} s to {gesture.end_time / 1000} s')

Open_Palm from 0.924 s to 2.178 s
None from 2.244 s to 2.244 s
Closed_Fist from 2.31 s to 3.597 s
None from 3.663 s to 3.663 s
Victory from 3.729 s to 4.95 s
None from 5.016 s to 5.016 s
ILoveYou from 5.082 s to 7.359 s
None from 7.425 s to 7.59 s
Thumb_Down from 7.656 s to 8.811 s
Closed_Fist from 8.877 s to 9.042 s
