##### Copyright 2023 The MediaPipe Authors. All Rights Reserved.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Hand Landmarks Detection with MediaPipe Tasks

This notebook shows you how to use MediaPipe Tasks Python API to detect hand landmarks from images.

## Preparation

Let's start with installing MediaPipe.

In [1]:
!pip install -q mediapipe

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.6/35.6 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25h

Then download an off-the-shelf model bundle. Check out the [MediaPipe documentation](https://developers.google.com/mediapipe/solutions/vision/hand_landmarker#models) for more information about this model bundle.

In [2]:
!wget -q https://storage.googleapis.com/mediapipe-models/hand_landmarker/hand_landmarker/float16/1/hand_landmarker.task

## Visualization utilities

In [3]:
#@markdown We implemented some functions to visualize the hand landmark detection results. <br/> Run the following cell to activate the functions.

from mediapipe import solutions
from mediapipe.framework.formats import landmark_pb2
import numpy as np

MARGIN = 10  # pixels
FONT_SIZE = 1
FONT_THICKNESS = 1
HANDEDNESS_TEXT_COLOR = (88, 205, 54) # vibrant green

def draw_landmarks_on_image(rgb_image, detection_result):
  hand_landmarks_list = detection_result.hand_landmarks
  handedness_list = detection_result.handedness
  annotated_image = np.copy(rgb_image)

  # Loop through the detected hands to visualize.
  for idx in range(len(hand_landmarks_list)):
    hand_landmarks = hand_landmarks_list[idx]
    handedness = handedness_list[idx]

    # Draw the hand landmarks.
    hand_landmarks_proto = landmark_pb2.NormalizedLandmarkList()
    hand_landmarks_proto.landmark.extend([
      landmark_pb2.NormalizedLandmark(x=landmark.x, y=landmark.y, z=landmark.z) for landmark in hand_landmarks
    ])
    solutions.drawing_utils.draw_landmarks(
      annotated_image,
      hand_landmarks_proto,
      solutions.hands.HAND_CONNECTIONS,
      solutions.drawing_styles.get_default_hand_landmarks_style(),
      solutions.drawing_styles.get_default_hand_connections_style())

    # Get the top left corner of the detected hand's bounding box.
    height, width, _ = annotated_image.shape
    x_coordinates = [landmark.x for landmark in hand_landmarks]
    y_coordinates = [landmark.y for landmark in hand_landmarks]
    text_x = int(min(x_coordinates) * width)
    text_y = int(min(y_coordinates) * height) - MARGIN

    # Draw handedness (left or right hand) on the image.
    cv2.putText(annotated_image, f"{handedness[0].category_name}",
                (text_x, text_y), cv2.FONT_HERSHEY_DUPLEX,
                FONT_SIZE, HANDEDNESS_TEXT_COLOR, FONT_THICKNESS, cv2.LINE_AA)

  return annotated_image

## Download test image

Let's grab a test image that we'll use later. The image is from [Unsplash](https://unsplash.com/photos/mt2fyrdXxzk).

In [6]:
!wget -q -O image.jpg https://storage.googleapis.com/mediapipe-tasks/hand_landmarker/woman_hands.jpg

import cv2
from google.colab.patches import cv2_imshow

#img = cv2.imread("image.jpg")
#cv2_imshow(img)

Optionally, you can upload your own image. If you want to do so, uncomment and run the cell below.

In [4]:
from google.colab import files
uploaded = files.upload()

for filename in uploaded:
  content = uploaded[filename]
  with open(filename, 'wb') as f:
    f.write(content)

if len(uploaded.keys()):
  IMAGE_FILE = next(iter(uploaded))
  print('Uploaded file:', IMAGE_FILE)

Saving WIN_20250215_10_22_20_Pro.jpg to WIN_20250215_10_22_20_Pro.jpg
Uploaded file: WIN_20250215_10_22_20_Pro.jpg


## Running inference and visualizing the results

Here are the steps to run hand landmark detection using MediaPipe.

Check out the [MediaPipe documentation](https://developers.google.com/mediapipe/solutions/vision/hand_landmarker/python) to learn more about configuration options that this solution supports.


In [21]:
# STEP 1: Import the necessary modules.
import mediapipe as mp
from mediapipe.tasks import python
from mediapipe.tasks.python import vision

# STEP 2: Create an HandLandmarker object.
base_options = python.BaseOptions(model_asset_path='hand_landmarker.task')
options = vision.HandLandmarkerOptions(base_options=base_options,
                                       num_hands=2)
detector = vision.HandLandmarker.create_from_options(options)

# STEP 3: Load the input image.
image = mp.Image.create_from_file("WIN_20250215_10_33_32_Pro.mp4")

# STEP 4: Detect hand landmarks from the input image.
detection_result = detector.detect(image)

# STEP 5: Process the classification result. In this case, visualize it.
annotated_image = draw_landmarks_on_image(image.numpy_view(), detection_result)
cv2_imshow(cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))
with open('image.jpg', 'w'):
  cv2.imwrite('image.jpg', cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR))
  print('Saved image')

RuntimeError: Image decoding failed (unknown image type): WIN_20250215_10_33_32_Pro.mp4

In [27]:
import mediapipe as mp

BaseOptions = mp.tasks.BaseOptions
HandLandmarker = mp.tasks.vision.HandLandmarker
HandLandmarkerOptions = mp.tasks.vision.HandLandmarkerOptions
VisionRunningMode = mp.tasks.vision.RunningMode

# Create a hand landmarker instance with the video mode:
options = HandLandmarkerOptions(
    base_options=BaseOptions(model_asset_path='hand_landmarker.task'),
    running_mode=VisionRunningMode.VIDEO)

with HandLandmarker.create_from_options(options) as landmarker:
  # Use OpenCV’s VideoCapture to load the input video.
  cap = cv2.VideoCapture("WIN_20250215_10_33_32_Pro.mp4")

  # Load the frame rate of the video using OpenCV’s CV_CAP_PROP_FPS
  # You’ll need it to calculate the timestamp for each frame.
  numpy_frame_from_opencv = cap.get(cv2.CAP_PROP_FPS)


  # Loop through each frame in the video using VideoCapture#read()
  frame_count = 0
  fps = cap.get(cv2.CAP_PROP_FPS)
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Use appropriate codec
  out = cv2.VideoWriter('output.mp4', fourcc, fps, (width, height))
  while True:
    success, frame = cap.read()

    if not success:
      break
    timestamp_ms = frame_count / fps * 1000
    print(f"Frame {frame_count}: Timestamp {timestamp_ms} ms")

    frame_count += 1
  # Convert the frame received from OpenCV to a MediaPipe’s Image object.
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)

  # Perform hand landmarks detection on the provided single image.
  # The hand landmarker must be created with the video mode.
    hand_landmarker_result = landmarker.detect_for_video(mp_image, int(timestamp_ms))

  # Draw the hand landmarks detection result on the frame.
    annotated_image = draw_landmarks_on_image(frame, hand_landmarker_result)
    out.write(annotated_image)
  # Save frame to output file
  out.release()
  # Release the VideoCapture object
  cap.release()


Frame 0: Timestamp 0.0 ms
Frame 1: Timestamp 33.416753022452504 ms
Frame 2: Timestamp 66.83350604490501 ms
Frame 3: Timestamp 100.25025906735752 ms
Frame 4: Timestamp 133.66701208981002 ms
Frame 5: Timestamp 167.08376511226254 ms
Frame 6: Timestamp 200.50051813471504 ms
Frame 7: Timestamp 233.91727115716753 ms
Frame 8: Timestamp 267.33402417962003 ms
Frame 9: Timestamp 300.7507772020725 ms
Frame 10: Timestamp 334.1675302245251 ms
Frame 11: Timestamp 367.5842832469776 ms
Frame 12: Timestamp 401.0010362694301 ms
Frame 13: Timestamp 434.41778929188257 ms
Frame 14: Timestamp 467.83454231433507 ms
Frame 15: Timestamp 501.25129533678756 ms
Frame 16: Timestamp 534.6680483592401 ms
Frame 17: Timestamp 568.0848013816926 ms
Frame 18: Timestamp 601.501554404145 ms
Frame 19: Timestamp 634.9183074265976 ms
Frame 20: Timestamp 668.3350604490502 ms
Frame 21: Timestamp 701.7518134715026 ms
Frame 22: Timestamp 735.1685664939552 ms
Frame 23: Timestamp 768.5853195164076 ms
Frame 24: Timestamp 802.0020725

In [29]:
import cv2
import time
# Kezdeti időmérés
start_time = time.time()

with HandLandmarker.create_from_options(options) as landmarker:
  # Use OpenCV’s VideoCapture to load the input video.
  cap = cv2.VideoCapture("WIN_20250215_10_33_32_Pro.mp4")

  # Load the frame rate of the video using OpenCV’s CV_CAP_PROP_FPS
  # You’ll need it to calculate the timestamp for each frame.
  numpy_frame_from_opencv = cap.get(cv2.CAP_PROP_FPS)


  # Loop through each frame in the video using VideoCapture#read()
  frame_count = 0
  fps = cap.get(cv2.CAP_PROP_FPS)
  width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
  height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
  fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Use appropriate codec
  out = cv2.VideoWriter('output.mp4', fourcc, fps, (width, height))
  while True:
    success, frame = cap.read()

    if not success:
      break
    timestamp_ms = frame_count / fps * 1000
    print(f"Frame {frame_count}: Timestamp {timestamp_ms} ms")

    frame_count += 1
  # Convert the frame received from OpenCV to a MediaPipe’s Image object.
    mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=frame)

  # Perform hand landmarks detection on the provided single image.
  # The hand landmarker must be created with the video mode.
    hand_landmarker_result = landmarker.detect_for_video(mp_image, int(timestamp_ms))

  # Draw the hand landmarks detection result on the frame.
    annotated_image = draw_landmarks_on_image(frame, hand_landmarker_result)

# Idő elteltének kiszámítása
elapsed_time = time.time() - start_time
# FPS kiszámítása
fps = frame_count / elapsed_time
# Eredmények kiíratása
print(f'Elapsed time: {elapsed_time:.2f} seconds')
print(f'Processed frames: {frame_count}')
print(f'FPS: {fps:.2f}')
# Videofájl bezárása
cap.release()

# RESULT:
# Elapsed time: 6.83 seconds
# Processed frames: 193
# FPS: 28.27

Frame 0: Timestamp 0.0 ms
Frame 1: Timestamp 33.416753022452504 ms
Frame 2: Timestamp 66.83350604490501 ms
Frame 3: Timestamp 100.25025906735752 ms
Frame 4: Timestamp 133.66701208981002 ms
Frame 5: Timestamp 167.08376511226254 ms
Frame 6: Timestamp 200.50051813471504 ms
Frame 7: Timestamp 233.91727115716753 ms
Frame 8: Timestamp 267.33402417962003 ms
Frame 9: Timestamp 300.7507772020725 ms
Frame 10: Timestamp 334.1675302245251 ms
Frame 11: Timestamp 367.5842832469776 ms
Frame 12: Timestamp 401.0010362694301 ms
Frame 13: Timestamp 434.41778929188257 ms
Frame 14: Timestamp 467.83454231433507 ms
Frame 15: Timestamp 501.25129533678756 ms
Frame 16: Timestamp 534.6680483592401 ms
Frame 17: Timestamp 568.0848013816926 ms
Frame 18: Timestamp 601.501554404145 ms
Frame 19: Timestamp 634.9183074265976 ms
Frame 20: Timestamp 668.3350604490502 ms
Frame 21: Timestamp 701.7518134715026 ms
Frame 22: Timestamp 735.1685664939552 ms
Frame 23: Timestamp 768.5853195164076 ms
Frame 24: Timestamp 802.0020725