In [10]:
import cv2
import mediapipe as mp
from mediapipe.tasks.python.vision.gesture_recognizer import GestureRecognizerResult
import numpy as np
import math
import time

# --- MediaPipe handles ------------------------------------------------------
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
# NOTE(review): neither `hands` nor `mp_drawing` is referenced by the cells
# visible here (only mp_hands.HandLandmark is) -- confirm before removing.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
)

# Gesture-recognizer model bundle, given as a path relative to the working directory
model_path = './gesture_recognizer.task'
base_options = mp.tasks.BaseOptions(model_asset_path=model_path)

# --- Palette and brush ------------------------------------------------------
colors = [
    (0, 0, 255),    # Red
    (0, 255, 0),    # Green
    (255, 0, 0),    # Blue
    (0, 255, 255),  # Yellow
]
current_color = colors[0]  # drawing starts with the first palette entry
brush_size = 10            # circle radius passed to cv2.circle when painting
menu_width = 100           # x threshold (pixels) of the palette strip on the left

# --- Mutable state shared between the capture loop and the callback ---------
drawing = False
calibrating = True     # True until a dominant hand has been detected
canvas = None          # allocated on the first callback, once the frame size is known
dominant_hand = None   # handedness label chosen during calibration
frame_width = frame_height = None
current_frame = processed_frame = None

I0000 00:00:1736160589.158617  245240 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1736160589.160102  245968 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.0.9-0ubuntu0.2), renderer: Mesa Intel(R) UHD Graphics 620 (WHL GT2)


In [11]:
def draw_menu(frame):
    """Paint the colour palette onto ``frame`` in place.

    One filled circle of radius 30 is drawn per entry of the module-level
    ``colors`` list, stacked vertically at x=50 starting at y=100 with a
    100-pixel step.
    """
    for slot in range(len(colors)):
        center = (50, 100 + slot * 100)
        cv2.circle(frame, center, 30, colors[slot], -1)  # thickness -1 => filled

def count_visible_fingers(hand_landmarks):
    """Return how many of the five fingertips are raised.

    A finger counts as raised when its tip landmark has a smaller ``y`` than
    the landmark two indices before it (``tip - 2``). Image ``y`` grows
    downwards, so "smaller y" means "higher in the frame".

    Args:
        hand_landmarks: indexable sequence of landmarks exposing ``.y``,
            indexed by ``mp_hands.HandLandmark`` values.

    Returns:
        int: number of raised fingers, between 0 and 5.
    """
    landmark = mp_hands.HandLandmark
    tip_ids = (
        landmark.THUMB_TIP,
        landmark.INDEX_FINGER_TIP,
        landmark.MIDDLE_FINGER_TIP,
        landmark.RING_FINGER_TIP,
        landmark.PINKY_TIP,
    )
    # NOTE(review): in MediaPipe's landmark numbering ``tip - 2`` should be the
    # PIP joint for the four fingers and the MCP joint for the thumb -- confirm
    # against the hand-landmark diagram.
    return sum(
        1
        for tip_id in tip_ids
        if hand_landmarks[tip_id].y < hand_landmarks[tip_id - 2].y
    )

def calculate_brush_size(hand_landmarks):
    """Derive a brush radius from the gap between index and middle fingertips.

    The Euclidean distance between the two tips' ``(x, y)`` landmark
    coordinates is scaled by 300 and truncated to an integer.

    Args:
        hand_landmarks: indexable sequence of landmarks exposing ``.x``/``.y``.

    Returns:
        int: brush radius; 0 when the two tips (nearly) coincide.
    """
    tip_index = hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_TIP]
    tip_middle = hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_TIP]
    dx = tip_index.x - tip_middle.x
    dy = tip_index.y - tip_middle.y
    spread = math.sqrt(dx**2 + dy**2)
    return int(spread * 300)

def detect_dominant_hand(result):
    """Pick the dominant hand from the first hand showing an open palm.

    Scans the hands in ``result`` in order. The first one whose top gesture
    category is ``"Open_Palm"`` ends calibration (sets the module-level
    ``calibrating`` flag to ``False``) and its handedness label is returned.

    Args:
        result: recognizer result exposing ``hand_landmarks``, ``handedness``
            and ``gestures`` as per-hand sequences.

    Returns:
        The handedness ``category_name`` of the matching hand, or ``None`` if
        any of the three sequences is empty or no hand shows an open palm.
    """
    global calibrating

    if not (result.hand_landmarks and result.handedness and result.gestures):
        return None

    for idx, _ in enumerate(result.hand_landmarks):
        label = result.handedness[idx][0].category_name
        gesture = result.gestures[idx][0].category_name
        if gesture != "Open_Palm":
            continue
        calibrating = False
        return label

    return None

def process_hand_interaction(frame, hand_landmarks, handedness):
    """Apply one hand's pose to the paint state and draw the result.

    Steps, in order:
      1. Convert the index fingertip to pixel coordinates using the
         module-level ``frame_width`` / ``frame_height``.
      2. Update ``drawing`` and ``brush_size`` from the raised-finger count:
         zero fingers stops drawing, one finger uses radius 10, two or more
         derive the radius from ``calculate_brush_size``.
      3. If the wrist is inside the menu strip, select any palette circle the
         fingertip is within 30 px of; otherwise, if drawing, stamp a filled
         circle onto ``canvas``.
      4. Mark the fingertip on ``frame`` with the current colour.

    Args:
        frame: BGR image annotated in place with the fingertip marker.
        hand_landmarks: indexable landmarks exposing ``.x``/``.y``.
        handedness: label of this hand. NOTE(review): unused in the body.
    """
    global drawing, current_color, brush_size, canvas

    # Index fingertip in pixel space
    fingertip = hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_TIP]
    x = int(fingertip.x * frame_width)
    y = int(fingertip.y * frame_height)

    # Raised-finger count drives both the pen state and the brush radius
    finger_count = count_visible_fingers(hand_landmarks)
    drawing = finger_count != 0
    if finger_count == 1:
        brush_size = 10
    elif finger_count >= 2:
        brush_size = calculate_brush_size(hand_landmarks)

    # NOTE(review): the menu zone is gated on the *wrist* x position while the
    # hit test uses the *index fingertip* -- confirm this is intended.
    wrist_x = int(hand_landmarks[mp_hands.HandLandmark.WRIST].x * frame_width)
    if wrist_x < menu_width:
        for slot, swatch in enumerate(colors):
            swatch_x, swatch_y = 50, 100 + slot * 100
            if math.hypot(x - swatch_x, y - swatch_y) < 30:
                current_color = swatch
    elif drawing:
        cv2.circle(canvas, (x, y), brush_size, current_color, -1)

    # Fingertip cursor, tinted with the active colour
    cv2.circle(frame, (x, y), 10, current_color, -1)

def gesture_callback(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    """Result callback for the gesture recognizer: build the frame to display.

    Works on a copy of the module-level ``current_frame``. On the first call
    it allocates a black ``canvas`` of the same size and records the frame
    dimensions. While calibrating it prompts for the dominant hand and tries
    to detect it; afterwards it forwards every hand whose handedness matches
    ``dominant_hand`` to ``process_hand_interaction``. Finally it blends frame
    and canvas 50/50, draws the palette and publishes the image through
    ``processed_frame``.

    Args:
        result: recognizer output for one frame.
        output_image: image the recognizer processed (unused here).
        timestamp_ms: timestamp supplied with the frame (unused here).
    """
    global calibrating, dominant_hand, canvas, frame_width, frame_height, current_frame, processed_frame

    # NOTE(review): this copies whatever frame the capture loop stored most
    # recently, which may not be the frame `result` was computed from.
    view = current_frame.copy()

    # Lazily size the drawing layer from the first frame seen
    if canvas is None:
        frame_height, frame_width, _channels = view.shape
        canvas = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)

    if calibrating:
        cv2.putText(view, "Show your dominant hand", (10, 50),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 255), 2)
        if result.gestures:
            dominant_hand = detect_dominant_hand(result)
            if dominant_hand:
                print('dominant hand set to', dominant_hand)
    elif result.hand_landmarks and result.handedness:
        for hand_idx, landmarks in enumerate(result.hand_landmarks):
            label = result.handedness[hand_idx][0].category_name
            if label == dominant_hand:
                process_hand_interaction(view, landmarks, label)

    # Overlay the painting on the camera image, then the palette on top
    blended = cv2.addWeighted(view, 0.5, canvas, 0.5, 0)
    draw_menu(blended)

    processed_frame = blended


W0000 00:00:1736160589.196943  245961 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [12]:
# Set up gesture recognizer
options = mp.tasks.vision.GestureRecognizerOptions(
    base_options=base_options,
    running_mode=mp.tasks.vision.RunningMode.LIVE_STREAM,
    result_callback=gesture_callback
)
recognizer = mp.tasks.vision.GestureRecognizer.create_from_options(options)

# Start camera
cap = cv2.VideoCapture(0)

# LIVE_STREAM mode requires monotonically increasing timestamps. Remember the
# last one sent so two frames landing in the same millisecond cannot repeat it.
last_timestamp_ms = -1

# try/finally guarantees the camera, the window and the recognizer are released
# even when the loop is aborted by an exception or a kernel interrupt.
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.flip(frame, 1)  # Horizontal flip for mirror effect
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Store the current frame for the callback to use
        current_frame = frame.copy()

        # Process frame with gesture recognizer. A monotonic clock is used
        # because wall-clock time (time.time) can step backwards.
        mp_image = mp.Image(image_format=mp.ImageFormat.SRGB, data=rgb_frame)
        current_timestamp_ms = max(int(time.monotonic() * 1000), last_timestamp_ms + 1)
        last_timestamp_ms = current_timestamp_ms
        recognizer.recognize_async(mp_image, current_timestamp_ms)

        # Display the processed frame if available, else the raw mirrored frame
        if processed_frame is not None:
            cv2.imshow("Interactive Paint", processed_frame)
        else:
            cv2.imshow("Interactive Paint", frame)

        # Check for key presses
        key = cv2.waitKey(1) & 0xFF
        if key == 27:  # ESC to exit
            break
        elif key == ord('d'):  # Toggle drawing
            # NOTE(review): process_hand_interaction reassigns `drawing` from
            # the finger count on every processed hand, so this toggle only
            # lasts until the next such callback -- confirm intended behaviour.
            drawing = not drawing
finally:
    cap.release()
    cv2.destroyAllWindows()
    recognizer.close()


W0000 00:00:1736160589.218526  245960 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1736160589.224035  245240 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1736160589.225636  245971 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.0.9-0ubuntu0.2), renderer: Mesa Intel(R) UHD Graphics 620 (WHL GT2)
W0000 00:00:1736160589.226145  245240 gesture_recognizer_graph.cc:129] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I0000 00:00:1736160589.227617  245240 hand_gesture_recognizer_graph.cc:250] Custom gesture classifier is not defined.
W0000 00:00:1736160589.270989  245975 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736160589.303505  245973 inference_feedback_manager.cc:11