In [6]:
import cv2
import mediapipe as mp
from mediapipe.tasks.python.vision.gesture_recognizer import GestureRecognizerResult
from mediapipe.tasks.python.vision.pose_landmarker import PoseLandmarkerResult
import numpy as np
import math
import time

# os is only needed to force Qt's "xcb" platform plugin, which Ilka's Linux setup requires :)
import os
os.environ["QT_QPA_PLATFORM"] = "xcb"

# --- Tunable settings ---------------------------------------------------
threshold_mask = True         # binarise the pose segmentation mask (cut at 0.5)
fps = 10                      # frame rate requested from the camera
frame_scale = 0 # in percentage, 0 for deactivation
clear_mem_in_process = False  # call clear_memory() periodically inside the main loop

# --- MediaPipe handles and model files ------------------------------------
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
# NOTE(review): `hands` is not referenced by any other code visible in this
# notebook; only `mp_hands.HandLandmark` is used - confirm it is still needed.
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
)
model_path_gesture = './gesture_recognizer.task'
model_path_pose = './pose_landmarker_full.task'

# --- Menu / brush configuration -------------------------------------------
colors = [(0, 0, 255), (0, 255, 0), (255, 0, 0), (0, 255, 255)]  # Red, Green, Blue, Yellow
layers = ["BACK", "FRONT"]
FONT = cv2.FONT_HERSHEY_SIMPLEX
menu_width = 100              # width in pixels of the menu strip on each side
menu_dist_y = 0               # recomputed by draw_menu() from the frame height
brush_size = 8

# --- Interaction state (updated by the gesture callback) ------------------
current_color = colors[0]
current_layer = 0
drawing = erasing = False
single_finger = True
dominant_hand = None
calibrating = True

# --- Frame geometry and image buffers (filled in once frames arrive) ------
frame_width = frame_height = None
canvas = segment_mask = None
current_frame = gesture_frame = frame_to_save = None
finger_layer = None # layer for colored point when drawing or circle when erasing 

# time.time() of the last T-pose triggered save; 0 means "never"
last_T_pose = 0

I0000 00:00:1736417497.311849  399442 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1736417497.315159  400445 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.0.9-0ubuntu0.2), renderer: Mesa Intel(R) UHD Graphics 620 (WHL GT2)


In [7]:
def draw_menu(frame):
    """Draw the colour palette (left edge) and the layer buttons (right edge) onto ``frame``.

    The frame is modified in place. As a side effect the global
    ``menu_dist_y`` is recomputed from the frame height and the number of
    colours; ``menu_interaction`` uses the same value for hit testing.
    """
    global menu_dist_y
    menu_dist_y = int(frame_height / len(colors) /2)

    # colour palette: one filled circle per colour, centred in the left strip
    palette_x = int(menu_width/2)
    for slot, swatch in enumerate(colors):
        cv2.circle(frame, (palette_x, menu_dist_y + slot * menu_dist_y*2), 30, swatch, -1)

    # layer selector: white circles with a centred label in the right strip
    selector_x = frame_width - int(menu_width/2)
    for slot, label in enumerate(layers):
        button_y = menu_dist_y + slot * menu_dist_y*2
        cv2.circle(frame, (selector_x, button_y), 30, (255, 255, 255), -1)

        label_w, label_h = cv2.getTextSize(label, FONT, 0.6, 1)[0]
        label_origin = (int(selector_x - label_w // 2), int(button_y + label_h // 2))
        cv2.putText(frame, label, label_origin, FONT, 0.6, (0, 0, 0), 1)


def count_visible_fingers(hand_landmarks):
    """Count the fingers whose tip lies above the landmark two indices before it.

    ``hand_landmarks`` is indexed with ``mp_hands.HandLandmark`` values and
    each entry must expose ``.y``. A finger counts as raised when the tip's
    ``y`` is smaller than that reference landmark's ``y``.
    """
    landmark_ids = mp_hands.HandLandmark
    tip_ids = (
        landmark_ids.THUMB_TIP,
        landmark_ids.INDEX_FINGER_TIP,
        landmark_ids.MIDDLE_FINGER_TIP,
        landmark_ids.RING_FINGER_TIP,
        landmark_ids.PINKY_TIP,
    )
    # tip_id - 2 is the landmark the tip is compared against ("base" of the finger)
    return sum(
        1
        for tip_id in tip_ids
        if hand_landmarks[tip_id].y < hand_landmarks[tip_id - 2].y
    )

def calculate_brush_size(hand_landmarks):
    """Return a brush size proportional to the index-to-middle fingertip distance.

    The distance is measured in the landmarks' own ``x``/``y`` units,
    multiplied by 300 and truncated to an int.
    """
    landmark_ids = mp_hands.HandLandmark
    index_pt = hand_landmarks[landmark_ids.INDEX_FINGER_TIP]
    middle_pt = hand_landmarks[landmark_ids.MIDDLE_FINGER_TIP]
    dx = index_pt.x - middle_pt.x
    dy = index_pt.y - middle_pt.y
    finger_gap = math.sqrt(dx**2 + dy**2)
    return int(finger_gap * 300)

def menu_interaction(finger_x, finger_y):
    """Select a colour or layer when the fingertip is inside one of the side menus.

    A fingertip left of ``menu_width`` picks from ``colors``; one right of
    ``frame_width - menu_width`` picks from ``layers``. A button is hit when
    the fingertip is less than 30 px away from its centre vertically.
    """
    global current_color, current_layer

    def first_hit(button_count):
        # Index of the first button whose centre row is within 30 px, else None.
        for slot in range(button_count):
            button_y = menu_dist_y + slot * menu_dist_y*2
            if abs(finger_y - button_y) < 30:
                return slot
        return None

    # color menu (left strip)
    if finger_x < menu_width:
        color_slot = first_hit(len(colors))
        if color_slot is not None:
            current_color = colors[color_slot]

    # layer menu (right strip)
    if finger_x > frame_width - menu_width:
        layer_slot = first_hit(len(layers))
        if layer_slot is not None:
            current_layer = layer_slot

def detect_dominant_hand(result):
    """Return the handedness label of the first hand showing "Open_Palm", else None.

    When such a hand is found the global ``calibrating`` flag is cleared so
    the caller leaves calibration mode.
    """
    global calibrating
    if not (result.hand_landmarks and result.handedness and result.gestures):
        return None

    for hand_idx, _ in enumerate(result.hand_landmarks):
        hand_label = result.handedness[hand_idx][0].category_name
        if result.gestures[hand_idx][0].category_name != "Open_Palm":
            continue
        calibrating = False
        return hand_label
    return None

def process_dominant_hand_interaction(frame, hand_landmarks, gesture):
    """Apply the dominant hand's gesture to the drawing state and the active canvas layer.

    Gesture mapping:
        "Pointing_Up"  - paint filled dots (radius 8) at the index fingertip
        "Victory"      - paint a thin bar spanning index and middle fingertip
        "Closed_Fist"  - erase a disc centred between wrist and middle-finger MCP
        anything else  - only move the cursor dot ("Open_Palm" is also logged)

    Nothing is painted or erased while the index fingertip is inside the left
    or right menu strip; there the fingertip only operates the menus.

    Args:
        frame: camera frame of the current result; unused, kept so the
            signature matches the existing call site.
        hand_landmarks: landmarks indexable by ``mp_hands.HandLandmark`` whose
            entries expose ``.x``/``.y`` (scaled here by the frame size).
        gesture: gesture category name for this hand.
    """
    global drawing, erasing, single_finger, brush_size, finger_layer

    # reset finger layer (cursor dot / eraser outline of this result only)
    finger_layer = np.zeros((frame_height, frame_width, 3), dtype=np.uint8)

    drawing = False
    erasing = False
    single_finger = False
    if gesture == "Open_Palm":
        print('open palm')
    elif gesture == "Pointing_Up":
        drawing = True
        single_finger = True
        brush_size = 8
        print('pointing up')
    elif gesture == "Victory":
        drawing = True
        brush_size = calculate_brush_size(hand_landmarks)
        print('victory')
    elif gesture == "Closed_Fist":
        erasing = True

    # Get index finger coordinates in pixels
    index_finger = hand_landmarks[mp_hands.HandLandmark.INDEX_FINGER_TIP]
    x, y = int(index_finger.x * frame_width), int(index_finger.y * frame_height)

    # May change current_color / current_layer before anything is painted.
    menu_interaction(x, y)

    # The fingertip is in a menu when it is in EITHER side strip. The former
    # `and` could never be true, so strokes were painted underneath the menus.
    in_menu = x < menu_width or x > frame_width - menu_width

    # Snapshot the shared list; it is None until the layers have been created.
    layer_images = canvas
    if not in_menu and layer_images is not None:
        target = layer_images[current_layer]
        if drawing:
            if single_finger:
                cv2.circle(target, (x, y), brush_size, current_color, -1)
            else:
                middle_finger = hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_TIP]
                _draw_finger_bar(target, index_finger, middle_finger, current_color)
        if erasing:
            _erase_under_palm(target, finger_layer, hand_landmarks)

    # Show index finger color
    if not erasing:
        cv2.circle(finger_layer, (x, y), 10, current_color, -1)


def _draw_finger_bar(target, index_finger, middle_finger, color):
    """Fill a 10 px thick rectangle reaching from the index to the middle fingertip."""
    x1 = int(index_finger.x * frame_width)
    y1 = int(index_finger.y * frame_height)
    x2 = int(middle_finger.x * frame_width)
    y2 = int(middle_finger.y * frame_height)

    center_x = (x1 + x2) // 2
    center_y = (y1 + y2) // 2

    # Orientation and length of the segment between the two fingertips
    angle = math.atan2(y2 - y1, x2 - x1)
    rect_length = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
    rect_height = 10  # Small fixed height

    # Axis-aligned rectangle centred on the origin ...
    half_length, half_height = rect_length / 2, rect_height / 2
    corners = np.array([
        [-half_length, -half_height],
        [half_length, -half_height],
        [half_length, half_height],
        [-half_length, half_height]
    ], dtype=np.float32)

    # ... rotated to the finger direction and moved to the midpoint.
    cos_angle = math.cos(angle)
    sin_angle = math.sin(angle)
    rotation = np.array([
        [cos_angle, -sin_angle],
        [sin_angle, cos_angle]
    ])
    final_points = ((rotation @ corners.T).T + [center_x, center_y]).astype(np.int32)

    cv2.fillPoly(target, [final_points], color)


def _erase_under_palm(target, overlay, hand_landmarks):
    """Black out a disc at the palm centre on ``target`` and outline it on ``overlay``."""
    wrist = hand_landmarks[mp_hands.HandLandmark.WRIST]
    wrist_x, wrist_y = int(wrist.x * frame_width), int(wrist.y * frame_height)
    middle_finger_mcp = hand_landmarks[mp_hands.HandLandmark.MIDDLE_FINGER_MCP]
    mcp_x, mcp_y = int(middle_finger_mcp.x * frame_width), int(middle_finger_mcp.y * frame_height)

    # Palm centre = midpoint wrist <-> middle-finger MCP; radius = half their distance
    middle_x = int((wrist_x + mcp_x) /2)
    middle_y = int((wrist_y + mcp_y) /2)
    eraser_size = int(math.sqrt((wrist_x - mcp_x)**2 + (wrist_y - mcp_y)**2) /2)

    # erase
    cv2.circle(target, (middle_x, middle_y), eraser_size, (0,0,0), -1)
    # show circle of eraser
    cv2.circle(overlay, (middle_x, middle_y), eraser_size, (255,255,255), 2)

def callback_gesture(result: GestureRecognizerResult, output_image: mp.Image, timestamp_ms: int):
    """Result callback of the gesture recognizer: calibrate or paint, then publish a frame.

    While ``calibrating`` is set, a hint is drawn and the hand showing an open
    palm becomes ``dominant_hand``. Afterwards every detected hand whose
    handedness matches ``dominant_hand`` is forwarded to
    ``process_dominant_hand_interaction``. Finally all canvas layers are added
    onto a copy of ``current_frame`` and the result is stored in the global
    ``gesture_frame``.

    Args:
        result: recognizer output (``gestures``, ``handedness``, ``hand_landmarks``).
        output_image: image the result belongs to; unused.
        timestamp_ms: timestamp of that image; unused.
    """
    global calibrating, dominant_hand, canvas, frame_width, frame_height, gesture_frame

    source_frame = current_frame
    if source_frame is None:
        # No camera frame has been published yet, nothing to annotate.
        return
    frame = source_frame.copy()

    # Work on one local reference: the global may be reassigned by other code
    # while this callback is running.
    layer_images = canvas
    if layer_images is None:
        frame_height, frame_width, _ = frame.shape
        layer_images = [
            np.zeros((frame_height, frame_width, 3), dtype=np.uint8)
            for _ in layers
        ]
        canvas = layer_images

    if calibrating:
        cv2.putText(frame, "Show your dominant hand", (menu_width, 50),
                    FONT, 1, (0, 255, 255), 2)
        if result.gestures:
            dominant_hand = detect_dominant_hand(result)
            if dominant_hand:
                print('dominant hand set to', dominant_hand)
    elif result.hand_landmarks and result.handedness and result.gestures:  # paint
        # zip() stops at the shortest list, so a hand without a gesture or
        # handedness entry is skipped instead of raising IndexError.
        for hand_landmarks, hand_handedness, hand_gestures in zip(
                result.hand_landmarks, result.handedness, result.gestures):
            if not hand_handedness or not hand_gestures:
                continue
            if hand_handedness[0].category_name == dominant_hand:
                process_dominant_hand_interaction(
                    frame, hand_landmarks, hand_gestures[0].category_name)

    # Combine all canvas layers with frame
    for layer_image in layer_images:
        frame = cv2.addWeighted(frame, 1.0, layer_image, 1.0, 0)

    gesture_frame = frame


def callback_pose(result: PoseLandmarkerResult, output_image: mp.Image, timestamp_ms: int):
    """Result callback of the pose landmarker: update the person mask and save on a T-pose.

    The first segmentation mask (optionally thresholded at 0.5) is stored in
    the global ``segment_mask``; it is cleared when no pose or no mask is
    available. If both arms are held out horizontally (T-pose) and the last
    save is more than 5 seconds ago, ``frame_to_save`` is written to
    ``drawing-<unix time>.png`` and, on success, all canvas layers are wiped.

    Args:
        result: landmarker output (``pose_landmarks``, ``segmentation_masks``).
        output_image: image the result belongs to; unused.
        timestamp_ms: timestamp of that image; unused.
    """
    global last_T_pose, segment_mask
    if not result.pose_landmarks:
        # Nobody in view: drop the previous mask so a stale silhouette is not
        # cut out of later frames.
        segment_mask = None
        return

    landmarks = result.pose_landmarks[0]
    if result.segmentation_masks:
        person_mask = np.array(result.segmentation_masks[0].numpy_view())
        if threshold_mask:
            _, person_mask = cv2.threshold(person_mask, 0.5, 1, cv2.THRESH_BINARY)
        segment_mask = person_mask
    else:
        segment_mask = None

    left_shoulder = landmarks[11]
    right_shoulder = landmarks[12]
    left_wrist = landmarks[15]
    right_wrist = landmarks[16]

    Y_THRESHOLD = 0.1  # max vertical shoulder-wrist offset for a "horizontal" arm
    X_THRESHOLD = 0.2  # min horizontal shoulder-wrist offset for an "extended" arm

    is_t_pose = (
        abs(left_shoulder.y - left_wrist.y) < Y_THRESHOLD and
        abs(right_shoulder.y - right_wrist.y) < Y_THRESHOLD and
        abs(left_shoulder.x - left_wrist.x) > X_THRESHOLD and
        abs(right_shoulder.x - right_wrist.x) > X_THRESHOLD
    )

    # you always have at least 5 seconds between the savings
    snapshot = frame_to_save
    now = time.time()
    if is_t_pose and (now - last_T_pose) > 5 and snapshot is not None:
        filename = 'drawing-' + str(int(now)) + '.png'
        # Start the cool-down even if writing fails, so a failing save is not
        # retried on every result while the pose is held.
        last_T_pose = now
        if cv2.imwrite(filename, snapshot):
            print('image saved as ' + filename)
            # Reset the canvas IN PLACE. Rebinding the global to None here
            # could make `canvas[...]` fail in code that already checked it.
            layer_images = canvas
            if layer_images is not None:
                for layer_image in layer_images:
                    layer_image.fill(0)
        else:
            # Keep the drawing when it could not be written to disk.
            print('could not save image as ' + filename)

# Optimizations
# Frame Processing Rate Control
def limit_frame_rate(cap, target_fps=30):
    """Request ``target_fps`` from the capture device and return the frame period in seconds."""
    cap.set(cv2.CAP_PROP_FPS, target_fps)
    seconds_per_frame = 1.0 / target_fps
    return seconds_per_frame

# Frame Resizing
def resize_frame(frame, scale_percent=50):
    """Return ``frame`` resized to ``scale_percent`` percent of its width and height."""
    source_height, source_width = frame.shape[0], frame.shape[1]
    target_size = (
        int(source_width * scale_percent / 100),
        int(source_height * scale_percent / 100),
    )
    # cv2.resize receives the size as (width, height)
    return cv2.resize(frame, target_size)

# Memory Management
def clear_memory():
    """Close all OpenCV windows and release the notebook's ``hands`` solution object.

    Safe to call repeatedly: once ``hands`` has been closed the global is set
    to None and later calls skip it.
    """
    global hands
    module_globals = globals()

    if 'cv2' in module_globals:
        cv2.destroyAllWindows()

    # Close the Hands instance that already exists. The previous code built a
    # brand-new Hands() and closed that one right away, which left the
    # existing instance open.
    existing_hands = module_globals.get('hands')
    if existing_hands is not None:
        hands = None  # drop the reference first so close() is never attempted twice
        existing_hands.close()

W0000 00:00:1736417497.429988  400438 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


In [8]:
# Set up gesture recognizer and pose landmarker; both deliver results through callbacks
options_gesture = mp.tasks.vision.GestureRecognizerOptions(
    base_options=mp.tasks.BaseOptions(model_asset_path=model_path_gesture),
    running_mode=mp.tasks.vision.RunningMode.LIVE_STREAM,
    result_callback=callback_gesture
)
options_pose = mp.tasks.vision.PoseLandmarkerOptions(
    base_options=mp.tasks.BaseOptions(model_asset_path=model_path_pose),
    running_mode=mp.tasks.vision.RunningMode.LIVE_STREAM,
    result_callback=callback_pose,
    output_segmentation_masks=True
)

recognizer_gesture = mp.tasks.vision.GestureRecognizer.create_from_options(options_gesture)
pose_landmarker = mp.tasks.vision.PoseLandmarker.create_from_options(options_pose)

# Start camera
cap = cv2.VideoCapture(0)
frame_delay = limit_frame_rate(cap, target_fps=fps)  # Reduce FPS

# Interval for the optional in-loop clean-up (only used if clear_mem_in_process)
MEMORY_CLEANUP_INTERVAL_MS = 30000
last_cleanup_ms = int(time.time() * 1000)

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Resize frame to reduce processing load
        if frame_scale:
            frame = resize_frame(frame, scale_percent=frame_scale)
        frame = cv2.flip(frame, 1)

        # MediaPipe receives RGB; the BGR copy is what callback_gesture annotates
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        current_frame = frame.copy()

        mp_image = mp.Image(
            image_format=mp.ImageFormat.SRGB,
            data=rgb_frame
        )

        current_timestamp_ms = int(time.time() * 1000)

        # Queue the frame for both tasks; results arrive later in the callbacks
        recognizer_gesture.recognize_async(mp_image, current_timestamp_ms)
        pose_landmarker.detect_async(mp_image, current_timestamp_ms)

        # Read every callback-written global exactly once per iteration so a
        # value checked for None cannot change before it is used.
        annotated_frame = gesture_frame
        layer_images = canvas
        person_mask = segment_mask
        cursor_layer = finger_layer

        if annotated_frame is not None and layer_images is not None:
            composite_frame = annotated_frame.copy()
            # Add back layer
            composite_frame = cv2.addWeighted(composite_frame, 1.0, layer_images[0], 1.0, 0)

            # Add masked camera frame (person) so the person covers the back layer
            if person_mask is not None:
                mask_3channel = (person_mask * 255).astype(np.uint8)
                mask_3channel = cv2.cvtColor(mask_3channel, cv2.COLOR_GRAY2BGR)

                inv_mask = cv2.bitwise_not(mask_3channel)
                masked_frame = cv2.bitwise_and(frame, mask_3channel)
                masked_composite = cv2.bitwise_and(composite_frame, inv_mask)
                composite_frame = cv2.add(masked_composite, masked_frame)

            # Add front layer
            composite_frame = cv2.addWeighted(composite_frame, 1.0, layer_images[1], 1.0, 0)
            # Keep a private copy for saving: draw_menu() below paints in place
            # and would otherwise put the menu into the saved image whenever
            # no finger layer exists yet.
            frame_to_save = composite_frame.copy()

            # Add 'finger layer'
            if cursor_layer is not None:
                composite_frame = cv2.addWeighted(composite_frame, 1.0, cursor_layer, 1.0, 0)

            draw_menu(composite_frame)
            cv2.imshow("Interactive Paint", composite_frame)
        else:
            cv2.imshow("Interactive Paint", frame)

        # ESC quits
        if cv2.waitKey(int(frame_delay * 1000)) & 0xFF == 27:
            break

        # Clear memory periodically. Compare elapsed time: a millisecond
        # timestamp is almost never an exact multiple of the interval.
        if clear_mem_in_process and current_timestamp_ms - last_cleanup_ms >= MEMORY_CLEANUP_INTERVAL_MS:
            clear_memory()
            last_cleanup_ms = current_timestamp_ms

finally:
    cap.release()
    cv2.destroyAllWindows()
    pose_landmarker.close()
    recognizer_gesture.close()
    clear_memory()

W0000 00:00:1736417497.507217  400437 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
I0000 00:00:1736417497.513314  399442 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1736417497.516680  400449 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.0.9-0ubuntu0.2), renderer: Mesa Intel(R) UHD Graphics 620 (WHL GT2)
W0000 00:00:1736417497.518462  399442 gesture_recognizer_graph.cc:129] Hand Gesture Recognizer contains CPU only ops. Sets HandGestureRecognizerGraph acceleration to Xnnpack.
I0000 00:00:1736417497.522536  399442 hand_gesture_recognizer_graph.cc:250] Custom gesture classifier is not defined.
W0000 00:00:1736417497.629963  400455 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736417497.676697  400451 inference_feedback_manager.cc:11

dominant hand set to Left
open palm
open palm
open palm
open palm
open palm
open palm
open palm
open palm
open palm
open palm
open palm
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up
pointing up


I0000 00:00:1736417524.260879  399442 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1736417524.264395  400633 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 24.0.9-0ubuntu0.2), renderer: Mesa Intel(R) UHD Graphics 620 (WHL GT2)
W0000 00:00:1736417524.358784  400624 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1736417524.398020  400627 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
