<a href="https://colab.research.google.com/github/dkiran100/Automatic-hand-tracking/blob/main/Hand_tracking_mediapipe_ultralytics_sam2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install ultralytics torch torchvision torchaudio opencv-python numpy matplotlib mediapipe


Collecting ultralytics
  Downloading ultralytics-8.3.70-py3-none-any.whl.metadata (35 kB)
Collecting mediapipe
  Downloading mediapipe-0.10.20-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downl

In [2]:
import cv2
import torch
import numpy as np
import mediapipe as mp
from ultralytics import SAM

# Load SAM 2 model
model = SAM("sam2_b.pt")

# Initialize MediaPipe Hands
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2, min_detection_confidence=0.5)
mp_draw = mp.solutions.drawing_utils

# Load video
video_path = "/content/drive/MyDrive/VLM/test.mp4"
cap = cv2.VideoCapture(video_path)

# Get video properties
fps = int(cap.get(cv2.CAP_PROP_FPS))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# Define output writer
output_path = "/content/drive/MyDrive/VLM/output_video.mp4"
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

frame_number = 0
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    frame_number += 1
    print(f"Processing frame {frame_number}/{frame_count}...")

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results_hands = hands.process(rgb_frame)

    hand_bboxes = []
    if results_hands.multi_hand_landmarks:
        for hand_landmarks in results_hands.multi_hand_landmarks:
            x_min, y_min, x_max, y_max = width, height, 0, 0
            for landmark in hand_landmarks.landmark:
                x, y = int(landmark.x * width), int(landmark.y * height)
                x_min, y_min = min(x_min, x), min(y_min, y)
                x_max, y_max = max(x_max, x), max(y_max, y)

            x_min, y_min = max(x_min - 10, 0), max(y_min - 10, 0)
            x_max, y_max = min(x_max + 10, width), min(y_max + 10, height)
            hand_bboxes.append((x_min, y_min, x_max, y_max))

    mask = np.zeros((height, width), dtype=np.uint8)

    if hand_bboxes:
        for (x_min, y_min, x_max, y_max) in hand_bboxes:
            hand_roi = rgb_frame[y_min:y_max, x_min:x_max]
            if hand_roi.shape[0] > 0 and hand_roi.shape[1] > 0:
                results = model(hand_roi)
                if results and hasattr(results[0], "masks") and results[0].masks is not None:
                    sam_mask = results[0].masks.data.cpu().numpy()
                    if sam_mask.shape[0] > 0:
                        mask_resized = cv2.resize(
                            (sam_mask[0] * 255).astype(np.uint8),
                            (x_max - x_min, y_max - y_min),
                            interpolation=cv2.INTER_NEAREST,
                        )
                        mask[y_min:y_max, x_min:x_max] = mask_resized

    if mask.any():
        overlay = cv2.applyColorMap(mask, cv2.COLORMAP_JET)
        blended_frame = cv2.addWeighted(frame, 0.5, overlay, 0.5, 0)
    else:
        blended_frame = frame.copy()

    for (x_min, y_min, x_max, y_max) in hand_bboxes:
        cv2.rectangle(blended_frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)

    if results_hands.multi_hand_landmarks:
        for hand_landmarks in results_hands.multi_hand_landmarks:
            mp_draw.draw_landmarks(blended_frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

    out.write(blended_frame)

cap.release()
out.release()
cv2.destroyAllWindows()

print(f"Processing complete! Output video saved at: {output_path}")


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/sam2_b.pt to 'sam2_b.pt'...


100%|██████████| 154M/154M [00:04<00:00, 38.0MB/s]


Processing frame 1/210...

0: 1024x1024 1 0, 1 1, 1 2, 1 3, 5809.0ms
Speed: 89.6ms preprocess, 5809.0ms inference, 0.7ms postprocess per image at shape (1, 3, 1024, 1024)

0: 1024x1024 1 0, 1 1, 1 2, 1 3, 1 4, 1681.8ms
Speed: 9.1ms preprocess, 1681.8ms inference, 0.5ms postprocess per image at shape (1, 3, 1024, 1024)
Processing frame 2/210...

0: 1024x1024 1 0, 1 1, 1 2, 1 3, 1706.1ms
Speed: 8.3ms preprocess, 1706.1ms inference, 0.5ms postprocess per image at shape (1, 3, 1024, 1024)

0: 1024x1024 1 0, 1 1, 1 2, 1 3, 1 4, 1691.2ms
Speed: 5.4ms preprocess, 1691.2ms inference, 0.5ms postprocess per image at shape (1, 3, 1024, 1024)
Processing frame 3/210...

0: 1024x1024 1 0, 1 1, 1 2, 1 3, 1708.5ms
Speed: 6.2ms preprocess, 1708.5ms inference, 0.5ms postprocess per image at shape (1, 3, 1024, 1024)

0: 1024x1024 1 0, 1 1, 1 2, 1 3, 1685.5ms
Speed: 5.4ms preprocess, 1685.5ms inference, 0.5ms postprocess per image at shape (1, 3, 1024, 1024)
Processing frame 4/210...

0: 1024x1024 1 0, 1 