In [1]:
import torch
import numpy as np
import cv2
from ultralytics import YOLO
import time
import os
import random
import matplotlib.pyplot as plt
from pynput.mouse import Controller
from screeninfo import get_monitors
from collections import deque, Counter
from collections import Counter

In [2]:
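# Dependencies: uncomment the lines below to install them into the running kernel's environment.
# `%pip` (rather than `!pip`) installs into the interpreter this notebook is actually using.
# %pip install pynput
# %pip install torch
# %pip install numpy
# %pip install opencv-python
# %pip install ultralytics
# %pip install matplotlib
# %pip install screeninfo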

In [3]:
# Pick the inference device: use the GPU when torch reports CUDA as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(DEVICE)

cuda


In [4]:
class Gestura:
    """
    Detect hand gestures with a YOLO model and steer the mouse cursor with them.

    Every processed detection is pushed into fixed-length buffers; the drawn
    bounding box, the displayed class and the cursor target are all derived
    from those buffers so that they are smoothed over the most recent
    ``buffer_size`` detections.
    """

    def __init__(self, model_weights, buffer_size=5, scale=0.1, speed=0.3,
                 conf=0.7, iou=0.5, device='cpu', frame_size=(720, 480), see=False,
                 cursor_class_id=18):
        """
        Build the detector, the mouse controller and the smoothing buffers.

        Args:
            model_weights: Passed unchanged to ``YOLO(...)``.
            buffer_size: Maximum number of recent detections kept for smoothing.
            scale: Extra margin factor used when sizing the cursor control area
                (the margin is ``box_size * (1 + scale) / 2`` on each side).
            speed: Fraction of the remaining distance to the target position
                the cursor covers per update (``1.0`` jumps straight to it).
            conf: Confidence threshold forwarded to ``model.predict``.
            iou: IoU threshold forwarded to ``model.predict``.
            device: Device string forwarded to ``model.predict``.
            frame_size: ``(width, height)`` requested from the camera in ``run``.
            see: Stored on the instance; nothing in this class reads it.
            cursor_class_id: Class id whose detection moves the cursor. The
                default, 18, is the value that used to be hard-coded; which
                gesture it denotes depends on the model's ``names`` mapping
                (TODO: confirm against the weights in use).
        """
        self.model = YOLO(model_weights)
        self.confidence = conf
        self.iou_threshold = iou
        self.device = device
        self.frame_size = frame_size
        self.mouse_controller = Controller()
        self.scale_factor = scale
        self.movement_speed = speed
        self.see = see
        self.cursor_class_id = cursor_class_id
        # Filled in by _draw_control_area; defined up front so _move_cursor can
        # safely test it even if no control area has been computed yet.
        self.control_area = None

        self._initialize_buffers(buffer_size)
        self._initialize_monitor_info()

    def _initialize_buffers(self, buffer_size):
        """Create the bounded deques used to smooth positions, boxes and classes."""
        self.buffer = {
            'x': deque(maxlen=buffer_size),
            'y': deque(maxlen=buffer_size),
            'bbox': deque(maxlen=buffer_size),
            'classes': deque(maxlen=buffer_size)
        }

    def _initialize_monitor_info(self):
        """
        Record the primary monitor and its resolution.

        Falls back to the first reported monitor when none is flagged primary.

        Raises:
            RuntimeError: If no monitor is reported at all.
        """
        # Query once: the fallback below would otherwise trigger a second
        # enumeration even when a primary monitor was found.
        monitors = get_monitors()
        if not monitors:
            raise RuntimeError('No monitors detected; cannot map gestures to screen coordinates')

        self.primary_monitor = next((m for m in monitors if m.is_primary), monitors[0])
        self.screen_width = self.primary_monitor.width
        self.screen_height = self.primary_monitor.height

    def predict_hands(self, image):
        """Run the model on ``image`` and return the first element of the prediction list."""
        return self.model.predict(
            image,
            device=self.device,
            conf=self.confidence,
            iou=self.iou_threshold,
            verbose=False
        )[0]

    def process_detections(self, image, detections):
        """
        Feed every detected box into the buffers, draw it and update the cursor.

        Args:
            image: Frame to annotate; it is drawn on in place.
            detections: Iterable of results, each exposing ``boxes.xyxy`` and
                ``boxes.cls`` (as returned by ``predict_hands``).

        Returns:
            The same ``image`` object, annotated.
        """
        for detection in detections:
            bboxes = detection.boxes.xyxy.cpu().int().tolist()
            class_ids = detection.boxes.cls.cpu().int().tolist()

            for bbox, class_id in zip(bboxes, class_ids):
                self._update_buffers(bbox, class_id)
                # Drawing also refreshes self.control_area, which the cursor
                # update below relies on, so the order matters.
                self._draw_detection(image)
                self._control_cursor(class_id)

        return image

    def _update_buffers(self, bbox, class_id):
        """Append a box, its integer centre and its class id to the buffers."""
        x_min, y_min, x_max, y_max = bbox
        center_x, center_y = (x_min + x_max) // 2, (y_min + y_max) // 2

        self.buffer['bbox'].append(bbox)
        self.buffer['classes'].append(class_id)
        self.buffer['x'].append(center_x)
        self.buffer['y'].append(center_y)

    def _draw_detection(self, image):
        """Draw the smoothed box, label, centre point and control area on ``image``."""
        if not self.buffer['bbox']:
            return

        avg_bbox = self._calculate_average_bbox()
        x_min, y_min, x_max, y_max = avg_bbox

        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), (0, 0, 255), 2)

        # Resolve the majority class once and reuse it for both name and id.
        class_id = self._most_common_class()
        class_name = self.model.names[class_id]
        label = f"{class_name} (ID: {class_id})"
        cv2.putText(image, label, (x_min, y_min - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 1)

        center_x, center_y = self._calculate_average_position()
        cv2.circle(image, (center_x, center_y), 3, (0, 0, 255), -1)

        self._draw_control_area(image, avg_bbox)

    def _calculate_average_bbox(self):
        """Return the per-coordinate integer mean ``(x_min, y_min, x_max, y_max)`` of buffered boxes."""
        bboxes = list(self.buffer['bbox'])
        count = len(bboxes)
        # zip(*bboxes) groups all x_min values, all y_min values, and so on.
        return tuple(sum(values) // count for values in zip(*bboxes))

    def _calculate_average_position(self):
        """Return the mean buffered centre as an ``(x, y)`` pair of ints (truncated)."""
        return (int(sum(self.buffer['x']) / len(self.buffer['x'])),
                int(sum(self.buffer['y']) / len(self.buffer['y'])))

    def _most_common_class(self):
        """Return the class id that occurs most often in the class buffer."""
        return Counter(self.buffer['classes']).most_common(1)[0][0]

    def _draw_control_area(self, image, bbox):
        """
        Draw the cursor control rectangle and store its geometry.

        The rectangle is the frame inset on every side by
        ``box_size * (1 + scale_factor) / 2``. Its origin and size are saved in
        ``self.control_area`` for ``_move_cursor``.
        """
        x_min, y_min, x_max, y_max = bbox
        offset_x = int((x_max - x_min) * (1 + self.scale_factor) / 2)
        offset_y = int((y_max - y_min) * (1 + self.scale_factor) / 2)

        control_area_x1 = offset_x
        control_area_y1 = offset_y
        control_area_x2 = image.shape[1] - offset_x
        control_area_y2 = image.shape[0] - offset_y

        cv2.rectangle(image,
                      (control_area_x1, control_area_y1),
                      (control_area_x2, control_area_y2),
                      (0, 255, 0), 2)

        self.control_area = {
            'x1': control_area_x1,
            'y1': control_area_y1,
            # Clamp to 1 so a box larger than the frame cannot produce a
            # zero or negative divisor in _move_cursor.
            'width': max(control_area_x2 - control_area_x1, 1),
            'height': max(control_area_y2 - control_area_y1, 1)
        }

    def _control_cursor(self, class_id):
        """Move the cursor when ``class_id`` is the configured cursor gesture."""
        if class_id == self.cursor_class_id:
            self._move_cursor()

    def _move_cursor(self):
        """
        Move the cursor part of the way towards the smoothed hand position.

        The averaged centre is normalised against the control area, scaled to
        the screen resolution, and the cursor advances ``movement_speed`` of
        the remaining distance. Does nothing until both a position and a
        control area are available.
        """
        if not self.buffer['x'] or not self.control_area:
            return

        avg_x, avg_y = self._calculate_average_position()

        norm_x = (avg_x - self.control_area['x1']) / self.control_area['width']
        norm_y = (avg_y - self.control_area['y1']) / self.control_area['height']

        target_x = norm_x * self.screen_width
        target_y = norm_y * self.screen_height

        current_x, current_y = self.mouse_controller.position
        new_x = current_x + (target_x - current_x) * self.movement_speed
        new_y = current_y + (target_y - current_y) * self.movement_speed

        # Valid pixel coordinates run from 0 to size - 1, so clamp to the last
        # pixel rather than to the size itself, and hand over whole pixels.
        max_x = max(self.screen_width - 1, 0)
        max_y = max(self.screen_height - 1, 0)
        new_x = min(max(int(round(new_x)), 0), max_x)
        new_y = min(max(int(round(new_y)), 0), max_y)

        self.mouse_controller.position = (new_x, new_y)

    def run(self, camera_index=0):
        """
        Capture frames from a camera and process them until 'q' is pressed.

        Each frame is mirrored horizontally, run through the model, annotated
        and shown in a window named 'Hand Detection'. The camera and all
        OpenCV windows are released on exit, including when an error occurs.

        Args:
            camera_index: Index passed to ``cv2.VideoCapture``.

        Raises:
            RuntimeError: If the camera cannot be opened or a frame cannot be read.
        """
        cap = cv2.VideoCapture(camera_index)
        if not cap.isOpened():
            raise RuntimeError('Не удалось открыть камеру')

        cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.frame_size[0])
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.frame_size[1])

        try:
            while True:
                success, frame = cap.read()
                if not success:
                    raise RuntimeError('Не удалось получить кадр с камеры')

                # Mirror the frame so on-screen motion matches the user's hand.
                frame = cv2.flip(frame, 1)
                detections = self.predict_hands(frame)
                processed_frame = self.process_detections(frame, detections)

                cv2.imshow('Hand Detection', processed_frame)

                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            cap.release()
            cv2.destroyAllWindows()

In [None]:
# Use the DEVICE selected earlier in the notebook instead of hard-coding 'cuda',
# so the demo also starts on machines without a usable GPU.
detector = Gestura('YOLOv10n_gestures.pt', conf=0.7, iou=0.5, device=DEVICE,
                   buffer_size=5, scale=0.2, speed=0.5, see=False)
# Blocks until 'q' is pressed in the preview window.
detector.run()