In [1]:
import torch
import numpy as np
import cv2
from ultralytics import YOLO
import matplotlib.pyplot as plt
from pynput.mouse import Controller, Button
from screeninfo import get_monitors
from collections import deque, Counter
import time

In [8]:
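# Uncomment to install the notebook's dependencies into the kernel's environment
# (%pip targets the running kernel, unlike !pip).
# %pip install pynput
# %pip install torch
# %pip install numpy
# %pip install opencv-python
# %pip install ultralytics
# %pip install matplotlib
# %pip install screeninfo
# %pip install pyyaml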



In [3]:
# Pick the inference device: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(DEVICE)

cpu


## Version 1

In [2]:
class Gestura:
    """Move the mouse cursor with a hand gesture seen by a webcam (version 1).

    Every captured frame is passed through a YOLO model. The centre of each
    detected box is pushed into fixed-length buffers, and whenever the detected
    class id equals ``MOVE_GESTURE_ID`` the cursor is eased towards the screen
    position that corresponds to the buffered average centre.
    """

    # Class id that triggers cursor movement (see `_control_cursor`).
    MOVE_GESTURE_ID = 18

    def __init__(self, model_weights, camera_index, buffer_size=5, scale=0.1, speed=0.3,
                 conf=0.7, iou=0.5, device='cpu', frame_size=(720, 480), see=False):
        """Load the model and prepare smoothing buffers and screen geometry.

        Args:
            model_weights: Weights argument handed to ``YOLO(...)``.
            camera_index: Index handed to ``cv2.VideoCapture`` in `run`.
            buffer_size: Maximum number of recent samples kept per buffer.
            scale: Extra margin factor applied to half the box size when the
                control area is computed.
            speed: Fraction of the remaining distance to the target that the
                cursor travels on each update.
            conf: Confidence threshold forwarded to ``model.predict``.
            iou: IoU threshold forwarded to ``model.predict``.
            device: Device string forwarded to ``model.predict``.
            frame_size: ``(width, height)`` requested from the camera.
            see: When true, show annotated frames in a window; pressing ``q``
                in that window stops `run`.
        """
        self.model = YOLO(model_weights)
        self.camera_index = camera_index
        self.confidence = conf
        self.iou_threshold = iou
        self.device = device
        self.frame_size = frame_size
        self.mouse_controller = Controller()
        self.scale_factor = scale
        self.movement_speed = speed
        self.see = see

        self._initialize_buffers(buffer_size)
        self._initialize_monitor_info()
        self._initialize_control_area()

    def _initialize_buffers(self, buffer_size):
        """Create one bounded deque per tracked quantity (centre x/y, box, class)."""
        self.buffer = {
            key: deque(maxlen=buffer_size)
            for key in ('x', 'y', 'bbox', 'classes')
        }

    def _initialize_monitor_info(self):
        """Record the primary monitor (or the first one listed) and its size."""
        # Enumerate the monitors once and reuse the list for the fallback.
        monitors = get_monitors()
        self.primary_monitor = next((m for m in monitors if m.is_primary), monitors[0])
        self.screen_width = self.primary_monitor.width
        self.screen_height = self.primary_monitor.height

    def _initialize_control_area(self):
        """Initialize control area with default values that will be updated during detection"""
        self.control_area = {'x1': 0, 'y1': 0, 'width': 1, 'height': 1}

    def predict_hands(self, image):
        """Run the model on ``image`` and return the first element of its output."""
        results = self.model.predict(
            image,
            device=self.device,
            conf=self.confidence,
            iou=self.iou_threshold,
            verbose=False,
        )
        return results[0]

    def process_detections(self, image, detections):
        """Update state, optionally annotate ``image``, and drive the cursor.

        Args:
            image: Frame the detections belong to; drawn on in place when
                ``self.see`` is true.
            detections: Iterable of items exposing ``boxes.xyxy`` and
                ``boxes.cls`` (the value returned by `predict_hands`).

        Returns:
            The same ``image`` object.
        """
        for detection in detections:
            bboxes = detection.boxes.xyxy.cpu().int().tolist()
            class_ids = detection.boxes.cls.cpu().int().tolist()

            for bbox, class_id in zip(bboxes, class_ids):
                self._update_buffers(bbox, class_id)
                self._update_control_area(image, bbox)
                if self.see:
                    self._draw_detection(image)
                self._control_cursor(class_id)

        return image

    def _update_buffers(self, bbox, class_id):
        """Append the box, its class id and its integer centre to the buffers."""
        x_min, y_min, x_max, y_max = bbox
        self.buffer['bbox'].append(bbox)
        self.buffer['classes'].append(class_id)
        self.buffer['x'].append((x_min + x_max) // 2)
        self.buffer['y'].append((y_min + y_max) // 2)

    def _update_control_area(self, image, bbox):
        """Calculate and update control area based on current bbox"""
        x_min, y_min, x_max, y_max = bbox
        # Inset the frame on every side by half the box size, enlarged by
        # `scale_factor`.
        offset_x = int((x_max - x_min) * (1 + self.scale_factor) / 2)
        offset_y = int((y_max - y_min) * (1 + self.scale_factor) / 2)
        frame_height, frame_width = image.shape[0], image.shape[1]

        self.control_area = {
            'x1': offset_x,
            'y1': offset_y,
            # Never below 1 so the normalisation in `_move_cursor` cannot
            # divide by zero.
            'width': max(frame_width - 2 * offset_x, 1),
            'height': max(frame_height - 2 * offset_y, 1),
        }

    def _draw_detection(self, image):
        """Draw the averaged box, its label, its centre and the control area."""
        if not self.buffer['bbox']:
            return

        x_min, y_min, x_max, y_max = self._calculate_average_bbox()
        red = (0, 0, 255)
        cv2.rectangle(image, (x_min, y_min), (x_max, y_max), red, 2)

        class_id = self._most_common_class()
        label = f"{self.model.names[class_id]} (ID: {class_id})"
        # Keep the label inside the frame when the box touches the top edge.
        label_y = max(y_min - 10, 15)
        cv2.putText(image, label, (x_min, label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, red, 1)

        cv2.circle(image, self._calculate_average_position(), 3, red, -1)

        self._draw_control_area_visual(image)

    def _draw_control_area_visual(self, image):
        """Draw control area visualization (only for visual purposes)"""
        left, top = self.control_area['x1'], self.control_area['y1']
        right = left + self.control_area['width']
        bottom = top + self.control_area['height']
        cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)

    def _calculate_average_bbox(self):
        """Return the per-coordinate integer mean of the buffered boxes."""
        bboxes = list(self.buffer['bbox'])
        count = len(bboxes)
        return tuple(sum(box[i] for box in bboxes) // count for i in range(4))

    def _calculate_average_position(self):
        """Return the mean buffered centre as an ``(x, y)`` pair of ints."""
        xs, ys = self.buffer['x'], self.buffer['y']
        return (int(sum(xs) / len(xs)), int(sum(ys) / len(ys)))

    def _most_common_class(self):
        """Return the class id that occurs most often in the class buffer."""
        return Counter(self.buffer['classes']).most_common(1)[0][0]

    def _control_cursor(self, class_id):
        """Move the cursor when ``class_id`` is the move gesture."""
        if class_id == self.MOVE_GESTURE_ID:
            self._move_cursor()

    def _move_cursor(self):
        """Ease the cursor towards the screen point matching the averaged centre.

        Does nothing while the position buffer is empty. The new position is
        clamped to the last addressable pixel, ``(width - 1, height - 1)``.
        """
        if not self.buffer['x']:
            return

        avg_x, avg_y = self._calculate_average_position()

        # Position of the hand inside the control area (0..1 when inside it).
        norm_x = (avg_x - self.control_area['x1']) / self.control_area['width']
        norm_y = (avg_y - self.control_area['y1']) / self.control_area['height']

        target_x = norm_x * self.screen_width
        target_y = norm_y * self.screen_height

        # Cover only `movement_speed` of the remaining distance per update.
        current_x, current_y = self.mouse_controller.position
        new_x = current_x + (target_x - current_x) * self.movement_speed
        new_y = current_y + (target_y - current_y) * self.movement_speed

        # Valid pixel coordinates run from 0 to size - 1.
        max_x = max(self.screen_width - 1, 0)
        max_y = max(self.screen_height - 1, 0)
        new_x = min(max(new_x, 0), max_x)
        new_y = min(max(new_y, 0), max_y)

        self.mouse_controller.position = (new_x, new_y)

    def run(self):
        """Capture frames and process them until stopped.

        With ``see`` enabled the loop ends when ``q`` is pressed in the preview
        window; otherwise it runs until an exception (for example
        ``KeyboardInterrupt``) interrupts it.

        Raises:
            RuntimeError: If the camera cannot be opened or a frame cannot be
                read.
        """
        cap = cv2.VideoCapture(self.camera_index)
        if not cap.isOpened():
            raise RuntimeError('Не удалось открыть камеру')

        cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.frame_size[0])
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.frame_size[1])

        try:
            while True:
                success, frame = cap.read()
                if not success:
                    raise RuntimeError('Не удалось получить кадр с камеры')

                # Horizontal flip (flipCode=1); presumably to mirror the view.
                frame = cv2.flip(frame, 1)
                detections = self.predict_hands(frame)
                processed_frame = self.process_detections(frame, detections)

                if not self.see:
                    continue
                cv2.imshow('Hand Detection', processed_frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            cap.release()
            if self.see:
                cv2.destroyAllWindows()

In [3]:
# camera_index=2 is specific to the machine this was run on; adjust as needed.
# Pass the DEVICE selected earlier so inference runs on the GPU when available
# instead of always falling back to the constructor's 'cpu' default.
detector = Gestura('YOLOv10n_gestures.pt', camera_index=2,
                   buffer_size=8, scale=0.2, speed=0.2, device=DEVICE, see=True)
detector.run()

2025-03-25 10:51:50.280 Python[46853:5667456] +[IMKClient subclass]: chose IMKClient_Modern
2025-03-25 10:51:50.280 Python[46853:5667456] +[IMKInputSession subclass]: chose IMKInputSession_Modern


## Version 2

In [None]:
class Gestura:
    """Control the mouse with hand gestures seen by a webcam (version 2).

    Every captured frame is passed through a YOLO model. Detected class ids
    are mapped to cursor movement, a left click, drag-and-move and stopping
    the capture loop, as listed in the class constants below. No preview
    window is shown.
    """

    MOVE_GESTURE_ID = 18                    # move the cursor and re-arm the click
    CLICK_GESTURE_ID = 14                   # one left click until re-armed
    DRAG_GESTURE_IDS = frozenset({28, 29})  # hold the left button and move
    EXIT_GESTURE_ID = 2                     # stop `run`

    def __init__(self, model_weights, camera_index, buffer_size=5, scale=0.1, speed=0.3, conf=0.7, iou=0.5, device='cpu', frame_size=(720, 480)):
        """Load the model and prepare buffers, screen geometry and gesture state.

        Args:
            model_weights: Weights argument handed to ``YOLO(...)``.
            camera_index: Index handed to ``cv2.VideoCapture`` in `run`.
            buffer_size: Maximum number of recent samples kept per buffer.
            scale: Extra margin factor applied to half the box size when the
                control area is computed.
            speed: Fraction of the remaining distance to the target that the
                cursor travels on each update.
            conf: Confidence threshold forwarded to ``model.predict``.
            iou: IoU threshold forwarded to ``model.predict``.
            device: Device string forwarded to ``model.predict``.
            frame_size: ``(width, height)`` requested from the camera.
        """
        self.model = YOLO(model_weights)
        self.camera_index = camera_index
        self.confidence = conf
        self.iou_threshold = iou
        self.device = device
        self.frame_size = frame_size
        self.mouse_controller = Controller()
        self.scale_factor = scale
        self.movement_speed = speed

        self.buffer = {key: deque(maxlen=buffer_size) for key in ('x', 'y', 'bbox', 'classes')}
        # Enumerate the monitors once and reuse the list for the fallback.
        monitors = get_monitors()
        monitor = next((m for m in monitors if m.is_primary), monitors[0])
        self.screen_width, self.screen_height = monitor.width, monitor.height
        self.control_area = {'x1': 0, 'y1': 0, 'width': 1, 'height': 1}

        self.clicked = False   # a click was sent and has not been re-armed yet
        self.dragging = False  # the left button is currently held down
        self.exit = False      # set by the exit gesture to end `run`

    def predict_hands(self, image):
        """Run the model on ``image`` and return the first element of its output."""
        results = self.model.predict(
            image,
            device=self.device,
            conf=self.confidence,
            iou=self.iou_threshold,
            verbose=False,
        )
        return results[0]

    def process_detections(self, detections, image_shape):
        """Update buffers from one frame's detections and perform gesture actions.

        Args:
            detections: Iterable of items exposing ``boxes.xyxy`` and
                ``boxes.cls`` (the value returned by `predict_hands`).
            image_shape: Shape of the frame, indexed as ``(height, width, ...)``.
        """
        ids_detected = set()

        for detection in detections:
            bboxes = detection.boxes.xyxy.cpu().int().tolist()
            class_ids = detection.boxes.cls.cpu().int().tolist()
            for bbox, class_id in zip(bboxes, class_ids):
                x_min, y_min, x_max, y_max = bbox
                self.buffer['bbox'].append(bbox)
                self.buffer['classes'].append(class_id)
                self.buffer['x'].append((x_min + x_max) // 2)
                self.buffer['y'].append((y_min + y_max) // 2)
                self._update_control_area(image_shape, bbox)

                ids_detected.add(class_id)

                if class_id == self.MOVE_GESTURE_ID:
                    self._move_cursor()
                    # Showing the move gesture re-arms the click.
                    self.clicked = False

                # Click only once per continuous click gesture.
                if class_id == self.CLICK_GESTURE_ID and not self.clicked:
                    self._left_click()
                    self.clicked = True

                if class_id in self.DRAG_GESTURE_IDS:
                    if not self.dragging:
                        self._start_drag()
                    self._move_cursor()

        # The drag ends as soon as a frame contains no drag gesture.
        if self.dragging and not ids_detected.intersection(self.DRAG_GESTURE_IDS):
            self._stop_drag()

        if self.EXIT_GESTURE_ID in ids_detected:
            self.exit = True

    def _update_control_area(self, image_shape, bbox):
        """Recompute the control area as the frame inset by a box-sized margin."""
        x_min, y_min, x_max, y_max = bbox
        # Margin per side: half the box size, enlarged by `scale_factor`.
        offset_x = int((x_max - x_min) * (1 + self.scale_factor) / 2)
        offset_y = int((y_max - y_min) * (1 + self.scale_factor) / 2)
        self.control_area['x1'] = offset_x
        self.control_area['y1'] = offset_y
        # Never below 1 so the normalisation in `_move_cursor` cannot divide by zero.
        self.control_area['width'] = max(image_shape[1] - 2 * offset_x, 1)
        self.control_area['height'] = max(image_shape[0] - 2 * offset_y, 1)

    def _move_cursor(self):
        """Ease the cursor towards the screen point matching the averaged centre.

        Does nothing while the position buffer is empty. The new position is
        clamped to the last addressable pixel, ``(width - 1, height - 1)``.
        """
        if not self.buffer['x']:
            return

        avg_x = sum(self.buffer['x']) / len(self.buffer['x'])
        avg_y = sum(self.buffer['y']) / len(self.buffer['y'])

        # Position of the hand inside the control area (0..1 when inside it).
        norm_x = (avg_x - self.control_area['x1']) / self.control_area['width']
        norm_y = (avg_y - self.control_area['y1']) / self.control_area['height']
        target_x = norm_x * self.screen_width
        target_y = norm_y * self.screen_height

        # Cover only `movement_speed` of the remaining distance per update.
        current_x, current_y = self.mouse_controller.position
        new_x = current_x + (target_x - current_x) * self.movement_speed
        new_y = current_y + (target_y - current_y) * self.movement_speed

        # Valid pixel coordinates run from 0 to size - 1.
        max_x = max(self.screen_width - 1, 0)
        max_y = max(self.screen_height - 1, 0)
        self.mouse_controller.position = (min(max(new_x, 0), max_x),
                                          min(max(new_y, 0), max_y))

    def _left_click(self):
        """Send a single left click."""
        self.mouse_controller.click(Button.left)

    def _start_drag(self):
        """Press and hold the left button and mark the drag as active."""
        self.mouse_controller.press(Button.left)
        self.dragging = True

    def _stop_drag(self):
        """Release the left button and mark the drag as finished."""
        self.mouse_controller.release(Button.left)
        self.dragging = False

    def run(self):
        """Capture frames and process them until the exit gesture is seen.

        The exit flag is cleared on entry so the same instance can be run
        again. On the way out, for any reason, a drag that is still active is
        released so the left button is not left held down.

        Raises:
            RuntimeError: If the camera cannot be opened or a frame cannot be
                read.
        """
        cap = cv2.VideoCapture(self.camera_index)
        if not cap.isOpened():
            raise RuntimeError('Не удалось открыть камеру')
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, self.frame_size[0])
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self.frame_size[1])

        # A previous run may have ended via the exit gesture.
        self.exit = False
        try:
            while not self.exit:
                success, frame = cap.read()
                if not success:
                    raise RuntimeError('Не удалось получить кадр с камеры')
                # Horizontal flip (flipCode=1); presumably to mirror the view.
                frame = cv2.flip(frame, 1)
                detections = self.predict_hands(frame)
                self.process_detections(detections, frame.shape)
        finally:
            try:
                if self.dragging:
                    self._stop_drag()
            finally:
                cap.release()
        


In [7]:
# camera_index=2 is specific to the machine this was run on; adjust as needed.
# Pass the DEVICE selected earlier so inference runs on the GPU when available
# instead of always falling back to the constructor's 'cpu' default.
detector = Gestura('YOLOv10n_gestures.pt', camera_index=2,
                   buffer_size=5, scale=0.2, speed=0.2, device=DEVICE)
detector.run()

KeyboardInterrupt: 

In [2]:
import torch
from ultralytics import YOLO

model = YOLO('YOLOv10n_gestures.pt')

# Report the dtype of the first parameter only; loop over the whole of
# named_parameters() instead to see every parameter.
first_entry = next(iter(model.model.named_parameters()), None)
if first_entry is not None:
    name, param = first_entry
    print(f"{name}: {param.dtype}")

model.0.conv.weight: torch.float32
