In [None]:
#dependencies
!pip install pyautogui
!pip install ultralytics

The cell below runs the video detection. The variables at the top of the cell set the model file locations and the adjustable detection parameters.

The code for running the models on the live video feed is adapted from: https://dipankarmedh1.medium.com/real-time-object-detection-with-yolo-and-webcam-enhancing-your-computer-vision-skills-861b97c78993

Due to the video feed and keyboard output functions, this will not run in Colab.

In [None]:
# --- Model files -----------------------------------------------------------
# Weights for the hand detection model.
detectionModelFile = "HandDetection.pt"
# Weights for the gesture classification model.
classificationModelFile = "GestureClassifier.pt"

# --- Detection tuning ------------------------------------------------------
# Minimum hand-detection confidence required before the detected region is
# handed to the gesture model.
handConfidenceThreshhold = 0.3
# Extra pixels added on every side of a hand detection before cropping.
cropPadding = 32
# Number of recent gesture results kept for deciding on an action.
actionMemorySize = 20
# Number of remembered results that must be the same gesture to trigger it.
actionThreshhold = 10

# --- Keyboard output -------------------------------------------------------
# Keys sent for each action; adjust to match the presentation software.
keyNextSlide = "right"
keyPrevSlide = "left"
keyLastSlide = "t"
keyFirstSlide = "r"

import ultralytics
from ultralytics import YOLO
from IPython.display import Image
from ultralytics.utils.plotting import Annotator
import cv2
import math
import numpy as np
from pyautogui import press, hotkey

# Load both models before touching the camera, so that a missing or invalid
# weights file does not leave the capture device open.
model = YOLO(detectionModelFile)
model2 = YOLO(classificationModelFile)

# Class names for the hand detector (not referenced by the code in this cell).
classNames = ["hand"]
# Maps the gesture model's class index to an action name; indices that have no
# slide action assigned are labelled 'none' and are ignored by the loop below.
classNames2 = ['none', 'none', 'PreviousSlide', 'none', 'none', 'none', 'none', 'NextSlide', 'LastSlide', 'none', 'none', 'none', 'none', 'none', 'FirstSlide', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none', 'none']

# Open the default camera and request a 640x480 stream.
cap = cv2.VideoCapture(0)
if not cap.isOpened():
    # Fail with a clear message instead of crashing later on an empty frame.
    raise RuntimeError("Could not open the default camera (index 0).")
cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)

# Rolling list of recently detected gestures, newest first.
actionMemory = []

# Main capture loop.
#
# For every frame: detect hands, classify the gesture inside each padded hand
# crop, push the result onto a rolling memory and, once one gesture fills enough
# of that memory, press the key mapped to it. Press 'q' in the preview window
# to stop.
font = cv2.FONT_HERSHEY_SIMPLEX
white = (255, 255, 255)

# (on-screen label, gesture class name, key to press, top y of the meter bar,
#  fill colour of the meter bar). Order also defines trigger priority.
gestureActions = [
    ("Next Slide", "NextSlide", keyNextSlide, 40, (255, 0, 0)),
    ("Previous Slide", "PreviousSlide", keyPrevSlide, 90, (0, 150, 128)),
    ("Last Slide", "LastSlide", keyLastSlide, 140, (0, 0, 255)),
    ("First Slide", "FirstSlide", keyFirstSlide, 190, (255, 255, 0)),
]

try:
    while True:
        success, img = cap.read()
        if not success:
            # No frame available (camera disconnected or stream ended).
            print("Failed to read a frame from the camera; stopping.")
            break

        # Use the real frame size for clamping; the camera may not honour the
        # requested resolution.
        frameHeight, frameWidth = img.shape[:2]
        # Untouched copy used for cropping, so boxes and labels drawn on `img`
        # never end up in the gesture model's input.
        cleanFrame = img.copy()

        results = model(img, stream=True)

        actionsDetected = []

        for r in results:
            for box in r.boxes:

                # hand detection confidence, rounded up to two decimals
                confidence = math.ceil((box.conf[0]*100))/100
                print("Confidence --->",confidence)

                if confidence <= handConfidenceThreshhold:
                    continue

                # padded bounding box, clamped to the frame
                x1, y1, x2, y2 = box.xyxy[0]
                x1 = max(int(x1) - cropPadding, 0)
                y1 = max(int(y1) - cropPadding, 0)
                x2 = min(int(x2) + cropPadding, frameWidth - 1)
                y2 = min(int(y2) + cropPadding, frameHeight - 1)

                handImg = cleanFrame[y1:y2, x1:x2]
                if handImg.size == 0:
                    # Degenerate box; nothing to classify.
                    continue

                # put the hand box in the preview
                cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 255), 3)

                detectionResults = model2(handImg, stream=True)

                for h in detectionResults:
                    for hbox in h.boxes:
                        # Gesture box is relative to the crop; add x1/y1 to
                        # place it correctly in the uncropped frame.
                        x3, y3, x4, y4 = hbox.xyxy[0]
                        x3, y3 = int(x3) + x1, int(y3) + y1
                        x4, y4 = int(x4) + x1, int(y4) + y1

                        # put the gesture box in the preview
                        cv2.rectangle(img, (x3, y3), (x4, y4), white, 3)

                        # gesture confidence
                        gestureConfidence = math.ceil((hbox.conf[0]*100))/100
                        print("Confidence --->",gestureConfidence)

                        # class name
                        cls = int(hbox.cls[0])
                        classDetected = classNames2[cls]
                        print("Class name -->", classDetected)

                        cv2.putText(img, classDetected, (x3, y3), font, 1, white, 2)

                        # remember real gestures only, newest first
                        if classDetected != "none":
                            actionsDetected = [classDetected] + actionsDetected

        # If no gesture was detected this frame, record a blank entry instead,
        # then trim the memory to its configured size.
        actionMemory = (actionsDetected or ["none"]) + actionMemory
        actionMemory = actionMemory[:actionMemorySize]

        # How often each gesture currently appears in the memory.
        counts = {gesture: actionMemory.count(gesture) for _, gesture, _, _, _ in gestureActions}

        # Display one meter per gesture: filled bar = current count,
        # white outline = count needed to trigger the action.
        for label, gesture, _, barTop, barColor in gestureActions:
            cv2.putText(img, label, (10, barTop - 10), font, 0.5, white, 1)
            cv2.rectangle(img, (10, barTop), (10 + counts[gesture]*10, barTop + 10), barColor, -1)
            cv2.rectangle(img, (10, barTop), (10 + actionThreshhold*10, barTop + 10), white, 1)

        # Take at most one action per frame. The memory is cleared when an
        # action fires, so any other gesture that also reached the threshold
        # must not send a second (possibly contradictory) key press.
        for _, gesture, key, _, _ in gestureActions:
            if counts[gesture] >= actionThreshhold:
                actionMemory = []
                press(key)
                break

        cv2.imshow('Webcam', img)
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Always free the camera and close the preview window, even if the loop
    # exits through an exception.
    cap.release()
    cv2.destroyAllWindows()


0: 480x640 (no detections), 48.4ms
Speed: 2.5ms preprocess, 48.4ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 44.4ms
Speed: 1.0ms preprocess, 44.4ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 84.2ms
Speed: 1.5ms preprocess, 84.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 78.9ms
Speed: 2.0ms preprocess, 78.9ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 49.7ms
Speed: 2.5ms preprocess, 49.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 42.1ms
Speed: 1.0ms preprocess, 42.1ms inference, 0.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 54.2ms
Speed: 8.0ms preprocess, 54.2ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 44.4ms
Speed: 1.0ms preprocess, 44.4ms i