### Importação das bibliotecas

In [1]:
import cv2 as cv
from ultralytics import YOLO
import random

### Configurações

In [2]:
# Detector weights and the demo video (pass 0 as the source to read from the webcam instead)
MODEL_SOURCE_PATH: str = 'yolov8n.pt'
VIDEO_SOURCE_PATH: str = 'files/vehicle-counting.mp4'

# Size, in pixels, that every frame is resized to before detection
LARGURA_FRAME, ALTURA_FRAME = 640, 480

# Confidence threshold handed to the model
LIMIAR_CONFIANCA: float = 0.3

# Frame stride: with SKIP_FRAMES = 2 only every second frame read is processed
SKIP_FRAMES: int = 2
# Running count of frames read; updated by process_video through `global`
frame_count: int = 0

### Modelo pré-treinado

In [3]:
# Load the pre-trained YOLOv8n weights.
# The second positional parameter of YOLO() is the task name (e.g. "detect"),
# so the former "v8" value was not a valid argument and is omitted.
model = YOLO(MODEL_SOURCE_PATH)

# `model.names` maps class id -> class name; build a list ordered by class id
# so that lista_classes[class_id] is the matching name.
_nomes_por_id = model.names
lista_classes = [_nomes_por_id[i] for i in sorted(_nomes_por_id)]

# Number of classes the model can detect
num_classes = len(lista_classes)

# One colour tuple per class, later passed to OpenCV drawing calls (BGR order).
# A dedicated seeded generator keeps the colours identical across kernel
# restarts and leaves the global `random` state untouched.
SEMENTE_CORES = 42
_rng_cores = random.Random(SEMENTE_CORES)
cores_deteccao = [
    tuple(_rng_cores.randint(0, 255) for _ in range(3))
    for _ in range(num_classes)
]

In [4]:
def _draw_detections(frame, deteccoes) -> None:
    """Draw a bounding box and a "<class> <confidence>%" label per detection.

    Args:
        frame: Image annotated in place through OpenCV drawing calls.
        deteccoes: Iterable of result objects returned by the model; each one
            exposes ``boxes`` whose items provide ``cls``, ``conf`` and ``xyxy``.
    """
    fonte = cv.FONT_HERSHEY_COMPLEX
    escala, espessura = 1, 2

    for deteccao in deteccoes:
        for caixa in deteccao.boxes:
            id_classe = int(caixa.cls[0])
            confianca = float(caixa.conf[0])
            bb = caixa.xyxy[0]
            x1, y1, x2, y2 = int(bb[0]), int(bb[1]), int(bb[2]), int(bb[3])

            # Bounding box in the colour assigned to this class
            cv.rectangle(frame, (x1, y1), (x2, y2), cores_deteccao[id_classe], 3)

            # Confidence is a fraction, so scale it before appending "%"
            rotulo = f"{lista_classes[id_classe]} {confianca * 100:.1f}%"

            # Keep the text baseline inside the image for boxes touching the top edge
            (_, altura_texto), _ = cv.getTextSize(rotulo, fonte, escala, espessura)
            y_texto = max(y1 - 10, altura_texto)

            cv.putText(frame, rotulo, (x1, y_texto), fonte, escala, (255, 255, 255), espessura)


def process_video(source_path: str | int = 0) -> None:
    """Run the detector over a video source and show the annotated frames.

    Frames are read from ``source_path``, every ``SKIP_FRAMES``-th one is
    resized to ``LARGURA_FRAME`` x ``ALTURA_FRAME``, passed to the model and
    displayed with boxes and labels. The loop stops when the source runs out
    of frames or when "q" is pressed.

    Args:
        source_path: Video file path or capture device index passed to
            ``cv.VideoCapture`` (default 0).

    Side effects:
        Resets and then updates the module-level ``frame_count``; opens an
        OpenCV window, which is closed before returning.
    """
    global frame_count

    # Start from zero on every call so the set of skipped frames does not
    # depend on earlier runs in the same kernel.
    frame_count = 0

    # A stride below 1 would raise ZeroDivisionError (or be meaningless) in the modulo below
    passo = max(1, int(SKIP_FRAMES))

    cap = cv.VideoCapture(source_path)
    try:
        if not cap.isOpened():
            print(f"Não foi possível abrir a fonte de vídeo: {source_path!r}")
            return

        while cap.isOpened():
            # Grab the next frame; ret is falsy once no frame can be read
            ret, frame = cap.read()
            if not ret:
                print("FIM!")
                break

            frame_count += 1
            if frame_count % passo != 0:
                continue

            frame = cv.resize(frame, (LARGURA_FRAME, ALTURA_FRAME))

            # NOTE(review): track() is called without persist=True and no track
            # ids are read below; confirm whether plain detection or persistent
            # tracking is what is intended here.
            deteccoes = model.track(source=[frame], conf=LIMIAR_CONFIANCA, save=False, iou=0.70, imgsz=640)

            _draw_detections(frame, deteccoes)

            cv.imshow('Detecção de Objetos', frame)

            # Mask to the low byte so the comparison with ord('q') also works
            # when waitKey returns extra high bits.
            if cv.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the capture and close the window, even on interrupt or error
        cap.release()
        cv.destroyAllWindows()

In [7]:
# Run the detector on the webcam (capture device 0)
process_video(source_path=0)


0: 480x640 1 person, 192.1ms
Speed: 14.0ms preprocess, 192.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 242.7ms
Speed: 3.0ms preprocess, 242.7ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 295.0ms
Speed: 1.0ms preprocess, 295.0ms inference, 4.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 238.9ms
Speed: 3.1ms preprocess, 238.9ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 183.4ms
Speed: 1.0ms preprocess, 183.4ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 173.2ms
Speed: 2.0ms preprocess, 173.2ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 179.7ms
Speed: 3.0ms preprocess, 179.7ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 182.8ms
Speed: 5.0ms preprocess, 182.8ms inference, 2.5ms postprocess per image a

In [6]:
# Run the detector on the demo video file
process_video(VIDEO_SOURCE_PATH)


0: 480x640 2 cars, 219.8ms
Speed: 9.0ms preprocess, 219.8ms inference, 5.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 cars, 1 bus, 220.3ms
Speed: 2.0ms preprocess, 220.3ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 203.4ms
Speed: 5.6ms preprocess, 203.4ms inference, 3.5ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 1 truck, 186.0ms
Speed: 2.0ms preprocess, 186.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 1 truck, 174.2ms
Speed: 2.0ms preprocess, 174.2ms inference, 6.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 221.9ms
Speed: 1.0ms preprocess, 221.9ms inference, 2.4ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 241.3ms
Speed: 3.0ms preprocess, 241.3ms inference, 3.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 cars, 205.8ms
Speed: 2.0ms preprocess, 205.8ms inference, 5.0ms postprocess per