In [2]:
pip install timm

Defaulting to user installation because normal site-packages is not writeable
Collecting timm
  Downloading timm-0.9.5-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: timm
Successfully installed timm-0.9.5
Note: you may need to restart the kernel to use updated packages.


In [3]:
import cv2
import torch
import numpy as np
from PIL import Image
from transformers import DetrImageProcessor, DetrForObjectDetection

# Live object detection on webcam frames with a pretrained DETR model.
#
# Each captured frame is converted from BGR to RGB, run through the DETR
# processor and model, and every detection above SCORE_THRESHOLD is printed.
# The raw frame is shown in a single preview window; press ESC or 'q' while
# the window has focus to stop.

# Tunable settings, kept together so they are easy to find and change.
MODEL_NAME = "facebook/detr-resnet-50"
CAMERA_INDEX = 0
WINDOW_NAME = "frame"
SCORE_THRESHOLD = 0.9   # only keep detections with score > 0.9
POLL_DELAY_MS = 20      # how long waitKey blocks for a key press per iteration
ESC_KEY = 27
QUIT_KEYS = (ESC_KEY, ord('q'))

# Load DETR model
processor = DetrImageProcessor.from_pretrained(MODEL_NAME)
model = DetrForObjectDetection.from_pretrained(MODEL_NAME)

# Open the camera and fail fast if it is unavailable; otherwise the capture
# loop below would spin forever on failed reads.
cap = cv2.VideoCapture(CAMERA_INDEX)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open camera {CAMERA_INDEX}")

# Create the one window that frames are displayed in. It is created up front
# so key presses can be received even before the first frame is shown.
cv2.namedWindow(WINDOW_NAME)

try:
    while True:
        # Capture frame-by-frame
        ret, frame = cap.read()

        # A failed read is treated as transient: skip detection for this
        # iteration but still poll the keyboard below so the user can exit.
        if ret:
            # OpenCV delivers BGR; the DETR processor is given an RGB PIL image.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # Preprocess the image with DETR processor
            inputs = processor(images=pil_image, return_tensors="pt")

            # Inference only: disable gradient tracking so no autograd graph
            # is built (and kept alive) for every frame.
            with torch.no_grad():
                outputs = model(**inputs)

            # Rescale predicted boxes to the original image size.
            # PIL's size is (width, height); reversed it becomes (height, width).
            target_sizes = torch.tensor([pil_image.size[::-1]])
            results = processor.post_process_object_detection(
                outputs, target_sizes=target_sizes, threshold=SCORE_THRESHOLD
            )[0]

            for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
                box = [round(i, 2) for i in box.tolist()]
                print(
                    f"Detected {model.config.id2label[label.item()]} with confidence "
                    f"{round(score.item(), 3)} at location {box}"
                )

            # Display the resulting frame
            cv2.imshow(WINDOW_NAME, frame)

        # Single keyboard poll per iteration. waitKey returns -1 when no key
        # was pressed; masking with 0xFF keeps only the low byte so the
        # comparison is reliable across platforms.
        key = cv2.waitKey(POLL_DELAY_MS) & 0xFF
        if key in QUIT_KEYS:
            break
finally:
    # Always release the capture and destroy the window, including when the
    # cell is stopped with a kernel interrupt or an error occurs mid-loop.
    cap.release()
    cv2.destroyAllWindows()

Detected person with confidence 0.998 at location [23.88, 40.93, 580.4, 475.56]
Detected person with confidence 0.997 at location [23.54, 40.4, 583.61, 475.7]
Detected person with confidence 0.999 at location [26.01, 41.51, 583.62, 475.51]
Detected person with confidence 0.999 at location [24.36, 41.18, 582.84, 475.42]
Detected person with confidence 0.999 at location [25.4, 40.43, 582.7, 475.31]
Detected cell phone with confidence 0.965 at location [51.64, 164.86, 168.77, 369.93]
Detected person with confidence 0.996 at location [27.19, 41.61, 583.6, 475.6]
Detected person with confidence 0.999 at location [21.85, 39.96, 583.7, 475.39]
Detected person with confidence 0.998 at location [25.02, 45.05, 582.33, 475.51]
Detected person with confidence 0.99 at location [0.26, 50.62, 517.27, 475.18]
Detected person with confidence 0.998 at location [0.99, 27.9, 586.26, 474.99]
Detected person with confidence 0.992 at location [0.18, 33.38, 613.58, 474.88]
Detected person with confidence 0.98

KeyboardInterrupt: 