In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import cv2
import torch
from torchvision.transforms import functional as F
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
import torchvision.transforms as transforms

# Pick the compute device: a CUDA GPU when one is visible, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Build Faster R-CNN (ResNet-50 FPN backbone) with the COCO_V1 pretrained weights,
# place it on the selected device and switch it to evaluation (inference) mode.
# Both .to() and .eval() return the module itself, so the calls can be chained.
model = (
    fasterrcnn_resnet50_fpn(weights=FasterRCNN_ResNet50_FPN_Weights.COCO_V1)
    .to(device)
    .eval()
)

# Preprocessing applied to each RGB frame (an H x W x 3 uint8 array) before inference.
#
# torchvision detection models expect plain [C, H, W] float tensors in the 0-1 range:
# the model resizes and normalizes its inputs internally and reports boxes in the
# coordinate space of the tensor it was handed. Resizing to 800x800 or applying
# ImageNet normalization here would distort the aspect ratio, normalize the image
# twice, and produce boxes that no longer line up with the original frame, so the
# pipeline only performs the array -> tensor conversion.
pipeline = transforms.Compose([
    transforms.ToTensor(),  # HWC uint8 [0, 255] -> CHW float32 [0.0, 1.0]
])

# COCO category names, indexed by the label id the detector emits (91 slots).
# The COCO-pretrained weights use the original COCO id space, in which ids
# 12, 26, 29, 30, 45, 66, 68, 69, 71 and 83 are unused. They are kept here as
# "N/A" placeholders so that coco_classes[label_id] stays aligned with the model;
# dropping them shifts every later name onto the wrong id.
coco_classes = [
    "__background__", "person", "bicycle", "car", "motorcycle", "airplane", "bus",
    "train", "truck", "boat", "traffic light", "fire hydrant", "N/A", "stop sign",
    "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "N/A", "backpack", "umbrella", "N/A",
    "N/A", "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard",
    "sports ball", "kite", "baseball bat", "baseball glove", "skateboard",
    "surfboard", "tennis racket", "bottle", "N/A", "wine glass", "cup", "fork",
    "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
    "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
    "potted plant", "bed", "N/A", "dining table", "N/A", "N/A", "toilet", "N/A",
    "TV", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave",
    "oven", "toaster", "sink", "refrigerator", "N/A", "book", "clock", "vase",
    "scissors", "teddy bear", "hair drier", "toothbrush",
]

# Tunable settings for the live demo.
CAMERA_INDEX = 1          # my webcam is on index 1, you may need to change this
SCORE_THRESHOLD = 0.5     # only detections scoring above this are drawn
WINDOW_NAME = "Object Detection"
BOX_COLOR = (0, 255, 0)   # BGR green, used for both the box and its label

# Open webcam feed
cap = cv2.VideoCapture(CAMERA_INDEX)

# try/finally guarantees the camera and the preview window are released even when
# inference raises or the kernel is interrupted in the middle of the loop.
try:
    if not cap.isOpened():
        print(f"Error: Could not open camera {CAMERA_INDEX}.")

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            print("Error: Could not read frame.")
            break

        frame = cv2.flip(frame, 1)  # Mirror the frame horizontally (selfie view)
        # OpenCV delivers BGR; the model expects RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Convert to a tensor, add a batch dimension and move to the model's device
        input_tensor = pipeline(rgb_frame).unsqueeze(0).to(device)

        # Perform object detection (no gradients needed for inference)
        with torch.no_grad():
            predictions = model(input_tensor)[0]  # Predictions for the single image in the batch

        # Boxes are expressed in the coordinate space of the tensor given to the model.
        # Map them back to the displayed frame in case preprocessing changed the size
        # (both factors are 1.0 when the tensor has the same size as the frame).
        frame_h, frame_w = frame.shape[:2]
        scale_x = frame_w / input_tensor.shape[-1]
        scale_y = frame_h / input_tensor.shape[-2]

        # Move the results to the CPU once per frame instead of syncing per detection
        boxes = predictions["boxes"].cpu().tolist()
        scores = predictions["scores"].cpu().tolist()
        labels = predictions["labels"].cpu().tolist()

        # Draw the bounding boxes and labels on the frame
        for (bx1, by1, bx2, by2), score, label_idx in zip(boxes, scores, labels):
            # Only draw boxes for high-confidence detections
            if score <= SCORE_THRESHOLD:
                continue

            x1, y1 = int(bx1 * scale_x), int(by1 * scale_y)
            x2, y2 = int(bx2 * scale_x), int(by2 * scale_y)

            # Guard against label ids outside the class-name table
            if 0 <= label_idx < len(coco_classes):
                label = coco_classes[label_idx]
            else:
                label = "Unknown"

            cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)

            # Place the caption just above the box, but keep it inside the image
            # for boxes that touch the top edge.
            label_text = f"{label}: {score:.2f}"
            text_y = max(y1 - 10, 15)
            cv2.putText(frame, label_text, (x1, text_y),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, BOX_COLOR, 2)

        # Display the frame with detections
        cv2.imshow(WINDOW_NAME, frame)

        # Exit the loop on 'q' key press
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the webcam and close all OpenCV windows
    cap.release()
    cv2.destroyAllWindows()

cuda
