In [3]:
import os
import cv2
import torch
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from detectron2.data import MetadataCatalog
from detectron2 import model_zoo
import numpy as np

# Function to set up the custom Detectron2 model
def get_custom_model(model_path, min_size):
    """Build a Detectron2 ``DefaultPredictor`` for the single-class chair detector.

    The configuration starts from the model-zoo Faster R-CNN R50-FPN 3x
    config and overrides the number of classes, the weights file, the
    test-time score threshold and the test-time minimum input size.

    Args:
        model_path: Path to the trained weights file assigned to
            ``cfg.MODEL.WEIGHTS``.
        min_size: Value assigned to ``cfg.INPUT.MIN_SIZE_TEST``.

    Returns:
        A ``DefaultPredictor`` built from the assembled config.
    """
    cfg = get_cfg()
    cfg.merge_from_file(model_zoo.get_config_file("COCO-Detection/faster_rcnn_R_50_FPN_3x.yaml"))
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # Only one class (chair)
    cfg.MODEL.WEIGHTS = model_path  # Path to the saved model
    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.7  # Detections scoring below this are dropped
    cfg.INPUT.MIN_SIZE_TEST = min_size  # Set minimum size for the input

    # Detectron2's default config targets CUDA; without this fallback the
    # predictor cannot be constructed on a machine that has no usable GPU.
    if not torch.cuda.is_available():
        cfg.MODEL.DEVICE = "cpu"

    # Register the class name under the "chair_val" metadata entry.
    MetadataCatalog.get("chair_val").thing_classes = ["chair"]

    return DefaultPredictor(cfg)

# Function to filter out small boxes
def is_large_box(box, min_width=100, min_height=100):
    """Tell whether a box is strictly larger than the minimum size on both axes.

    Args:
        box: Four values in the order ``(x1, y1, x2, y2)``.
        min_width: Width that ``x2 - x1`` must exceed.
        min_height: Height that ``y2 - y1`` must exceed.

    Returns:
        Truthy when both the width and the height exceed their minimums;
        a box exactly at a minimum does not qualify.
    """
    left, top, right, bottom = box
    width = right - left
    height = bottom - top
    return width > min_width and height > min_height

# Function to process the video
def process_video_with_custom_model(input_path, model_path, min_size, output_path):
    """Run the chair detector over a video, annotate each frame and save the result.

    Each frame is passed through the predictor; detections that pass the
    score and size filters are stabilized against the previous frame's
    detections, drawn onto the frame, written to the output video and shown
    in a preview window. Pressing ``q`` in the preview window stops early.

    The capture, the writer and the preview window are released even when
    processing is interrupted (for example by ``KeyboardInterrupt``), so the
    frames written so far are still finalized in the output file.

    Args:
        input_path: Path of the video to read.
        model_path: Path of the trained weights, forwarded to
            ``get_custom_model``.
        min_size: Test-time minimum input size, forwarded to
            ``get_custom_model``.
        output_path: Path of the annotated video to write.

    Returns:
        None. Errors opening the input or output video are reported with a
        printed message and an early return.
    """
    cap = cv2.VideoCapture(input_path)

    if not cap.isOpened():
        print('Error while trying to read video')
        cap.release()
        return

    out = None
    try:
        # Get frame width, height and frame rate from the input video
        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Define codec and create VideoWriter object
        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))
        if not out.isOpened():
            # Without this check every write would be dropped silently and
            # the function would still claim the video had been saved.
            print('Error while trying to open output video for writing')
            return

        predictor = get_custom_model(model_path, min_size)

        frame_count = 0
        previous_boxes = []
        previous_scores = []

        # Read until end of video
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            instances = predictor(frame)["instances"].to("cpu")
            boxes = instances.pred_boxes.tensor.numpy()
            scores = instances.scores.numpy()

            # Confidence threshold and size filter
            current_boxes = []
            current_scores = []
            for box, score in zip(boxes, scores):
                if score > 0.7 and is_large_box(box):
                    current_boxes.append(box)
                    current_scores.append(score)

            # Stabilize BEFORE drawing so the smoothing is visible in the
            # rendered frame instead of only affecting the next comparison.
            current_boxes, current_scores = _stabilize_detections(
                current_boxes, current_scores, previous_boxes, previous_scores
            )

            for box, score in zip(current_boxes, current_scores):
                x1, y1, x2, y2 = map(int, box)
                label_name = f"chair: {score:.2f}"
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)  # Red bounding box
                cv2.putText(frame, label_name, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2)

            previous_boxes = current_boxes
            previous_scores = current_scores

            print(f"Frame {frame_count}: Detected {len(current_boxes)} instances")

            # Write the frame with detections to the output video
            out.write(frame)

            # Display the frame
            cv2.imshow("Chair Detection", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

            frame_count += 1
    finally:
        # Always release resources so an interrupted run still finalizes
        # the output file and closes the preview window.
        cap.release()
        if out is not None:
            out.release()
        cv2.destroyAllWindows()

    print(f"Processed video saved as {output_path}")


def _stabilize_detections(boxes, scores, previous_boxes, previous_scores,
                          atol=10, max_score_delta=0.1):
    """Snap each detection to a near-identical detection from the previous frame.

    A detection is replaced by the first previous detection whose box
    coordinates are all close (``np.allclose`` with ``atol``) and whose score
    differs by less than ``max_score_delta``; otherwise it is kept unchanged.

    Returns:
        A ``(boxes, scores)`` pair of new lists, in the same order as the input.
    """
    stable_boxes = []
    stable_scores = []
    for box, score in zip(boxes, scores):
        for prev_box, prev_score in zip(previous_boxes, previous_scores):
            if (np.allclose(prev_box, box, atol=atol)
                    and abs(score - prev_score) < max_score_delta):
                box, score = prev_box, prev_score
                break
        stable_boxes.append(box)
        stable_scores.append(score)
    return stable_boxes, stable_scores

# Run configuration: source video, trained weights, test-time minimum input
# size and destination of the annotated video.
min_size = 800
input_path = (
    "D:/ARTIFICIAL INTELLIGENCE/SEMESTER 1/Computer Vision and Deep Learning/"
    "CV_PROJECT_LAZY_TRAIN/human-pose-estimation-opencv/Chair_occupancy/"
    "Training_chair_detection_FasterRCNN/videoDet3.mp4"
)
model_path = (
    "D:/ARTIFICIAL INTELLIGENCE/SEMESTER 1/Computer Vision and Deep Learning/"
    "CV_PROJECT_LAZY_TRAIN/human-pose-estimation-opencv/Chair_occupancy/"
    "Training_chair_detection_FasterRCNN/output/model_final.pth"
)
output_path = (
    "D:/ARTIFICIAL INTELLIGENCE/SEMESTER 1/Computer Vision and Deep Learning/"
    "CV_PROJECT_LAZY_TRAIN/human-pose-estimation-opencv/Chair_occupancy/"
    "Training_chair_detection_FasterRCNN/video_output/output3_video_with_custom_model.mp4"
)

# Process the video with the settings above.
process_video_with_custom_model(
    input_path=input_path,
    model_path=model_path,
    min_size=min_size,
    output_path=output_path,
)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Frame 0: Detected 0 instances
Frame 1: Detected 0 instances
Frame 2: Detected 0 instances
Frame 3: Detected 0 instances
Frame 4: Detected 0 instances
Frame 5: Detected 0 instances
Frame 6: Detected 0 instances
Frame 7: Detected 0 instances
Frame 8: Detected 0 instances
Frame 9: Detected 0 instances
Frame 10: Detected 0 instances
Frame 11: Detected 0 instances
Frame 12: Detected 0 instances
Frame 13: Detected 0 instances
Frame 14: Detected 0 instances
Frame 15: Detected 0 instances
Frame 16: Detected 0 instances
Frame 17: Detected 0 instances
Frame 18: Detected 0 instances
Frame 19: Detected 0 instances
Frame 20: Detected 0 instances
Frame 21: Detected 0 instances
Frame 22: Detected 0 instances
Frame 23: Detected 0 instances
Frame 24: Detected 0 instances
Frame 25: Detected 0 instances
Frame 26: Detected 0 instances
Frame 27: Detected 0 instances
Frame 28: Detected 0 instances
Frame 29: Detected 0 instances
Frame 30: Detected 0 instances
Frame 31: Detected 0 instances
Frame 32: Detected

KeyboardInterrupt: 