In [2]:
import numpy as np
import cv2
from ultralytics import YOLO
import supervision as sv
import pandas as pd

# Load YOLOv8 model
model = YOLO("yolov8n.pt")
box_annotator = sv.BoxAnnotator()

# One row is appended by process_frame each time a tracked person leaves a zone.
columns = ['Person ID', 'Counter', 'Time taken']
df = pd.DataFrame(columns=columns)

# Define zones as closed polygons (the first vertex is repeated at the end),
# expressed in the same coordinate space as the detection boxes.
# The dtype is pinned to int32 because cv2.pointPolygonTest expects an
# int32/float32 contour, while NumPy's default integer type is
# platform-dependent (commonly int64), which OpenCV rejects.
zone_1 = np.array(
    [[747, 622], [707, 38], [807, 22], [931, 654], [747, 622]],
    dtype=np.int32,
)
zone_2 = np.array(
    [[1039, 62], [1243, 546], [1271, 502], [1231, 286], [1107, 34], [1039, 62]],
    dtype=np.int32,
)
zones = [zone_1, zone_2]
# NOTE(review): not referenced by the code visible in this cell; kept so any
# later cell relying on the name keeps working.
zone_annotator = sv.PolygonAnnotator()

tracker = sv.ByteTrack()  # ByteTrack tracker object
# Maps str(tracker id) -> {'entry_frame': int, 'zone': '1' | '2'} for every
# person currently considered to be inside a zone.
people_enter_queue = {}
# Number of frames process_frame has handled so far; used as the clock.
frame_count = 0
# Opened only so process_frame can read the clip's FPS; frames themselves are
# read by sv.process_video.
cap = cv2.VideoCapture("Retail.mp4")
def process_frame(frame: np.ndarray, i) -> np.ndarray:
    """Detect and track people in one frame and time their stay in the zones.

    For every tracked detection, the centre of its box is tested against
    ``zone_1`` and ``zone_2``. The first frame in which a track is seen inside
    a zone is remembered in ``people_enter_queue``; when the same track is
    later seen outside both zones, a row ``[track_id, zone, seconds]`` is
    appended to ``df`` and the entry is dropped.

    NOTE: a row is only written when a track is *observed outside* both zones.
    A track that stops being reported while still inside a zone stays in
    ``people_enter_queue`` and never reaches ``df``. The zone recorded is the
    one the track first entered; it is not updated afterwards.

    Args:
        frame: Image to analyse. Boxes and labels for people inside a zone are
            drawn onto it in place before an annotated copy is made.
        i: Frame index supplied by the caller. Unused: the module-level
            ``frame_count`` serves as the clock.

    Returns:
        An annotated copy of ``frame`` with tracker boxes and per-zone counts.
    """
    global frame_count

    results = model(frame)
    detections = sv.Detections.from_ultralytics(results[0])
    # Keep only confident detections of class_id 0 (the people being timed).
    keep = (detections.class_id == 0) & (detections.confidence > 0.5)
    detections = detections[keep]

    # Update tracker with detections
    detections = tracker.update_with_detections(detections)

    boxes = detections.xyxy
    track_ids = detections.tracker_id
    if track_ids is None:
        # np.array(None) would be a 0-d array that zip() cannot iterate;
        # treat "no tracker ids" as "nothing to process".
        track_ids = np.empty(0, dtype=int)
    elif not isinstance(track_ids, np.ndarray):
        track_ids = np.array(track_ids)

    # The frame rate does not change between detections, so read it once.
    fps = cap.get(cv2.CAP_PROP_FPS)

    num_people_in_zone1 = 0
    num_people_in_zone2 = 0

    for box, track_id in zip(boxes, track_ids):
        x1, y1, x2, y2 = map(int, box)
        center = ((x1 + x2) / 2, (y1 + y2) / 2)

        # measureDist=False returns +1 / 0 / -1; "> 0" means strictly inside,
        # so a centre lying exactly on the polygon edge counts as outside.
        in_zone_1 = cv2.pointPolygonTest(zone_1, center, False) > 0
        in_zone_2 = cv2.pointPolygonTest(zone_2, center, False) > 0

        key = str(track_id)

        if not (in_zone_1 or in_zone_2):
            # Seen outside both zones: close the visit if one was open.
            visit = people_enter_queue.pop(key, None)
            if visit is not None:
                time_spent = (frame_count - visit['entry_frame']) / fps
                zone = visit['zone']
                df.loc[len(df)] = [track_id, zone, time_spent]
                print(f"Person ID: {track_id} spent {time_spent:.2f} seconds in {zone}")
            continue

        # A centre inside both polygons is counted for zone 1 only.
        if in_zone_1:
            num_people_in_zone1 += 1
        else:
            num_people_in_zone2 += 1

        # Record the entry on first sight; later frames reuse the stored entry.
        visit = people_enter_queue.setdefault(
            key,
            {
                'entry_frame': frame_count,
                'zone': '1' if in_zone_1 else '2',
            },
        )

        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

        time_spent = (frame_count - visit['entry_frame']) / fps
        cv2.putText(frame, f"ID: {track_id}", (x1, y1 - 25), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 2)
        cv2.putText(frame, f"Time: {time_spent:.2f}s", (x1, y1 - 5), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 255, 0), 2)

    # Draw the tracker boxes on a copy, then overlay the per-zone head counts.
    # (The zone polygons themselves are not drawn.)
    frame = box_annotator.annotate(frame.copy(), detections=detections)
    cv2.putText(frame, f'People in counter 1: {num_people_in_zone1}', (50, 50), cv2.FONT_HERSHEY_SIMPLEX,
                1, (255, 255, 255), 2, cv2.LINE_AA)
    cv2.putText(frame, f'People in counter 2: {num_people_in_zone2}', (50, 100), cv2.FONT_HERSHEY_SIMPLEX,
                1, (255, 255, 255), 2, cv2.LINE_AA)

    frame_count += 1

    return frame

# Run process_frame over the source clip and write the annotated result.
try:
    sv.process_video(
        source_path="Retail.mp4",
        target_path="result.mp4",
        callback=process_frame,
    )
finally:
    # `cap` is only consulted for FPS inside process_frame; release the
    # underlying video handle once no more frames will be processed.
    cap.release()

# Persist the per-visit wait times collected in `df` by process_frame.
df.to_csv('waittime.csv')


0: 384x640 5 persons, 1 tv, 80.8ms
Speed: 4.0ms preprocess, 80.8ms inference, 9.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 2 tvs, 66.7ms
Speed: 4.9ms preprocess, 66.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 3 tvs, 53.9ms
Speed: 1.7ms preprocess, 53.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 2 tvs, 50.7ms
Speed: 1.8ms preprocess, 50.7ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 tv, 61.9ms
Speed: 1.7ms preprocess, 61.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 tv, 62.2ms
Speed: 2.1ms preprocess, 62.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 tv, 90.7ms
Speed: 3.5ms preprocess, 90.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 persons, 1 tv, 107.2ms
Speed: 4.6ms preprocess, 107.