**YOLOV8 IMPORT**

In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.2.39-py3-none-any.whl (792 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m792.7/792.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.0-py3-none-any.whl (25 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.8.0->ultralytics)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-c

In [2]:
import ultralytics

from ultralytics import YOLO
model = YOLO("yolov8n.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.2.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.23M/6.23M [00:00<00:00, 23.0MB/s]


**DEEPSORT IMPORT**

In [3]:
!git clone https://github.com/granthikhalder/deep_sort

Cloning into 'yolov8_deepsort'...
remote: Enumerating objects: 84, done.[K
remote: Counting objects: 100% (48/48), done.[K
remote: Compressing objects: 100% (43/43), done.[K
remote: Total 84 (delta 4), reused 45 (delta 4), pack-reused 36[K
Receiving objects: 100% (84/84), 51.25 MiB | 19.03 MiB/s, done.
Resolving deltas: 100% (5/5), done.


In [5]:
from deep_sort.deep_sort import DeepSort

deep_sort_weights = 'deep_sort/deep/checkpoint/ckpt.t7'
deep_sort = DeepSort(model_path=deep_sort_weights)

**OBJECT CLASS NAMES**

In [6]:
class_names = model.names

**YOLOV8 & DEEPSORT IMPLEMENTATION**

In [10]:
import cv2
import numpy as np
from tqdm import tqdm
import torch

input_video_path = 'input.mp4'
output_video_path = 'output_yolo_deepsort.mp4'
cap = cv2.VideoCapture(input_video_path)

# Video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# Initialize VideoWriter
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

# Progress bar
pbar = tqdm(total=total_frames, desc='Processing video')

track_id_to_class_id = {}

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # YOLO
    results = model(frame)

    detections = results[0].boxes.data.cpu().numpy()

    bbox_xywh = []
    confidences = []
    class_ids = []

    for detection in detections:
        x1, y1, x2, y2, conf, cls = detection[:6]
        x_center = (x1 + x2) / 2
        y_center = (y1 + y2) / 2
        width = x2 - x1
        height = y2 - y1
        bbox_xywh.append([x_center, y_center, width, height])
        confidences.append(conf)
        class_ids.append(int(cls))

    bbox_xywh = np.array(bbox_xywh)
    confidences = np.array(confidences)
    class_ids = np.array(class_ids)

    # DEEPSORT Tracker
    outputs = deep_sort.update(bbox_xywh, confidences, frame)

    # Draw the tracking results
    for output in outputs:
        x1, y1, x2, y2, track_id = output[:5]

        # Map the track ID to the class ID
        if track_id not in track_id_to_class_id:
            track_id_to_class_id[track_id] = class_ids[0]  # assign the first class ID for this example

        class_id = track_id_to_class_id[track_id]
        class_name = class_names[class_id]

        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f"ID: {track_id} {class_name}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Output video
    out.write(frame)

    # Progress bar Update
    pbar.update(1)

cap.release()
out.release()
cv2.destroyAllWindows()
pbar.close()


Processing video:   0%|          | 0/350 [00:00<?, ?it/s][A




Processing video:   7%|▋         | 25/350 [00:14<03:05,  1.75it/s]

0: 384x640 2 persons, 163.4ms
Speed: 6.5ms preprocess, 163.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)




Processing video:   0%|          | 1/350 [00:00<01:57,  2.98it/s][A


0: 384x640 3 persons, 148.0ms
Speed: 3.7ms preprocess, 148.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   1%|          | 2/350 [00:00<02:04,  2.80it/s][A


0: 384x640 3 persons, 192.5ms
Speed: 5.4ms preprocess, 192.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   1%|          | 3/350 [00:01<02:17,  2.53it/s][A


0: 384x640 4 persons, 160.3ms
Speed: 3.8ms preprocess, 160.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   1%|          | 4/350 [00:01<02:24,  2.40it/s][A


0: 384x640 4 persons, 203.1ms
Speed: 12.9ms preprocess, 203.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   1%|▏         | 5/350 [00:02<02:36,  2.21it/s][A


0: 384x640 4 persons, 148.5ms
Speed: 4.0ms preprocess, 148.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   2%|▏         | 6/350 [00:02<02:35,  2.21it/s][A


0: 384x640 5 persons, 151.3ms
Speed: 4.1ms preprocess, 151.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   2%|▏         | 7/350 [00:03<02:44,  2.09it/s][A


0: 384x640 3 persons, 146.1ms
Speed: 3.9ms preprocess, 146.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   2%|▏         | 8/350 [00:03<02:31,  2.25it/s][A


0: 384x640 3 persons, 146.6ms
Speed: 3.8ms preprocess, 146.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   3%|▎         | 9/350 [00:03<02:26,  2.33it/s][A


0: 384x640 4 persons, 162.6ms
Speed: 4.2ms preprocess, 162.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   3%|▎         | 10/350 [00:04<02:28,  2.29it/s][A


0: 384x640 4 persons, 162.9ms
Speed: 4.9ms preprocess, 162.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   3%|▎         | 11/350 [00:04<02:32,  2.23it/s][A


0: 384x640 2 persons, 171.6ms
Speed: 4.4ms preprocess, 171.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   3%|▎         | 12/350 [00:05<02:20,  2.40it/s][A


0: 384x640 2 persons, 1 bird, 151.2ms
Speed: 4.0ms preprocess, 151.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   4%|▎         | 13/350 [00:05<02:17,  2.45it/s][A


0: 384x640 3 persons, 173.6ms
Speed: 4.0ms preprocess, 173.6ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   4%|▍         | 14/350 [00:05<02:18,  2.43it/s][A


0: 384x640 3 persons, 1 dog, 268.6ms
Speed: 3.9ms preprocess, 268.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   4%|▍         | 15/350 [00:06<02:47,  2.00it/s][A


0: 384x640 3 persons, 275.4ms
Speed: 4.0ms preprocess, 275.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   5%|▍         | 16/350 [00:07<02:58,  1.87it/s][A


0: 384x640 3 persons, 1 skateboard, 269.2ms
Speed: 3.8ms preprocess, 269.2ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   5%|▍         | 17/350 [00:07<03:15,  1.70it/s][A


0: 384x640 3 persons, 1 skateboard, 306.8ms
Speed: 3.9ms preprocess, 306.8ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   5%|▌         | 18/350 [00:08<03:32,  1.56it/s][A


0: 384x640 3 persons, 1 skateboard, 287.4ms
Speed: 6.8ms preprocess, 287.4ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   5%|▌         | 19/350 [00:09<03:36,  1.53it/s][A


0: 384x640 3 persons, 1 skateboard, 183.4ms
Speed: 3.9ms preprocess, 183.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   6%|▌         | 20/350 [00:09<03:19,  1.65it/s][A


0: 384x640 3 persons, 1 backpack, 150.1ms
Speed: 3.7ms preprocess, 150.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   6%|▌         | 21/350 [00:10<03:03,  1.79it/s][A


0: 384x640 4 persons, 149.7ms
Speed: 5.3ms preprocess, 149.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   6%|▋         | 22/350 [00:10<02:53,  1.90it/s][A


0: 384x640 4 persons, 152.8ms
Speed: 3.7ms preprocess, 152.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   7%|▋         | 23/350 [00:11<02:45,  1.98it/s][A


0: 384x640 4 persons, 159.4ms
Speed: 6.5ms preprocess, 159.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   7%|▋         | 24/350 [00:11<02:40,  2.03it/s][A


0: 384x640 3 persons, 181.0ms
Speed: 3.8ms preprocess, 181.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   7%|▋         | 25/350 [00:12<02:31,  2.14it/s][A


0: 384x640 4 persons, 163.8ms
Speed: 3.9ms preprocess, 163.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   7%|▋         | 26/350 [00:12<02:30,  2.15it/s][A


0: 384x640 3 persons, 188.1ms
Speed: 7.1ms preprocess, 188.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   8%|▊         | 27/350 [00:13<02:25,  2.21it/s][A


0: 384x640 3 persons, 148.1ms
Speed: 14.8ms preprocess, 148.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   8%|▊         | 28/350 [00:13<02:20,  2.30it/s][A


0: 384x640 2 persons, 158.2ms
Speed: 5.0ms preprocess, 158.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   8%|▊         | 29/350 [00:13<02:09,  2.47it/s][A


0: 384x640 3 persons, 161.1ms
Speed: 6.3ms preprocess, 161.1ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   9%|▊         | 30/350 [00:14<02:07,  2.51it/s][A


0: 384x640 3 persons, 1 suitcase, 150.3ms
Speed: 9.9ms preprocess, 150.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   9%|▉         | 31/350 [00:14<02:11,  2.42it/s][A


0: 384x640 3 persons, 189.8ms
Speed: 9.7ms preprocess, 189.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   9%|▉         | 32/350 [00:15<02:12,  2.41it/s][A


0: 384x640 3 persons, 1 suitcase, 157.2ms
Speed: 3.8ms preprocess, 157.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:   9%|▉         | 33/350 [00:15<02:16,  2.33it/s][A


0: 384x640 3 persons, 180.5ms
Speed: 15.4ms preprocess, 180.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  10%|▉         | 34/350 [00:15<02:16,  2.32it/s][A


0: 384x640 3 persons, 1 skateboard, 148.9ms
Speed: 3.6ms preprocess, 148.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  10%|█         | 35/350 [00:16<02:17,  2.29it/s][A


0: 384x640 3 persons, 1 skateboard, 196.7ms
Speed: 3.8ms preprocess, 196.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  10%|█         | 36/350 [00:16<02:25,  2.16it/s][A


0: 384x640 3 persons, 189.4ms
Speed: 11.1ms preprocess, 189.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  11%|█         | 37/350 [00:17<02:21,  2.21it/s][A


0: 384x640 3 persons, 173.3ms
Speed: 4.2ms preprocess, 173.3ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  11%|█         | 38/350 [00:17<02:16,  2.29it/s][A


0: 384x640 3 persons, 176.7ms
Speed: 9.1ms preprocess, 176.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  11%|█         | 39/350 [00:18<02:13,  2.34it/s][A


0: 384x640 3 persons, 144.7ms
Speed: 4.3ms preprocess, 144.7ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  11%|█▏        | 40/350 [00:18<02:07,  2.44it/s][A


0: 384x640 3 persons, 166.8ms
Speed: 4.4ms preprocess, 166.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  12%|█▏        | 41/350 [00:18<02:06,  2.45it/s][A


0: 384x640 3 persons, 3 skateboards, 149.5ms
Speed: 5.1ms preprocess, 149.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  12%|█▏        | 42/350 [00:19<02:26,  2.10it/s][A


0: 384x640 3 persons, 1 skateboard, 258.3ms
Speed: 5.3ms preprocess, 258.3ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  12%|█▏        | 43/350 [00:20<02:44,  1.86it/s][A


0: 384x640 3 persons, 1 suitcase, 254.3ms
Speed: 8.3ms preprocess, 254.3ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  13%|█▎        | 44/350 [00:20<02:58,  1.71it/s][A


0: 384x640 3 persons, 1 suitcase, 241.5ms
Speed: 4.7ms preprocess, 241.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  13%|█▎        | 45/350 [00:21<03:05,  1.64it/s][A


0: 384x640 3 persons, 284.8ms
Speed: 8.0ms preprocess, 284.8ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  13%|█▎        | 46/350 [00:22<03:07,  1.62it/s][A


0: 384x640 3 persons, 1 skateboard, 227.1ms
Speed: 3.9ms preprocess, 227.1ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  13%|█▎        | 47/350 [00:22<03:10,  1.59it/s][A


0: 384x640 4 persons, 195.0ms
Speed: 3.9ms preprocess, 195.0ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  14%|█▎        | 48/350 [00:23<02:56,  1.71it/s][A


0: 384x640 3 persons, 1 skateboard, 175.7ms
Speed: 9.1ms preprocess, 175.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  14%|█▍        | 49/350 [00:23<02:47,  1.79it/s][A


0: 384x640 3 persons, 157.4ms
Speed: 5.5ms preprocess, 157.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  14%|█▍        | 50/350 [00:24<02:31,  1.98it/s][A


0: 384x640 3 persons, 1 skateboard, 150.5ms
Speed: 3.9ms preprocess, 150.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  15%|█▍        | 51/350 [00:24<02:23,  2.09it/s][A


0: 384x640 3 persons, 1 skateboard, 173.8ms
Speed: 4.5ms preprocess, 173.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  15%|█▍        | 52/350 [00:25<02:22,  2.10it/s][A


0: 384x640 3 persons, 168.7ms
Speed: 8.2ms preprocess, 168.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  15%|█▌        | 53/350 [00:25<02:15,  2.19it/s][A


0: 384x640 3 persons, 147.1ms
Speed: 4.1ms preprocess, 147.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  15%|█▌        | 54/350 [00:25<02:08,  2.31it/s][A


0: 384x640 3 persons, 157.3ms
Speed: 5.8ms preprocess, 157.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  16%|█▌        | 55/350 [00:26<02:02,  2.40it/s][A


0: 384x640 4 persons, 149.2ms
Speed: 3.8ms preprocess, 149.2ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  16%|█▌        | 56/350 [00:26<02:04,  2.37it/s][A


0: 384x640 4 persons, 173.1ms
Speed: 7.3ms preprocess, 173.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  16%|█▋        | 57/350 [00:27<02:08,  2.27it/s][A


0: 384x640 3 persons, 145.1ms
Speed: 5.6ms preprocess, 145.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  17%|█▋        | 58/350 [00:27<02:01,  2.39it/s][A


0: 384x640 2 persons, 154.1ms
Speed: 3.7ms preprocess, 154.1ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  17%|█▋        | 59/350 [00:27<01:54,  2.55it/s][A


0: 384x640 2 persons, 160.4ms
Speed: 3.8ms preprocess, 160.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  17%|█▋        | 60/350 [00:28<01:48,  2.67it/s][A


0: 384x640 5 persons, 151.4ms
Speed: 3.7ms preprocess, 151.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  17%|█▋        | 61/350 [00:28<01:58,  2.45it/s][A


0: 384x640 5 persons, 166.6ms
Speed: 4.1ms preprocess, 166.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  18%|█▊        | 62/350 [00:29<02:06,  2.28it/s][A


0: 384x640 4 persons, 153.9ms
Speed: 4.3ms preprocess, 153.9ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  18%|█▊        | 63/350 [00:29<02:06,  2.28it/s][A


0: 384x640 4 persons, 1 skateboard, 191.6ms
Speed: 3.9ms preprocess, 191.6ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  18%|█▊        | 64/350 [00:30<02:13,  2.14it/s][A


0: 384x640 4 persons, 143.3ms
Speed: 3.7ms preprocess, 143.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  19%|█▊        | 65/350 [00:30<02:10,  2.19it/s][A


0: 384x640 6 persons, 1 skateboard, 173.2ms
Speed: 4.3ms preprocess, 173.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  19%|█▉        | 66/350 [00:31<02:28,  1.92it/s][A


0: 384x640 4 persons, 201.8ms
Speed: 12.8ms preprocess, 201.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  19%|█▉        | 67/350 [00:31<02:26,  1.93it/s][A


0: 384x640 5 persons, 1 skateboard, 145.7ms
Speed: 3.5ms preprocess, 145.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  19%|█▉        | 68/350 [00:32<02:29,  1.88it/s][A


0: 384x640 4 persons, 1 skateboard, 156.1ms
Speed: 3.6ms preprocess, 156.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  20%|█▉        | 69/350 [00:32<02:27,  1.90it/s][A


0: 384x640 2 persons, 240.3ms
Speed: 3.9ms preprocess, 240.3ms inference, 3.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  20%|██        | 70/350 [00:33<02:25,  1.93it/s][A


0: 384x640 2 persons, 232.2ms
Speed: 3.6ms preprocess, 232.2ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  20%|██        | 71/350 [00:33<02:21,  1.97it/s][A


0: 384x640 2 persons, 229.6ms
Speed: 3.7ms preprocess, 229.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  21%|██        | 72/350 [00:34<02:18,  2.01it/s][A


0: 384x640 2 persons, 218.2ms
Speed: 3.8ms preprocess, 218.2ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  21%|██        | 73/350 [00:34<02:15,  2.05it/s][A


0: 384x640 2 persons, 228.5ms
Speed: 4.9ms preprocess, 228.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  21%|██        | 74/350 [00:35<02:14,  2.06it/s][A


0: 384x640 2 persons, 249.8ms
Speed: 4.2ms preprocess, 249.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  21%|██▏       | 75/350 [00:35<02:16,  2.01it/s][A


0: 384x640 2 persons, 240.8ms
Speed: 4.4ms preprocess, 240.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  22%|██▏       | 76/350 [00:36<02:16,  2.01it/s][A


0: 384x640 3 persons, 246.4ms
Speed: 3.8ms preprocess, 246.4ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  22%|██▏       | 77/350 [00:36<02:18,  1.98it/s][A


0: 384x640 2 persons, 153.6ms
Speed: 4.8ms preprocess, 153.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  22%|██▏       | 78/350 [00:37<02:02,  2.22it/s][A


0: 384x640 4 persons, 141.7ms
Speed: 3.6ms preprocess, 141.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  23%|██▎       | 79/350 [00:37<02:00,  2.25it/s][A


0: 384x640 4 persons, 153.6ms
Speed: 4.5ms preprocess, 153.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  23%|██▎       | 80/350 [00:38<02:00,  2.24it/s][A


0: 384x640 4 persons, 183.5ms
Speed: 3.8ms preprocess, 183.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  23%|██▎       | 81/350 [00:38<02:02,  2.20it/s][A


0: 384x640 3 persons, 1 skateboard, 153.8ms
Speed: 4.2ms preprocess, 153.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  23%|██▎       | 82/350 [00:38<02:01,  2.20it/s][A


0: 384x640 4 persons, 1 skateboard, 177.0ms
Speed: 3.8ms preprocess, 177.0ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  24%|██▎       | 83/350 [00:39<02:06,  2.11it/s][A


0: 384x640 4 persons, 1 skateboard, 165.3ms
Speed: 3.7ms preprocess, 165.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  24%|██▍       | 84/350 [00:39<02:09,  2.06it/s][A


0: 384x640 5 persons, 1 skateboard, 171.3ms
Speed: 9.8ms preprocess, 171.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  24%|██▍       | 85/350 [00:40<02:18,  1.91it/s][A


0: 384x640 4 persons, 1 skateboard, 189.6ms
Speed: 4.7ms preprocess, 189.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  25%|██▍       | 86/350 [00:41<02:20,  1.89it/s][A


0: 384x640 3 persons, 173.5ms
Speed: 10.0ms preprocess, 173.5ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  25%|██▍       | 87/350 [00:41<02:10,  2.01it/s][A


0: 384x640 2 persons, 191.0ms
Speed: 4.0ms preprocess, 191.0ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  25%|██▌       | 88/350 [00:41<02:00,  2.17it/s][A


0: 384x640 2 persons, 155.5ms
Speed: 3.9ms preprocess, 155.5ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  25%|██▌       | 89/350 [00:42<01:49,  2.39it/s][A


0: 384x640 3 persons, 156.2ms
Speed: 3.9ms preprocess, 156.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  26%|██▌       | 90/350 [00:42<01:46,  2.44it/s][A


0: 384x640 3 persons, 1 skateboard, 182.5ms
Speed: 11.4ms preprocess, 182.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  26%|██▌       | 91/350 [00:43<01:51,  2.33it/s][A


0: 384x640 3 persons, 150.2ms
Speed: 4.8ms preprocess, 150.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  26%|██▋       | 92/350 [00:43<01:46,  2.43it/s][A


0: 384x640 4 persons, 141.8ms
Speed: 6.4ms preprocess, 141.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  27%|██▋       | 93/350 [00:43<01:49,  2.35it/s][A


0: 384x640 4 persons, 156.1ms
Speed: 6.4ms preprocess, 156.1ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  27%|██▋       | 94/350 [00:44<01:49,  2.33it/s][A


0: 384x640 3 persons, 154.9ms
Speed: 7.0ms preprocess, 154.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  27%|██▋       | 95/350 [00:44<01:48,  2.35it/s][A


0: 384x640 3 persons, 180.9ms
Speed: 4.6ms preprocess, 180.9ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  27%|██▋       | 96/350 [00:45<01:47,  2.36it/s][A


0: 384x640 3 persons, 159.7ms
Speed: 6.2ms preprocess, 159.7ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  28%|██▊       | 97/350 [00:45<01:45,  2.40it/s][A


0: 384x640 4 persons, 220.0ms
Speed: 5.4ms preprocess, 220.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  28%|██▊       | 98/350 [00:46<01:52,  2.25it/s][A


0: 384x640 3 persons, 191.0ms
Speed: 3.5ms preprocess, 191.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  28%|██▊       | 99/350 [00:46<01:49,  2.28it/s][A


0: 384x640 4 persons, 172.2ms
Speed: 5.8ms preprocess, 172.2ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  29%|██▊       | 100/350 [00:47<02:02,  2.04it/s][A


0: 384x640 4 persons, 226.1ms
Speed: 4.9ms preprocess, 226.1ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  29%|██▉       | 101/350 [00:47<02:14,  1.86it/s][A


0: 384x640 3 persons, 239.6ms
Speed: 3.6ms preprocess, 239.6ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  29%|██▉       | 102/350 [00:48<02:16,  1.82it/s][A


0: 384x640 3 persons, 233.8ms
Speed: 6.6ms preprocess, 233.8ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  29%|██▉       | 103/350 [00:49<02:20,  1.76it/s][A


0: 384x640 3 persons, 232.1ms
Speed: 3.9ms preprocess, 232.1ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  30%|██▉       | 104/350 [00:49<02:20,  1.75it/s][A


0: 384x640 3 persons, 262.8ms
Speed: 3.6ms preprocess, 262.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  30%|███       | 105/350 [00:50<02:21,  1.73it/s][A


0: 384x640 5 persons, 168.2ms
Speed: 8.0ms preprocess, 168.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  30%|███       | 106/350 [00:50<02:16,  1.79it/s][A


0: 384x640 3 persons, 191.3ms
Speed: 4.2ms preprocess, 191.3ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  31%|███       | 107/350 [00:51<02:06,  1.93it/s][A


0: 384x640 5 persons, 187.2ms
Speed: 3.7ms preprocess, 187.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  31%|███       | 108/350 [00:51<02:05,  1.93it/s][A


0: 384x640 5 persons, 177.3ms
Speed: 6.4ms preprocess, 177.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  31%|███       | 109/350 [00:52<02:04,  1.94it/s][A


0: 384x640 4 persons, 163.6ms
Speed: 3.7ms preprocess, 163.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  31%|███▏      | 110/350 [00:52<01:58,  2.03it/s][A


0: 384x640 5 persons, 162.1ms
Speed: 3.8ms preprocess, 162.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  32%|███▏      | 111/350 [00:53<01:57,  2.03it/s][A


0: 384x640 3 persons, 158.3ms
Speed: 4.2ms preprocess, 158.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  32%|███▏      | 112/350 [00:53<01:49,  2.18it/s][A


0: 384x640 4 persons, 155.6ms
Speed: 6.1ms preprocess, 155.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  32%|███▏      | 113/350 [00:53<01:48,  2.18it/s][A


0: 384x640 4 persons, 189.3ms
Speed: 3.7ms preprocess, 189.3ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  33%|███▎      | 114/350 [00:54<01:50,  2.14it/s][A


0: 384x640 3 persons, 154.4ms
Speed: 4.7ms preprocess, 154.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  33%|███▎      | 115/350 [00:54<01:45,  2.23it/s][A


0: 384x640 2 persons, 160.4ms
Speed: 5.5ms preprocess, 160.4ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  33%|███▎      | 116/350 [00:55<01:36,  2.42it/s][A


0: 384x640 2 persons, 166.2ms
Speed: 4.0ms preprocess, 166.2ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  33%|███▎      | 117/350 [00:55<01:31,  2.55it/s][A


0: 384x640 2 persons, 154.2ms
Speed: 4.2ms preprocess, 154.2ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  34%|███▎      | 118/350 [00:55<01:28,  2.63it/s][A


0: 384x640 3 persons, 159.7ms
Speed: 4.9ms preprocess, 159.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  34%|███▍      | 119/350 [00:56<01:28,  2.62it/s][A


0: 384x640 3 persons, 181.0ms
Speed: 10.9ms preprocess, 181.0ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  34%|███▍      | 120/350 [00:56<01:31,  2.51it/s][A


0: 384x640 3 persons, 197.4ms
Speed: 4.2ms preprocess, 197.4ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  35%|███▍      | 121/350 [00:57<01:34,  2.42it/s][A


0: 384x640 3 persons, 185.3ms
Speed: 3.7ms preprocess, 185.3ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  35%|███▍      | 122/350 [00:57<01:34,  2.41it/s][A


0: 384x640 4 persons, 150.5ms
Speed: 4.0ms preprocess, 150.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  35%|███▌      | 123/350 [00:57<01:37,  2.33it/s][A


0: 384x640 2 persons, 188.1ms
Speed: 3.7ms preprocess, 188.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  35%|███▌      | 124/350 [00:58<01:31,  2.46it/s][A


0: 384x640 3 persons, 168.4ms
Speed: 4.6ms preprocess, 168.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  36%|███▌      | 125/350 [00:58<01:30,  2.49it/s][A


0: 384x640 3 persons, 163.4ms
Speed: 3.7ms preprocess, 163.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  36%|███▌      | 126/350 [00:59<01:29,  2.51it/s][A


0: 384x640 3 persons, 170.7ms
Speed: 4.3ms preprocess, 170.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  36%|███▋      | 127/350 [00:59<01:29,  2.48it/s][A


0: 384x640 3 persons, 153.4ms
Speed: 3.7ms preprocess, 153.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  37%|███▋      | 128/350 [00:59<01:28,  2.51it/s][A


0: 384x640 3 persons, 160.9ms
Speed: 4.5ms preprocess, 160.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  37%|███▋      | 129/350 [01:00<01:28,  2.49it/s][A


0: 384x640 4 persons, 249.6ms
Speed: 4.2ms preprocess, 249.6ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  37%|███▋      | 130/350 [01:00<01:44,  2.10it/s][A


0: 384x640 3 persons, 236.5ms
Speed: 4.6ms preprocess, 236.5ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  37%|███▋      | 131/350 [01:01<01:48,  2.01it/s][A


0: 384x640 3 persons, 239.0ms
Speed: 3.7ms preprocess, 239.0ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  38%|███▊      | 132/350 [01:02<01:52,  1.93it/s][A


0: 384x640 3 persons, 238.8ms
Speed: 4.9ms preprocess, 238.8ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  38%|███▊      | 133/350 [01:02<01:54,  1.89it/s][A


0: 384x640 3 persons, 245.4ms
Speed: 4.0ms preprocess, 245.4ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  38%|███▊      | 134/350 [01:03<01:58,  1.83it/s][A


0: 384x640 3 persons, 227.5ms
Speed: 4.3ms preprocess, 227.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  39%|███▊      | 135/350 [01:03<01:58,  1.81it/s][A


0: 384x640 3 persons, 220.5ms
Speed: 9.7ms preprocess, 220.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  39%|███▉      | 136/350 [01:04<01:51,  1.92it/s][A


0: 384x640 2 persons, 150.8ms
Speed: 4.4ms preprocess, 150.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  39%|███▉      | 137/350 [01:04<01:38,  2.17it/s][A


0: 384x640 2 persons, 169.3ms
Speed: 3.5ms preprocess, 169.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  39%|███▉      | 138/350 [01:04<01:31,  2.31it/s][A


0: 384x640 2 persons, 149.8ms
Speed: 6.8ms preprocess, 149.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  40%|███▉      | 139/350 [01:05<01:25,  2.48it/s][A


0: 384x640 2 persons, 151.9ms
Speed: 4.5ms preprocess, 151.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  40%|████      | 140/350 [01:05<01:19,  2.64it/s][A


0: 384x640 3 persons, 163.2ms
Speed: 3.7ms preprocess, 163.2ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  40%|████      | 141/350 [01:06<01:21,  2.55it/s][A


0: 384x640 2 persons, 199.2ms
Speed: 3.8ms preprocess, 199.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  41%|████      | 142/350 [01:06<01:20,  2.58it/s][A


0: 384x640 2 persons, 156.1ms
Speed: 4.3ms preprocess, 156.1ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  41%|████      | 143/350 [01:06<01:17,  2.67it/s][A


0: 384x640 3 persons, 201.0ms
Speed: 3.8ms preprocess, 201.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  41%|████      | 144/350 [01:07<01:20,  2.56it/s][A


0: 384x640 3 persons, 177.1ms
Speed: 3.6ms preprocess, 177.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  41%|████▏     | 145/350 [01:07<01:21,  2.52it/s][A


0: 384x640 3 persons, 197.3ms
Speed: 4.3ms preprocess, 197.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  42%|████▏     | 146/350 [01:08<01:22,  2.46it/s][A


0: 384x640 3 persons, 160.9ms
Speed: 3.5ms preprocess, 160.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  42%|████▏     | 147/350 [01:08<01:20,  2.52it/s][A


0: 384x640 4 persons, 155.6ms
Speed: 7.0ms preprocess, 155.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  42%|████▏     | 148/350 [01:08<01:23,  2.43it/s][A


0: 384x640 3 persons, 154.6ms
Speed: 4.3ms preprocess, 154.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  43%|████▎     | 149/350 [01:09<01:21,  2.47it/s][A


0: 384x640 3 persons, 158.3ms
Speed: 3.7ms preprocess, 158.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  43%|████▎     | 150/350 [01:09<01:19,  2.52it/s][A


0: 384x640 4 persons, 166.1ms
Speed: 3.7ms preprocess, 166.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  43%|████▎     | 151/350 [01:10<01:22,  2.41it/s][A


0: 384x640 4 persons, 178.7ms
Speed: 9.5ms preprocess, 178.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  43%|████▎     | 152/350 [01:10<01:25,  2.32it/s][A


0: 384x640 3 persons, 156.9ms
Speed: 3.7ms preprocess, 156.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  44%|████▎     | 153/350 [01:10<01:22,  2.37it/s][A


0: 384x640 3 persons, 149.5ms
Speed: 6.4ms preprocess, 149.5ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  44%|████▍     | 154/350 [01:11<01:20,  2.43it/s][A


0: 384x640 3 persons, 152.2ms
Speed: 4.2ms preprocess, 152.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  44%|████▍     | 155/350 [01:11<01:18,  2.48it/s][A


0: 384x640 3 persons, 163.5ms
Speed: 5.0ms preprocess, 163.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  45%|████▍     | 156/350 [01:12<01:17,  2.49it/s][A


0: 384x640 2 persons, 164.8ms
Speed: 5.0ms preprocess, 164.8ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  45%|████▍     | 157/350 [01:12<01:14,  2.60it/s][A


0: 384x640 2 persons, 151.9ms
Speed: 7.6ms preprocess, 151.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  45%|████▌     | 158/350 [01:12<01:11,  2.69it/s][A


0: 384x640 2 persons, 157.7ms
Speed: 4.7ms preprocess, 157.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  45%|████▌     | 159/350 [01:13<01:08,  2.78it/s][A


0: 384x640 2 persons, 166.2ms
Speed: 3.8ms preprocess, 166.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  46%|████▌     | 160/350 [01:13<01:07,  2.83it/s][A


0: 384x640 2 persons, 157.0ms
Speed: 3.8ms preprocess, 157.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  46%|████▌     | 161/350 [01:13<01:06,  2.85it/s][A


0: 384x640 2 persons, 162.3ms
Speed: 3.6ms preprocess, 162.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  46%|████▋     | 162/350 [01:14<01:07,  2.79it/s][A


0: 384x640 2 persons, 246.8ms
Speed: 3.5ms preprocess, 246.8ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  47%|████▋     | 163/350 [01:14<01:13,  2.55it/s][A


0: 384x640 2 persons, 237.7ms
Speed: 4.4ms preprocess, 237.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  47%|████▋     | 164/350 [01:15<01:17,  2.41it/s][A


0: 384x640 2 persons, 234.6ms
Speed: 3.6ms preprocess, 234.6ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  47%|████▋     | 165/350 [01:15<01:19,  2.33it/s][A


0: 384x640 2 persons, 227.4ms
Speed: 3.8ms preprocess, 227.4ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  47%|████▋     | 166/350 [01:16<01:22,  2.23it/s][A


0: 384x640 2 persons, 239.1ms
Speed: 3.9ms preprocess, 239.1ms inference, 4.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  48%|████▊     | 167/350 [01:16<01:25,  2.15it/s][A


0: 384x640 2 persons, 231.7ms
Speed: 4.0ms preprocess, 231.7ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  48%|████▊     | 168/350 [01:17<01:25,  2.13it/s][A


0: 384x640 2 persons, 231.8ms
Speed: 4.9ms preprocess, 231.8ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  48%|████▊     | 169/350 [01:17<01:26,  2.08it/s][A


0: 384x640 2 persons, 234.4ms
Speed: 3.8ms preprocess, 234.4ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  49%|████▊     | 170/350 [01:18<01:26,  2.07it/s][A


0: 384x640 2 persons, 200.6ms
Speed: 7.6ms preprocess, 200.6ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  49%|████▉     | 171/350 [01:18<01:20,  2.22it/s][A


0: 384x640 2 persons, 171.7ms
Speed: 3.7ms preprocess, 171.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  49%|████▉     | 172/350 [01:18<01:14,  2.38it/s][A


0: 384x640 2 persons, 171.6ms
Speed: 4.8ms preprocess, 171.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  49%|████▉     | 173/350 [01:19<01:09,  2.53it/s][A


0: 384x640 2 persons, 148.7ms
Speed: 3.7ms preprocess, 148.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  50%|████▉     | 174/350 [01:19<01:05,  2.69it/s][A


0: 384x640 2 persons, 159.4ms
Speed: 3.8ms preprocess, 159.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  50%|█████     | 175/350 [01:19<01:03,  2.75it/s][A


0: 384x640 4 persons, 164.7ms
Speed: 4.8ms preprocess, 164.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  50%|█████     | 176/350 [01:20<01:07,  2.56it/s][A


0: 384x640 2 persons, 186.5ms
Speed: 8.5ms preprocess, 186.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  51%|█████     | 177/350 [01:20<01:07,  2.56it/s][A


0: 384x640 3 persons, 195.6ms
Speed: 5.4ms preprocess, 195.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  51%|█████     | 178/350 [01:21<01:08,  2.50it/s][A


0: 384x640 3 persons, 183.0ms
Speed: 3.7ms preprocess, 183.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  51%|█████     | 179/350 [01:21<01:08,  2.48it/s][A


0: 384x640 3 persons, 167.6ms
Speed: 4.4ms preprocess, 167.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  51%|█████▏    | 180/350 [01:21<01:09,  2.45it/s][A


0: 384x640 3 persons, 153.9ms
Speed: 3.5ms preprocess, 153.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  52%|█████▏    | 181/350 [01:22<01:07,  2.49it/s][A


0: 384x640 3 persons, 163.7ms
Speed: 5.1ms preprocess, 163.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  52%|█████▏    | 182/350 [01:22<01:07,  2.49it/s][A


0: 384x640 2 persons, 177.5ms
Speed: 3.5ms preprocess, 177.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  52%|█████▏    | 183/350 [01:23<01:05,  2.57it/s][A


0: 384x640 3 persons, 165.0ms
Speed: 4.1ms preprocess, 165.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  53%|█████▎    | 184/350 [01:23<01:04,  2.58it/s][A


0: 384x640 3 persons, 185.6ms
Speed: 12.1ms preprocess, 185.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  53%|█████▎    | 185/350 [01:23<01:07,  2.44it/s][A


0: 384x640 3 persons, 154.0ms
Speed: 3.9ms preprocess, 154.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  53%|█████▎    | 186/350 [01:24<01:05,  2.51it/s][A


0: 384x640 3 persons, 156.6ms
Speed: 3.5ms preprocess, 156.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  53%|█████▎    | 187/350 [01:24<01:03,  2.55it/s][A


0: 384x640 3 persons, 177.4ms
Speed: 3.8ms preprocess, 177.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  54%|█████▎    | 188/350 [01:25<01:04,  2.50it/s][A


0: 384x640 3 persons, 158.2ms
Speed: 3.9ms preprocess, 158.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  54%|█████▍    | 189/350 [01:25<01:03,  2.55it/s][A


0: 384x640 3 persons, 158.1ms
Speed: 3.6ms preprocess, 158.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  54%|█████▍    | 190/350 [01:25<01:03,  2.53it/s][A


0: 384x640 3 persons, 153.1ms
Speed: 4.0ms preprocess, 153.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  55%|█████▍    | 191/350 [01:26<01:01,  2.58it/s][A


0: 384x640 3 persons, 150.9ms
Speed: 3.7ms preprocess, 150.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  55%|█████▍    | 192/350 [01:26<01:00,  2.61it/s][A


0: 384x640 3 persons, 154.0ms
Speed: 4.9ms preprocess, 154.0ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  55%|█████▌    | 193/350 [01:26<01:01,  2.53it/s][A


0: 384x640 3 persons, 149.9ms
Speed: 3.6ms preprocess, 149.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  55%|█████▌    | 194/350 [01:27<01:00,  2.58it/s][A


0: 384x640 3 persons, 155.1ms
Speed: 5.4ms preprocess, 155.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  56%|█████▌    | 195/350 [01:27<00:59,  2.61it/s][A


0: 384x640 3 persons, 187.8ms
Speed: 3.6ms preprocess, 187.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  56%|█████▌    | 196/350 [01:28<01:00,  2.55it/s][A


0: 384x640 3 persons, 217.0ms
Speed: 3.5ms preprocess, 217.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  56%|█████▋    | 197/350 [01:28<01:07,  2.28it/s][A


0: 384x640 3 persons, 238.2ms
Speed: 8.9ms preprocess, 238.2ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  57%|█████▋    | 198/350 [01:29<01:12,  2.10it/s][A


0: 384x640 3 persons, 230.5ms
Speed: 3.6ms preprocess, 230.5ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  57%|█████▋    | 199/350 [01:29<01:14,  2.02it/s][A


0: 384x640 3 persons, 247.4ms
Speed: 4.5ms preprocess, 247.4ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  57%|█████▋    | 200/350 [01:30<01:18,  1.92it/s][A


0: 384x640 3 persons, 245.6ms
Speed: 3.7ms preprocess, 245.6ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  57%|█████▋    | 201/350 [01:30<01:19,  1.87it/s][A


0: 384x640 3 persons, 248.5ms
Speed: 7.8ms preprocess, 248.5ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  58%|█████▊    | 202/350 [01:31<01:21,  1.81it/s][A


0: 384x640 3 persons, 236.5ms
Speed: 5.5ms preprocess, 236.5ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  58%|█████▊    | 203/350 [01:32<01:19,  1.86it/s][A


0: 384x640 3 persons, 168.4ms
Speed: 4.6ms preprocess, 168.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  58%|█████▊    | 204/350 [01:32<01:12,  2.03it/s][A


0: 384x640 3 persons, 164.4ms
Speed: 3.8ms preprocess, 164.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  59%|█████▊    | 205/350 [01:32<01:07,  2.14it/s][A


0: 384x640 3 persons, 154.5ms
Speed: 6.7ms preprocess, 154.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  59%|█████▉    | 206/350 [01:33<01:04,  2.24it/s][A


0: 384x640 3 persons, 153.5ms
Speed: 3.8ms preprocess, 153.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  59%|█████▉    | 207/350 [01:33<01:00,  2.35it/s][A


0: 384x640 3 persons, 184.4ms
Speed: 4.3ms preprocess, 184.4ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  59%|█████▉    | 208/350 [01:34<00:59,  2.37it/s][A


0: 384x640 3 persons, 160.8ms
Speed: 6.1ms preprocess, 160.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  60%|█████▉    | 209/350 [01:34<00:58,  2.42it/s][A


0: 384x640 3 persons, 150.1ms
Speed: 9.9ms preprocess, 150.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  60%|██████    | 210/350 [01:34<00:56,  2.46it/s][A


0: 384x640 3 persons, 161.4ms
Speed: 3.6ms preprocess, 161.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  60%|██████    | 211/350 [01:35<00:55,  2.49it/s][A


0: 384x640 3 persons, 169.3ms
Speed: 4.2ms preprocess, 169.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  61%|██████    | 212/350 [01:35<00:55,  2.50it/s][A


0: 384x640 3 persons, 168.1ms
Speed: 4.2ms preprocess, 168.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  61%|██████    | 213/350 [01:35<00:54,  2.51it/s][A


0: 384x640 4 persons, 160.3ms
Speed: 3.3ms preprocess, 160.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  61%|██████    | 214/350 [01:36<00:56,  2.42it/s][A


0: 384x640 4 persons, 182.9ms
Speed: 6.7ms preprocess, 182.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  61%|██████▏   | 215/350 [01:36<00:58,  2.29it/s][A


0: 384x640 4 persons, 149.3ms
Speed: 4.9ms preprocess, 149.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  62%|██████▏   | 216/350 [01:37<00:58,  2.29it/s][A


0: 384x640 3 persons, 185.8ms
Speed: 3.6ms preprocess, 185.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  62%|██████▏   | 217/350 [01:37<00:57,  2.30it/s][A


0: 384x640 3 persons, 163.4ms
Speed: 3.4ms preprocess, 163.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  62%|██████▏   | 218/350 [01:38<00:55,  2.39it/s][A


0: 384x640 4 persons, 158.4ms
Speed: 3.8ms preprocess, 158.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  63%|██████▎   | 219/350 [01:38<00:55,  2.35it/s][A


0: 384x640 4 persons, 1 skateboard, 212.8ms
Speed: 5.0ms preprocess, 212.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  63%|██████▎   | 220/350 [01:39<01:00,  2.15it/s][A


0: 384x640 4 persons, 150.5ms
Speed: 3.8ms preprocess, 150.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  63%|██████▎   | 221/350 [01:39<00:58,  2.19it/s][A


0: 384x640 4 persons, 172.1ms
Speed: 3.8ms preprocess, 172.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  63%|██████▎   | 222/350 [01:40<00:57,  2.21it/s][A


0: 384x640 3 persons, 1 skateboard, 158.5ms
Speed: 4.9ms preprocess, 158.5ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  64%|██████▎   | 223/350 [01:40<00:57,  2.22it/s][A


0: 384x640 2 persons, 1 skateboard, 144.8ms
Speed: 8.2ms preprocess, 144.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  64%|██████▍   | 224/350 [01:40<00:54,  2.30it/s][A


0: 384x640 3 persons, 170.7ms
Speed: 8.7ms preprocess, 170.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  64%|██████▍   | 225/350 [01:41<00:53,  2.34it/s][A


0: 384x640 2 persons, 194.9ms
Speed: 3.7ms preprocess, 194.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  65%|██████▍   | 226/350 [01:41<00:50,  2.46it/s][A


0: 384x640 2 persons, 169.7ms
Speed: 3.9ms preprocess, 169.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  65%|██████▍   | 227/350 [01:42<00:50,  2.44it/s][A


0: 384x640 2 persons, 228.9ms
Speed: 4.3ms preprocess, 228.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  65%|██████▌   | 228/350 [01:42<00:53,  2.29it/s][A


0: 384x640 3 persons, 261.7ms
Speed: 3.8ms preprocess, 261.7ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  65%|██████▌   | 229/350 [01:43<00:58,  2.08it/s][A


0: 384x640 2 persons, 245.9ms
Speed: 8.8ms preprocess, 245.9ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  66%|██████▌   | 230/350 [01:43<00:58,  2.04it/s][A


0: 384x640 2 persons, 2 skateboards, 255.4ms
Speed: 6.4ms preprocess, 255.4ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  66%|██████▌   | 231/350 [01:44<01:04,  1.84it/s][A


0: 384x640 2 persons, 1 skateboard, 268.3ms
Speed: 3.7ms preprocess, 268.3ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  66%|██████▋   | 232/350 [01:44<01:07,  1.75it/s][A


0: 384x640 2 persons, 1 skateboard, 278.1ms
Speed: 8.5ms preprocess, 278.1ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  67%|██████▋   | 233/350 [01:45<01:05,  1.78it/s][A


0: 384x640 2 persons, 199.2ms
Speed: 6.2ms preprocess, 199.2ms inference, 2.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  67%|██████▋   | 234/350 [01:45<00:59,  1.94it/s][A


0: 384x640 2 persons, 153.3ms
Speed: 3.6ms preprocess, 153.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  67%|██████▋   | 235/350 [01:46<00:52,  2.19it/s][A


0: 384x640 2 persons, 189.3ms
Speed: 8.5ms preprocess, 189.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  67%|██████▋   | 236/350 [01:46<00:49,  2.31it/s][A


0: 384x640 2 persons, 179.8ms
Speed: 6.6ms preprocess, 179.8ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  68%|██████▊   | 237/350 [01:46<00:46,  2.41it/s][A


0: 384x640 2 persons, 1 skateboard, 149.9ms
Speed: 6.5ms preprocess, 149.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  68%|██████▊   | 238/350 [01:47<00:45,  2.48it/s][A


0: 384x640 2 persons, 175.9ms
Speed: 3.5ms preprocess, 175.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  68%|██████▊   | 239/350 [01:47<00:43,  2.58it/s][A


0: 384x640 2 persons, 200.0ms
Speed: 4.8ms preprocess, 200.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  69%|██████▊   | 240/350 [01:48<00:42,  2.59it/s][A


0: 384x640 2 persons, 153.7ms
Speed: 4.2ms preprocess, 153.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  69%|██████▉   | 241/350 [01:48<00:39,  2.73it/s][A


0: 384x640 2 persons, 154.3ms
Speed: 4.1ms preprocess, 154.3ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  69%|██████▉   | 242/350 [01:48<00:38,  2.80it/s][A


0: 384x640 2 persons, 180.1ms
Speed: 4.0ms preprocess, 180.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  69%|██████▉   | 243/350 [01:49<00:37,  2.83it/s][A


0: 384x640 2 persons, 155.6ms
Speed: 4.2ms preprocess, 155.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  70%|██████▉   | 244/350 [01:49<00:36,  2.91it/s][A


0: 384x640 2 persons, 157.2ms
Speed: 4.2ms preprocess, 157.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  70%|███████   | 245/350 [01:49<00:35,  2.92it/s][A


0: 384x640 2 persons, 179.7ms
Speed: 3.7ms preprocess, 179.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  70%|███████   | 246/350 [01:50<00:35,  2.91it/s][A


0: 384x640 2 persons, 144.8ms
Speed: 3.8ms preprocess, 144.8ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  71%|███████   | 247/350 [01:50<00:34,  2.96it/s][A


0: 384x640 2 persons, 154.2ms
Speed: 3.8ms preprocess, 154.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  71%|███████   | 248/350 [01:50<00:34,  2.99it/s][A


0: 384x640 2 persons, 180.8ms
Speed: 6.8ms preprocess, 180.8ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  71%|███████   | 249/350 [01:51<00:34,  2.93it/s][A


0: 384x640 2 persons, 147.9ms
Speed: 4.2ms preprocess, 147.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  71%|███████▏  | 250/350 [01:51<00:33,  3.01it/s][A


0: 384x640 2 persons, 154.7ms
Speed: 3.7ms preprocess, 154.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  72%|███████▏  | 251/350 [01:51<00:32,  3.01it/s][A


0: 384x640 2 persons, 2 skateboards, 166.5ms
Speed: 6.5ms preprocess, 166.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  72%|███████▏  | 252/350 [01:52<00:36,  2.70it/s][A


0: 384x640 2 persons, 172.0ms
Speed: 6.7ms preprocess, 172.0ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  72%|███████▏  | 253/350 [01:52<00:35,  2.75it/s][A


0: 384x640 2 persons, 174.1ms
Speed: 4.2ms preprocess, 174.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  73%|███████▎  | 254/350 [01:52<00:34,  2.77it/s][A


0: 384x640 2 persons, 167.8ms
Speed: 7.7ms preprocess, 167.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  73%|███████▎  | 255/350 [01:53<00:33,  2.83it/s][A


0: 384x640 2 persons, 160.7ms
Speed: 3.7ms preprocess, 160.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  73%|███████▎  | 256/350 [01:53<00:32,  2.89it/s][A


0: 384x640 2 persons, 172.2ms
Speed: 4.9ms preprocess, 172.2ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  73%|███████▎  | 257/350 [01:53<00:32,  2.86it/s][A


0: 384x640 2 persons, 165.4ms
Speed: 4.1ms preprocess, 165.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  74%|███████▎  | 258/350 [01:54<00:31,  2.90it/s][A


0: 384x640 2 persons, 150.6ms
Speed: 3.6ms preprocess, 150.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  74%|███████▍  | 259/350 [01:54<00:30,  2.98it/s][A


0: 384x640 2 persons, 177.7ms
Speed: 4.4ms preprocess, 177.7ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  74%|███████▍  | 260/350 [01:54<00:30,  2.95it/s][A


0: 384x640 2 persons, 168.3ms
Speed: 3.7ms preprocess, 168.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  75%|███████▍  | 261/350 [01:55<00:30,  2.95it/s][A


0: 384x640 2 persons, 207.3ms
Speed: 8.3ms preprocess, 207.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  75%|███████▍  | 262/350 [01:55<00:33,  2.65it/s][A


0: 384x640 2 persons, 1 skateboard, 243.2ms
Speed: 7.7ms preprocess, 243.2ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  75%|███████▌  | 263/350 [01:56<00:38,  2.29it/s][A


0: 384x640 2 persons, 243.5ms
Speed: 6.8ms preprocess, 243.5ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  75%|███████▌  | 264/350 [01:56<00:39,  2.18it/s][A


0: 384x640 2 persons, 242.9ms
Speed: 4.6ms preprocess, 242.9ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  76%|███████▌  | 265/350 [01:57<00:40,  2.10it/s][A


0: 384x640 2 persons, 226.6ms
Speed: 4.0ms preprocess, 226.6ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  76%|███████▌  | 266/350 [01:57<00:40,  2.09it/s][A


0: 384x640 2 persons, 245.5ms
Speed: 5.9ms preprocess, 245.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  76%|███████▋  | 267/350 [01:58<00:40,  2.05it/s][A


0: 384x640 2 persons, 234.6ms
Speed: 3.7ms preprocess, 234.6ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  77%|███████▋  | 268/350 [01:58<00:40,  2.04it/s][A


0: 384x640 2 persons, 1 skateboard, 229.4ms
Speed: 5.0ms preprocess, 229.4ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  77%|███████▋  | 269/350 [01:59<00:38,  2.09it/s][A


0: 384x640 2 persons, 170.3ms
Speed: 4.5ms preprocess, 170.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  77%|███████▋  | 270/350 [01:59<00:34,  2.30it/s][A


0: 384x640 2 persons, 1 skateboard, 173.5ms
Speed: 4.3ms preprocess, 173.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  77%|███████▋  | 271/350 [02:00<00:33,  2.35it/s][A


0: 384x640 2 persons, 155.8ms
Speed: 3.6ms preprocess, 155.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  78%|███████▊  | 272/350 [02:00<00:31,  2.51it/s][A


0: 384x640 2 persons, 164.1ms
Speed: 7.0ms preprocess, 164.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  78%|███████▊  | 273/350 [02:00<00:29,  2.65it/s][A


0: 384x640 2 persons, 179.2ms
Speed: 3.6ms preprocess, 179.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  78%|███████▊  | 274/350 [02:01<00:28,  2.71it/s][A


0: 384x640 2 persons, 151.0ms
Speed: 3.8ms preprocess, 151.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  79%|███████▊  | 275/350 [02:01<00:26,  2.82it/s][A


0: 384x640 2 persons, 163.0ms
Speed: 3.7ms preprocess, 163.0ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  79%|███████▉  | 276/350 [02:01<00:25,  2.89it/s][A


0: 384x640 2 persons, 178.3ms
Speed: 3.9ms preprocess, 178.3ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  79%|███████▉  | 277/350 [02:02<00:25,  2.87it/s][A


0: 384x640 2 persons, 174.1ms
Speed: 3.5ms preprocess, 174.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  79%|███████▉  | 278/350 [02:02<00:24,  2.90it/s][A


0: 384x640 2 persons, 159.1ms
Speed: 6.7ms preprocess, 159.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  80%|███████▉  | 279/350 [02:02<00:24,  2.93it/s][A


0: 384x640 2 persons, 183.5ms
Speed: 4.0ms preprocess, 183.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  80%|████████  | 280/350 [02:03<00:24,  2.90it/s][A


0: 384x640 2 persons, 152.9ms
Speed: 3.9ms preprocess, 152.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  80%|████████  | 281/350 [02:03<00:23,  2.98it/s][A


0: 384x640 2 persons, 160.0ms
Speed: 4.2ms preprocess, 160.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  81%|████████  | 282/350 [02:03<00:22,  2.99it/s][A


0: 384x640 2 persons, 179.6ms
Speed: 3.7ms preprocess, 179.6ms inference, 2.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  81%|████████  | 283/350 [02:04<00:22,  2.94it/s][A


0: 384x640 2 persons, 161.4ms
Speed: 3.6ms preprocess, 161.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  81%|████████  | 284/350 [02:04<00:22,  2.95it/s][A


0: 384x640 2 persons, 165.0ms
Speed: 4.9ms preprocess, 165.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  81%|████████▏ | 285/350 [02:04<00:22,  2.92it/s][A


0: 384x640 2 persons, 185.7ms
Speed: 4.3ms preprocess, 185.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  82%|████████▏ | 286/350 [02:05<00:22,  2.88it/s][A


0: 384x640 2 persons, 159.4ms
Speed: 3.7ms preprocess, 159.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  82%|████████▏ | 287/350 [02:05<00:21,  2.93it/s][A


0: 384x640 2 persons, 170.2ms
Speed: 3.7ms preprocess, 170.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  82%|████████▏ | 288/350 [02:05<00:21,  2.92it/s][A


0: 384x640 2 persons, 164.0ms
Speed: 7.6ms preprocess, 164.0ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  83%|████████▎ | 289/350 [02:06<00:20,  2.94it/s][A


0: 384x640 2 persons, 153.8ms
Speed: 3.7ms preprocess, 153.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  83%|████████▎ | 290/350 [02:06<00:19,  3.01it/s][A


0: 384x640 2 persons, 155.8ms
Speed: 3.7ms preprocess, 155.8ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  83%|████████▎ | 291/350 [02:06<00:19,  3.00it/s][A


0: 384x640 2 persons, 181.2ms
Speed: 5.8ms preprocess, 181.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  83%|████████▎ | 292/350 [02:07<00:19,  2.93it/s][A


0: 384x640 2 persons, 152.5ms
Speed: 3.6ms preprocess, 152.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  84%|████████▎ | 293/350 [02:07<00:18,  3.00it/s][A


0: 384x640 2 persons, 150.4ms
Speed: 4.3ms preprocess, 150.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  84%|████████▍ | 294/350 [02:07<00:18,  2.97it/s][A


0: 384x640 2 persons, 165.4ms
Speed: 3.7ms preprocess, 165.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  84%|████████▍ | 295/350 [02:08<00:18,  2.98it/s][A


0: 384x640 2 persons, 151.4ms
Speed: 3.3ms preprocess, 151.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  85%|████████▍ | 296/350 [02:08<00:17,  3.00it/s][A


0: 384x640 2 persons, 152.2ms
Speed: 4.7ms preprocess, 152.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  85%|████████▍ | 297/350 [02:08<00:17,  2.97it/s][A


0: 384x640 2 persons, 175.2ms
Speed: 3.8ms preprocess, 175.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  85%|████████▌ | 298/350 [02:09<00:18,  2.88it/s][A


0: 384x640 2 persons, 234.2ms
Speed: 4.2ms preprocess, 234.2ms inference, 2.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  85%|████████▌ | 299/350 [02:09<00:19,  2.58it/s][A


0: 384x640 1 person, 260.0ms
Speed: 3.7ms preprocess, 260.0ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  86%|████████▌ | 300/350 [02:10<00:19,  2.53it/s][A


0: 384x640 1 person, 259.0ms
Speed: 3.7ms preprocess, 259.0ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  86%|████████▌ | 301/350 [02:10<00:19,  2.51it/s][A


0: 384x640 1 person, 254.8ms
Speed: 3.8ms preprocess, 254.8ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  86%|████████▋ | 302/350 [02:10<00:19,  2.45it/s][A


0: 384x640 1 person, 271.7ms
Speed: 3.7ms preprocess, 271.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  87%|████████▋ | 303/350 [02:11<00:19,  2.43it/s][A


0: 384x640 1 person, 257.1ms
Speed: 3.8ms preprocess, 257.1ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  87%|████████▋ | 304/350 [02:11<00:18,  2.45it/s][A


0: 384x640 1 person, 321.6ms
Speed: 9.1ms preprocess, 321.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  87%|████████▋ | 305/350 [02:12<00:19,  2.31it/s][A


0: 384x640 1 person, 285.3ms
Speed: 3.7ms preprocess, 285.3ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  87%|████████▋ | 306/350 [02:12<00:19,  2.26it/s][A


0: 384x640 1 person, 258.9ms
Speed: 8.4ms preprocess, 258.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  88%|████████▊ | 307/350 [02:13<00:18,  2.35it/s][A


0: 384x640 1 person, 183.5ms
Speed: 3.6ms preprocess, 183.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  88%|████████▊ | 308/350 [02:13<00:16,  2.61it/s][A


0: 384x640 1 person, 185.2ms
Speed: 3.5ms preprocess, 185.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  88%|████████▊ | 309/350 [02:13<00:14,  2.80it/s][A


0: 384x640 1 person, 213.7ms
Speed: 4.1ms preprocess, 213.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  89%|████████▊ | 310/350 [02:13<00:13,  2.89it/s][A


0: 384x640 1 person, 198.1ms
Speed: 4.0ms preprocess, 198.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  89%|████████▉ | 311/350 [02:14<00:13,  2.99it/s][A


0: 384x640 1 person, 180.8ms
Speed: 3.7ms preprocess, 180.8ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  89%|████████▉ | 312/350 [02:14<00:12,  3.12it/s][A


0: 384x640 1 person, 189.6ms
Speed: 3.6ms preprocess, 189.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  89%|████████▉ | 313/350 [02:14<00:11,  3.14it/s][A


0: 384x640 1 person, 187.2ms
Speed: 3.6ms preprocess, 187.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  90%|████████▉ | 314/350 [02:15<00:11,  3.13it/s][A


0: 384x640 1 person, 177.4ms
Speed: 3.7ms preprocess, 177.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  90%|█████████ | 315/350 [02:15<00:10,  3.22it/s][A


0: 384x640 1 person, 189.9ms
Speed: 5.4ms preprocess, 189.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  90%|█████████ | 316/350 [02:15<00:10,  3.20it/s][A


0: 384x640 1 person, 178.7ms
Speed: 10.1ms preprocess, 178.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  91%|█████████ | 317/350 [02:16<00:10,  3.23it/s][A


0: 384x640 1 person, 208.0ms
Speed: 6.2ms preprocess, 208.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  91%|█████████ | 318/350 [02:16<00:09,  3.23it/s][A


0: 384x640 1 person, 177.5ms
Speed: 3.8ms preprocess, 177.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  91%|█████████ | 319/350 [02:16<00:09,  3.31it/s][A


0: 384x640 1 person, 203.7ms
Speed: 5.6ms preprocess, 203.7ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  91%|█████████▏| 320/350 [02:17<00:09,  3.24it/s][A


0: 384x640 1 person, 195.5ms
Speed: 3.8ms preprocess, 195.5ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  92%|█████████▏| 321/350 [02:17<00:08,  3.26it/s][A


0: 384x640 1 person, 183.4ms
Speed: 6.0ms preprocess, 183.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  92%|█████████▏| 322/350 [02:17<00:08,  3.30it/s][A


0: 384x640 1 person, 205.4ms
Speed: 4.5ms preprocess, 205.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  92%|█████████▏| 323/350 [02:17<00:08,  3.26it/s][A


0: 384x640 1 person, 200.1ms
Speed: 3.9ms preprocess, 200.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  93%|█████████▎| 324/350 [02:18<00:08,  3.22it/s][A


0: 384x640 1 person, 188.5ms
Speed: 7.9ms preprocess, 188.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  93%|█████████▎| 325/350 [02:18<00:07,  3.22it/s][A


0: 384x640 1 person, 211.3ms
Speed: 3.6ms preprocess, 211.3ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  93%|█████████▎| 326/350 [02:18<00:07,  3.14it/s][A


0: 384x640 1 person, 206.7ms
Speed: 3.6ms preprocess, 206.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  93%|█████████▎| 327/350 [02:19<00:07,  3.16it/s][A


0: 384x640 1 person, 185.4ms
Speed: 6.6ms preprocess, 185.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  94%|█████████▎| 328/350 [02:19<00:06,  3.20it/s][A


0: 384x640 1 person, 181.8ms
Speed: 3.7ms preprocess, 181.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  94%|█████████▍| 329/350 [02:19<00:06,  3.23it/s][A


0: 384x640 1 person, 189.9ms
Speed: 9.5ms preprocess, 189.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  94%|█████████▍| 330/350 [02:20<00:06,  3.21it/s][A


0: 384x640 1 person, 206.2ms
Speed: 3.6ms preprocess, 206.2ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  95%|█████████▍| 331/350 [02:20<00:05,  3.20it/s][A


0: 384x640 1 person, 195.5ms
Speed: 3.7ms preprocess, 195.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  95%|█████████▍| 332/350 [02:20<00:05,  3.16it/s][A


0: 384x640 1 person, 187.5ms
Speed: 7.6ms preprocess, 187.5ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  95%|█████████▌| 333/350 [02:21<00:05,  3.18it/s][A


0: 384x640 1 person, 185.9ms
Speed: 3.7ms preprocess, 185.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  95%|█████████▌| 334/350 [02:21<00:04,  3.21it/s][A


0: 384x640 1 person, 186.7ms
Speed: 7.2ms preprocess, 186.7ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  96%|█████████▌| 335/350 [02:21<00:04,  3.22it/s][A


0: 384x640 1 person, 214.1ms
Speed: 3.8ms preprocess, 214.1ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  96%|█████████▌| 336/350 [02:22<00:04,  3.16it/s][A


0: 384x640 1 person, 198.2ms
Speed: 3.8ms preprocess, 198.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  96%|█████████▋| 337/350 [02:22<00:04,  3.18it/s][A


0: 384x640 1 person, 194.1ms
Speed: 4.3ms preprocess, 194.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  97%|█████████▋| 338/350 [02:22<00:03,  3.20it/s][A


0: 384x640 1 person, 205.1ms
Speed: 4.1ms preprocess, 205.1ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  97%|█████████▋| 339/350 [02:22<00:03,  3.11it/s][A


0: 384x640 1 person, 277.6ms
Speed: 4.8ms preprocess, 277.6ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  97%|█████████▋| 340/350 [02:23<00:03,  2.79it/s][A


0: 384x640 1 person, 283.2ms
Speed: 6.5ms preprocess, 283.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  97%|█████████▋| 341/350 [02:23<00:03,  2.59it/s][A


0: 384x640 1 person, 265.0ms
Speed: 6.2ms preprocess, 265.0ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  98%|█████████▊| 342/350 [02:24<00:03,  2.53it/s][A


0: 384x640 1 person, 269.0ms
Speed: 3.7ms preprocess, 269.0ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  98%|█████████▊| 343/350 [02:24<00:02,  2.44it/s][A


0: 384x640 1 person, 269.3ms
Speed: 5.6ms preprocess, 269.3ms inference, 4.7ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  98%|█████████▊| 344/350 [02:25<00:02,  2.40it/s][A


0: 384x640 1 person, 264.9ms
Speed: 3.9ms preprocess, 264.9ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  99%|█████████▊| 345/350 [02:25<00:02,  2.37it/s][A


0: 384x640 1 person, 296.2ms
Speed: 7.6ms preprocess, 296.2ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  99%|█████████▉| 346/350 [02:26<00:01,  2.32it/s][A


0: 384x640 1 person, 279.1ms
Speed: 6.4ms preprocess, 279.1ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  99%|█████████▉| 347/350 [02:26<00:01,  2.32it/s][A


0: 384x640 1 person, 294.8ms
Speed: 6.7ms preprocess, 294.8ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video:  99%|█████████▉| 348/350 [02:26<00:00,  2.28it/s][A


0: 384x640 1 person, 236.9ms
Speed: 9.3ms preprocess, 236.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video: 100%|█████████▉| 349/350 [02:27<00:00,  2.42it/s][A


0: 384x640 1 person, 177.5ms
Speed: 3.8ms preprocess, 177.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)



Processing video: 100%|██████████| 350/350 [02:27<00:00,  2.37it/s]


**YOLOV8, DEEPSORT, SEMANTIC SEGMENTATION IMPLEMENTATION**

In [16]:
import cv2
import numpy as np
from tqdm import tqdm
import torch
import torchvision.transforms as transforms
from torchvision.models.segmentation import deeplabv3_resnet101
from torchvision.models.segmentation.deeplabv3 import DeepLabV3_ResNet101_Weights

# Initialize video capture
input_video_path = 'input.mp4'
output_video_path = 'output_yolo_deepsort_semantic.mp4'
cap = cv2.VideoCapture(input_video_path)

# Get video properties
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# Initialize VideoWriter
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

# Initialize the progress bar
pbar = tqdm(total=total_frames, desc='Processing video')

# Load DeepLab model
segmentation_model = deeplabv3_resnet101(weights=DeepLabV3_ResNet101_Weights.DEFAULT)
segmentation_model.eval()

# Define transformations for DeepLab
transform_dl = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((height, width)),
    transforms.ToTensor(),
])

track_id_to_class_id = {}

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Semantic segmentation to get masks
    input_tensor = transform_dl(frame).unsqueeze(0)
    with torch.no_grad():
        output = segmentation_model(input_tensor)['out'][0]
    masks = torch.argmax(output, dim=0).cpu().numpy()

    # YOLO to detect objects
    results = model(frame)

    # Extract the detection results
    detections = results[0].boxes.data.cpu().numpy()  # assuming results is a list with one element containing the detections

    bbox_xywh = []
    confidences = []
    class_ids = []

    for detection in detections:
        x1, y1, x2, y2, conf, cls = detection[:6]
        x_center = (x1 + x2) / 2
        y_center = (y1 + y2) / 2
        width = x2 - x1
        height = y2 - y1
        bbox_xywh.append([x_center, y_center, width, height])
        confidences.append(conf)
        class_ids.append(int(cls))

    # Convert to numpy arrays
    bbox_xywh = np.array(bbox_xywh)
    confidences = np.array(confidences)
    class_ids = np.array(class_ids)

    # Update the DeepSORT tracker
    outputs = deep_sort.update(bbox_xywh, confidences, frame)

    # Draw the tracking results on the frame
    for output in outputs:
        x1, y1, x2, y2, track_id = output[:5]

        # Map the track ID to the class ID
        if track_id not in track_id_to_class_id:
            track_id_to_class_id[track_id] = class_ids[0]  # assign the first class ID for this example

        class_id = track_id_to_class_id[track_id]
        class_name = class_names[class_id]

        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(frame, f"ID: {track_id} {class_name}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

    # Semantic segmentation masks to the frame
    for i in range(1, masks.max() + 1):  # Start from 1 to skip background
        mask = (masks == i).astype(np.uint8) * 255
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(frame, contours, -1, (0, 0, 255), 2)  # Draw contours of each mask

    # Write Output video
    out.write(frame)

    # Progress bar
    pbar.update(1)

cap.release()
out.release()
cv2.destroyAllWindows()
pbar.close()

Processing video:   0%|          | 0/78 [01:18<?, ?it/s]



0: 384x640 2 persons, 203.2ms
Speed: 7.6ms preprocess, 203.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)


Processing video:   1%|▏         | 1/78 [00:17<22:14, 17.33s/it]


0: 384x640 4 persons, 157.0ms
Speed: 3.6ms preprocess, 157.0ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:   3%|▎         | 2/78 [00:33<20:44, 16.37s/it]


0: 384x640 3 persons, 239.4ms
Speed: 3.6ms preprocess, 239.4ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)


Processing video:   4%|▍         | 3/78 [00:49<20:31, 16.42s/it]


0: 384x640 4 persons, 171.8ms
Speed: 4.4ms preprocess, 171.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:   5%|▌         | 4/78 [01:05<20:13, 16.39s/it]


0: 384x640 5 persons, 189.5ms
Speed: 3.8ms preprocess, 189.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)


Processing video:   6%|▋         | 5/78 [01:22<19:56, 16.39s/it]


0: 384x640 5 persons, 169.6ms
Speed: 4.8ms preprocess, 169.6ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)


Processing video:   8%|▊         | 6/78 [01:38<19:30, 16.26s/it]


0: 384x640 5 persons, 156.4ms
Speed: 4.2ms preprocess, 156.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:   9%|▉         | 7/78 [01:54<19:13, 16.25s/it]


0: 384x640 3 persons, 249.7ms
Speed: 4.6ms preprocess, 249.7ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  10%|█         | 8/78 [02:11<19:17, 16.53s/it]


0: 384x640 3 persons, 1 skateboard, 191.9ms
Speed: 3.9ms preprocess, 191.9ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  12%|█▏        | 9/78 [02:28<18:58, 16.50s/it]


0: 384x640 3 persons, 153.9ms
Speed: 4.4ms preprocess, 153.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  13%|█▎        | 10/78 [02:43<18:22, 16.22s/it]


0: 384x640 4 persons, 161.3ms
Speed: 6.8ms preprocess, 161.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  14%|█▍        | 11/78 [02:59<18:00, 16.13s/it]


0: 384x640 2 persons, 157.6ms
Speed: 3.7ms preprocess, 157.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  15%|█▌        | 12/78 [03:15<17:36, 16.01s/it]


0: 384x640 2 persons, 1 bird, 194.7ms
Speed: 7.3ms preprocess, 194.7ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  17%|█▋        | 13/78 [03:32<17:46, 16.41s/it]


0: 384x640 3 persons, 1 bird, 172.8ms
Speed: 3.7ms preprocess, 172.8ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  18%|█▊        | 14/78 [03:48<17:22, 16.29s/it]


0: 384x640 3 persons, 1 dog, 173.6ms
Speed: 6.8ms preprocess, 173.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  19%|█▉        | 15/78 [04:04<17:05, 16.28s/it]


0: 384x640 3 persons, 159.6ms
Speed: 4.4ms preprocess, 159.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  21%|██        | 16/78 [04:20<16:40, 16.14s/it]


0: 384x640 3 persons, 267.5ms
Speed: 4.0ms preprocess, 267.5ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  22%|██▏       | 17/78 [04:37<16:32, 16.27s/it]


0: 384x640 3 persons, 1 skateboard, 154.4ms
Speed: 3.7ms preprocess, 154.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  23%|██▎       | 18/78 [04:53<16:15, 16.26s/it]


0: 384x640 3 persons, 1 skateboard, 157.8ms
Speed: 4.5ms preprocess, 157.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  24%|██▍       | 19/78 [05:09<15:55, 16.19s/it]


0: 384x640 3 persons, 1 skateboard, 168.9ms
Speed: 3.7ms preprocess, 168.9ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  26%|██▌       | 20/78 [05:25<15:29, 16.02s/it]


0: 384x640 3 persons, 207.4ms
Speed: 3.8ms preprocess, 207.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  27%|██▋       | 21/78 [05:40<15:08, 15.93s/it]


0: 384x640 3 persons, 243.2ms
Speed: 3.8ms preprocess, 243.2ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  28%|██▊       | 22/78 [05:56<14:49, 15.89s/it]


0: 384x640 4 persons, 1 skateboard, 157.5ms
Speed: 6.0ms preprocess, 157.5ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  29%|██▉       | 23/78 [06:13<14:49, 16.17s/it]


0: 384x640 3 persons, 1 backpack, 152.4ms
Speed: 4.9ms preprocess, 152.4ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  31%|███       | 24/78 [06:29<14:26, 16.04s/it]


0: 384x640 2 persons, 191.5ms
Speed: 3.9ms preprocess, 191.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  32%|███▏      | 25/78 [06:44<14:04, 15.94s/it]


0: 384x640 3 persons, 164.2ms
Speed: 4.0ms preprocess, 164.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  33%|███▎      | 26/78 [07:00<13:42, 15.82s/it]


0: 384x640 3 persons, 240.3ms
Speed: 3.7ms preprocess, 240.3ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  35%|███▍      | 27/78 [07:16<13:35, 15.99s/it]


0: 384x640 4 persons, 163.3ms
Speed: 3.9ms preprocess, 163.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  36%|███▌      | 28/78 [07:33<13:31, 16.23s/it]


0: 384x640 2 persons, 186.5ms
Speed: 7.3ms preprocess, 186.5ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  37%|███▋      | 29/78 [07:49<13:08, 16.09s/it]


0: 384x640 2 persons, 171.0ms
Speed: 4.7ms preprocess, 171.0ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  38%|███▊      | 30/78 [08:05<12:46, 15.96s/it]


0: 384x640 3 persons, 155.0ms
Speed: 4.8ms preprocess, 155.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  40%|███▉      | 31/78 [08:21<12:29, 15.95s/it]


0: 384x640 3 persons, 227.6ms
Speed: 3.6ms preprocess, 227.6ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  41%|████      | 32/78 [08:36<12:11, 15.91s/it]


0: 384x640 3 persons, 1 suitcase, 186.4ms
Speed: 4.3ms preprocess, 186.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  42%|████▏     | 33/78 [08:53<12:06, 16.15s/it]


0: 384x640 3 persons, 161.7ms
Speed: 4.8ms preprocess, 161.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  44%|████▎     | 34/78 [09:09<11:45, 16.03s/it]


0: 384x640 3 persons, 1 skateboard, 159.4ms
Speed: 3.9ms preprocess, 159.4ms inference, 2.1ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  45%|████▍     | 35/78 [09:25<11:29, 16.03s/it]


0: 384x640 3 persons, 166.5ms
Speed: 3.9ms preprocess, 166.5ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  46%|████▌     | 36/78 [09:41<11:12, 16.01s/it]


0: 384x640 3 persons, 334.4ms
Speed: 4.7ms preprocess, 334.4ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  47%|████▋     | 37/78 [09:58<11:08, 16.30s/it]


0: 384x640 3 persons, 170.5ms
Speed: 4.9ms preprocess, 170.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  49%|████▊     | 38/78 [10:14<10:54, 16.37s/it]


0: 384x640 3 persons, 1 skateboard, 151.0ms
Speed: 4.1ms preprocess, 151.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  50%|█████     | 39/78 [10:30<10:31, 16.20s/it]


0: 384x640 3 persons, 154.2ms
Speed: 5.2ms preprocess, 154.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  51%|█████▏    | 40/78 [10:46<10:09, 16.04s/it]


0: 384x640 3 persons, 189.9ms
Speed: 3.7ms preprocess, 189.9ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  53%|█████▎    | 41/78 [11:02<09:52, 16.02s/it]


0: 384x640 3 persons, 2 skateboards, 253.6ms
Speed: 3.6ms preprocess, 253.6ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  54%|█████▍    | 42/78 [11:18<09:44, 16.23s/it]


0: 384x640 3 persons, 1 skateboard, 168.8ms
Speed: 6.4ms preprocess, 168.8ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  55%|█████▌    | 43/78 [11:35<09:26, 16.20s/it]


0: 384x640 3 persons, 153.3ms
Speed: 5.6ms preprocess, 153.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  56%|█████▋    | 44/78 [11:50<09:07, 16.10s/it]


0: 384x640 3 persons, 1 suitcase, 192.0ms
Speed: 3.7ms preprocess, 192.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  58%|█████▊    | 45/78 [12:06<08:48, 16.00s/it]


0: 384x640 3 persons, 151.2ms
Speed: 5.9ms preprocess, 151.2ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  59%|█████▉    | 46/78 [12:22<08:27, 15.87s/it]


0: 384x640 3 persons, 1 skateboard, 223.3ms
Speed: 6.0ms preprocess, 223.3ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  60%|██████    | 47/78 [12:38<08:18, 16.07s/it]


0: 384x640 4 persons, 157.1ms
Speed: 4.4ms preprocess, 157.1ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  62%|██████▏   | 48/78 [12:54<08:01, 16.04s/it]


0: 384x640 4 persons, 1 skateboard, 200.3ms
Speed: 3.8ms preprocess, 200.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  63%|██████▎   | 49/78 [13:10<07:46, 16.09s/it]


0: 384x640 3 persons, 159.6ms
Speed: 5.4ms preprocess, 159.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  64%|██████▍   | 50/78 [13:26<07:26, 15.96s/it]


0: 384x640 3 persons, 172.3ms
Speed: 3.6ms preprocess, 172.3ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  65%|██████▌   | 51/78 [13:42<07:09, 15.92s/it]


0: 384x640 3 persons, 1 skateboard, 272.9ms
Speed: 5.2ms preprocess, 272.9ms inference, 2.2ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  67%|██████▋   | 52/78 [13:59<07:03, 16.28s/it]


0: 384x640 3 persons, 215.3ms
Speed: 5.5ms preprocess, 215.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  68%|██████▊   | 53/78 [14:16<06:48, 16.33s/it]


0: 384x640 3 persons, 150.7ms
Speed: 4.5ms preprocess, 150.7ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  69%|██████▉   | 54/78 [14:31<06:27, 16.16s/it]


0: 384x640 3 persons, 151.2ms
Speed: 6.3ms preprocess, 151.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  71%|███████   | 55/78 [14:47<06:09, 16.05s/it]


0: 384x640 3 persons, 152.0ms
Speed: 4.5ms preprocess, 152.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  72%|███████▏  | 56/78 [15:03<05:50, 15.92s/it]


0: 384x640 4 persons, 330.0ms
Speed: 3.7ms preprocess, 330.0ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  73%|███████▎  | 57/78 [15:20<05:43, 16.35s/it]


0: 384x640 3 persons, 162.0ms
Speed: 5.7ms preprocess, 162.0ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  74%|███████▍  | 58/78 [15:36<05:24, 16.24s/it]


0: 384x640 2 persons, 161.7ms
Speed: 3.6ms preprocess, 161.7ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  76%|███████▌  | 59/78 [15:52<05:07, 16.20s/it]


0: 384x640 2 persons, 182.5ms
Speed: 4.6ms preprocess, 182.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  77%|███████▋  | 60/78 [16:08<04:49, 16.06s/it]


0: 384x640 2 persons, 213.4ms
Speed: 6.5ms preprocess, 213.4ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  78%|███████▊  | 61/78 [16:24<04:32, 16.03s/it]


0: 384x640 4 persons, 237.7ms
Speed: 10.4ms preprocess, 237.7ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  79%|███████▉  | 62/78 [16:41<04:19, 16.25s/it]


0: 384x640 5 persons, 166.3ms
Speed: 4.9ms preprocess, 166.3ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  81%|████████  | 63/78 [16:57<04:03, 16.20s/it]


0: 384x640 4 persons, 146.6ms
Speed: 4.7ms preprocess, 146.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  82%|████████▏ | 64/78 [17:13<03:45, 16.09s/it]


0: 384x640 4 persons, 221.9ms
Speed: 5.9ms preprocess, 221.9ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  83%|████████▎ | 65/78 [17:29<03:29, 16.13s/it]


0: 384x640 5 persons, 234.8ms
Speed: 3.9ms preprocess, 234.8ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  85%|████████▍ | 66/78 [17:46<03:17, 16.45s/it]


0: 384x640 3 persons, 155.4ms
Speed: 6.2ms preprocess, 155.4ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  86%|████████▌ | 67/78 [18:03<03:03, 16.67s/it]


0: 384x640 4 persons, 1 skateboard, 171.2ms
Speed: 4.5ms preprocess, 171.2ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  87%|████████▋ | 68/78 [18:19<02:45, 16.55s/it]


0: 384x640 2 persons, 1 skateboard, 188.1ms
Speed: 3.8ms preprocess, 188.1ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  88%|████████▊ | 69/78 [18:35<02:27, 16.35s/it]


0: 384x640 2 persons, 153.2ms
Speed: 4.8ms preprocess, 153.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  90%|████████▉ | 70/78 [18:51<02:09, 16.13s/it]


0: 384x640 2 persons, 250.8ms
Speed: 4.2ms preprocess, 250.8ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  91%|█████████ | 71/78 [19:07<01:53, 16.24s/it]


0: 384x640 2 persons, 156.4ms
Speed: 4.3ms preprocess, 156.4ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  92%|█████████▏| 72/78 [19:24<01:37, 16.31s/it]


0: 384x640 2 persons, 1 skateboard, 195.2ms
Speed: 3.7ms preprocess, 195.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  94%|█████████▎| 73/78 [19:40<01:21, 16.22s/it]


0: 384x640 2 persons, 160.2ms
Speed: 6.6ms preprocess, 160.2ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  95%|█████████▍| 74/78 [19:56<01:04, 16.05s/it]


0: 384x640 2 persons, 1 skateboard, 151.6ms
Speed: 5.6ms preprocess, 151.6ms inference, 1.5ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  96%|█████████▌| 75/78 [20:12<00:48, 16.04s/it]


0: 384x640 2 persons, 1 skateboard, 220.6ms
Speed: 3.9ms preprocess, 220.6ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  97%|█████████▋| 76/78 [20:28<00:32, 16.12s/it]


0: 384x640 2 persons, 191.2ms
Speed: 3.9ms preprocess, 191.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)


Processing video:  99%|█████████▊| 77/78 [20:44<00:16, 16.16s/it]


0: 384x640 2 persons, 1 skateboard, 179.6ms
Speed: 7.5ms preprocess, 179.6ms inference, 2.4ms postprocess per image at shape (1, 3, 384, 640)


Processing video: 100%|██████████| 78/78 [21:00<00:00, 16.16s/it]
