In [1]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd '/content/drive/MyDrive/yolov8-tensorrt'

/content/drive/MyDrive/yolov8-tensorrt


In [3]:
!pip install -r requirements.txt

Collecting tensorrt (from -r requirements.txt (line 1))
  Downloading tensorrt-10.7.0.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt_lean (from -r requirements.txt (line 2))
  Downloading tensorrt_lean-10.7.0.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorrt_dispatch (from -r requirements.txt (line 3))
  Downloading tensorrt_dispatch-10.7.0.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting onnx (from -r requirements.txt (line 4))
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxsim (from -r requirements.txt (line 5))
  Downloading onnxsim-0.4.36-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Collecting onnxruntime-gpu (from -r requirements.txt (line 6))
  Downloading onnxruntime_gpu-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting tensor

In [10]:
# TensorRT FP16
from ultralytics import YOLO
model = YOLO('yolo_11l_469.pt')
model.export(format="engine", half=True, device=0)
# !yolo export model=yolo_11l_469.pt format=engine half=True device=0 workspace=12


Ultralytics 8.3.58 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla T4, 15102MiB)
YOLO11l summary (fused): 464 layers, 25,287,022 parameters, 0 gradients, 86.6 GFLOPs

[34m[1mPyTorch:[0m starting from 'yolo_11l_469.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 14, 8400) (48.8 MB)

[34m[1mONNX:[0m starting export with onnx 1.17.0 opset 19...
[34m[1mONNX:[0m slimming with onnxslim 0.1.46...
[34m[1mONNX:[0m export success ✅ 5.1s, saved as 'yolo_11l_469.onnx' (96.8 MB)

[34m[1mTensorRT:[0m starting export with TensorRT 10.7.0...
[01/07/2025-02:24:12] [TRT] [I] [MemUsageChange] Init CUDA: CPU -2, GPU +0, now: CPU 1299, GPU 1123 (MiB)
[01/07/2025-02:24:14] [TRT] [I] [MemUsageChange] Init builder kernel library: CPU +955, GPU +194, now: CPU 2178, GPU 1317 (MiB)
[01/07/2025-02:24:14] [TRT] [I] ----------------------------------------------------------------
[01/07/2025-02:24:14] [TRT] [I] Input filename:   yolo_11l_469.onnx
[01/07/2025-02:24:14] [TRT] [I] ON

In [18]:
import cv2
import torch
import time
from utils import VisTrack
from PIL import ImageFont

from ultralytics import YOLO

class InferenceBaseline:
    def __init__(self,model_path):
        self.detection_model=None
        self.names={
            "0": "pedestrian",
            "1": "people",
            "2": "bicycle",
            "3": "car",
            "4": "van",
            "5": "truck",
            "6": "tricycle",
            "7": "awning-tricycle",
            "8": "bus",
            "9": "motor"
        }
        # utility function for drawing
        self.vis_track = VisTrack()

        # initialize the model
        self.detection_model=YOLO(model_path)
        # fuse pytorch model for faster inference
        if model_path[-2:] == "pt":
            self.detection_model.fuse()  # Fuse Conv2d + BatchNorm2d layers


    def inference_image(self,img_path):
        # read image
        image=cv2.imread(img_path)
        with torch.no_grad():  # Disable gradient calculation
            results = self.detection_model.predict(image,conf=0.35,device=0)

        results=results[0]  # results = list with 1 element
        bboxes=results.boxes.xyxy.cpu().numpy()
        ids=results.boxes.cls.cpu().numpy().astype(int)
        scores=results.boxes.conf.cpu().numpy()

        image=self.vis_track.draw_bounding_boxes(image,bboxes,ids,self.names,scores)
        cv2.imwrite(img_path[:-4]+"_yolo.png",image)


    def inference_video(self, video_path):

        cap = cv2.VideoCapture(video_path)
        assert cap.isOpened(), "Error reading video file"

        frame_count = 0
        width, height, fps = int(cap.get(3)), int(cap.get(4)), int(cap.get(5))
        num_frames = int(cap.get(7))
        print(f"Processing {num_frames} frames | Resolution: {width}x{height}")
        out = cv2.VideoWriter(video_path[:-4] + "_processed.mp4", cv2.VideoWriter_fourcc(*"mp4v"), fps//2, (width, height))

        start=time.time()

        while True:
            start_time = time.time()
            success, frame = cap.read()
            frame_count+=1

            # Skip the frame if it fails to read
            if not success:
                print(f"Warning: Skipping frame {frame_count + 1}")
                frame_count += 1
                continue

            frame_count += 1
            with torch.no_grad():  # Disable gradient calculation
              results = self.detection_model.predict(frame,conf=0.35,device=0,verbose=False)
            results=results[0] # results = list with 1 element

            boxes=results.boxes.xyxy.cpu().numpy()
            ids=results.boxes.cls.cpu().numpy()
            ids=ids.astype(int) # for suitability to VisTrack

            scores=results.boxes.conf.cpu().numpy()

            # Draw
            frame_processed = self.vis_track.draw_bounding_boxes(frame, boxes, ids, self.names, scores)

            # draw fps
            end_time=time.time()
            fps=1/(end_time-start_time)
            cv2.putText(frame_processed, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)
            # write
            out.write(frame_processed)

            # Break when all frames are processed
            if frame_count >= num_frames:
                break
        print(f"Process finished in {(time.time()-start)/60:.2f} minutes")
        out.release()
        cap.release()


    def inference_webcam(self, idx):
        if type(idx) is not int: # video input
            raise ValueError("Camera index must be an integer")
        cap = cv2.VideoCapture(idx)

        while True:
            start_time = time.time()
            success, frame = cap.read()
            assert success, "Fail to read frame"

            with torch.no_grad():  # Disable gradient calculation
              results = self.detection_model.predict(frame,conf=0.35,device=0)
            results=results[0] # results = list with 1 element

            boxes=results.boxes.xyxy.cpu().numpy()
            ids=results.boxes.cls.cpu().numpy()
            ids=ids.astype(int) # for suitability to VisTrack

            scores=results.boxes.conf.cpu().numpy()

            # Draw and write frames
            frame_processed = self.vis_track.draw_bounding_boxes(frame, boxes, ids, self.names, scores)

            # draw fps
            end_time=time.time()
            fps=1/(end_time-start_time)
            cv2.putText(frame_processed, f"{fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2, cv2.LINE_AA)

            # display
            cv2.imshow("Webcam Feed", frame_processed)

            # Check for ESC key press
            if cv2.waitKey(1) & 0xFF == 27:  # 27 is the ASCII code for ESC
                print("ESC pressed, exiting...")
                break


        cap.release()
        cv2.destroyAllWindows()


In [19]:
inference=InferenceBaseline("yolo_11l_469.engine")
inference.inference_video(video_path="raw.mp4")

Processing 184 frames | Resolution: 1904x1070
Loading yolo_11l_469.engine for TensorRT inference...
Process finished in 0.13 minutes


In [20]:
inference=InferenceBaseline("yolo_11l_469.pt")
inference.inference_video(video_path="raw.mp4")

YOLO11l summary (fused): 464 layers, 25,287,022 parameters, 0 gradients, 86.6 GFLOPs
Processing 184 frames | Resolution: 1904x1070
Process finished in 0.18 minutes
