# Method 1: CV2 background removal

In [None]:
import cv2
import numpy as np
from google.colab import files
from IPython.display import FileLink

input_video = 'cars_road.mp4'
if not os.path.exists(input_video):
    print("file not exist")

cap = cv2.VideoCapture(input_video)

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
output_filename = 'output_1.mp4'


fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))


fgbg = cv2.createBackgroundSubtractorMOG2(history=500, varThreshold=16, detectShadows=True)

while True:
    ret, frame = cap.read()
    if not ret:
        break

    fgmask = fgbg.apply(frame)

    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
    fgmask = cv2.morphologyEx(fgmask, cv2.MORPH_OPEN, kernel)
    fgmask = cv2.dilate(fgmask, kernel, iterations=2)

    contours, _ = cv2.findContours(fgmask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    for cnt in contours:
        if cv2.contourArea(cnt) > 500:
            x, y, w, h = cv2.boundingRect(cnt)
            cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

    out.write(frame)

cap.release()
out.release()

display(FileLink(output_filename))

: 

# Method 2: Yolo V8


In [None]:
!pip install ultralytics
!pip install opencv-python-headless

In [1]:
import cv2
import torch
import numpy as np
from ultralytics import YOLO
from google.colab import files
from IPython.display import FileLink

model = YOLO('yolov8n.pt')

vehicle_classes = ['car', 'bus', 'truck']

input_video = 'cars_road.mp4'
if not os.path.exists(input_video):
    print("file not exist")
    uploaded = files.upload()

cap = cv2.VideoCapture(input_video)

if not cap.isOpened():
    print("Error: Could not open video.")
    exit()

frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)
output_filename = 'output_2.mp4'

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_filename, fourcc, fps, (frame_width, frame_height))

frame_count = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break

    results = model(frame)

    for r in results:
        boxes = r.boxes
        for box in boxes:
            class_id = int(box.cls[0])
            confidence = float(box.conf[0])
            if confidence < 0.3:
                continue

            label = model.names[class_id]
            # Draw bounding box for each vhicle
            if label in vehicle_classes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                text = f"{label} {confidence:.2f}"
                cv2.putText(frame, text, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    out.write(frame)
    frame_count += 1
    if frame_count % 50 == 0:
        print(f"Processed {frame_count} frames...")

cap.release()
out.release()

display(FileLink(output_filename))

Processing video. Please wait...

0: 384x640 2 persons, 3 cars, 151.1ms
Speed: 15.3ms preprocess, 151.1ms inference, 382.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 3 cars, 7.0ms
Speed: 2.4ms preprocess, 7.0ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 3 cars, 9.2ms
Speed: 3.3ms preprocess, 9.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 3 cars, 8.9ms
Speed: 3.1ms preprocess, 8.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 3 cars, 6.9ms
Speed: 2.7ms preprocess, 6.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 5 cars, 7.2ms
Speed: 3.0ms preprocess, 7.2ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 persons, 6 cars, 8.7ms
Speed: 2.3ms preprocess, 8.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 4 cars, 10.0ms
Speed: 2.2ms pr

# Method 3: Semantic segmentation with Segformer

In [2]:
!pip install transformers==4.30.0
!pip install opencv-python Pillow

Collecting transformers==4.30.0
  Downloading transformers-4.30.0-py3-none-any.whl.metadata (113 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30.0)
  Downloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.30.0-py3-none-any.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m96.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall

In [1]:
import cv2
import numpy as np
import torch
from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation
from PIL import Image
import os
from google.colab import files
input_video = 'cars_road.mp4'

if not os.path.exists(input_video):
    print("file not exist")
    uploaded = files.upload()

model_name = "nvidia/segformer-b0-finetuned-cityscapes-1024-1024"

# Load feature extractor and model from Hugging Face
print("Loading SegFormer (Cityscapes) model...")
feature_extractor = SegformerFeatureExtractor.from_pretrained(model_name)
model = SegformerForSemanticSegmentation.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()


# standard Cityscapes color palette (19 classes)
cityscapes_palette = np.array([
    [128,  64, 128],   # road
    [244,  35, 232],   # sidewalk
    [ 70,  70,  70],   # building
    [102, 102, 156],   # wall
    [190, 153, 153],   # fence
    [153, 153, 153],   # pole
    [250, 170,  30],   # traffic light
    [220, 220,   0],   # traffic sign
    [107, 142,  35],   # vegetation
    [152, 251, 152],   # terrain
    [ 70, 130, 180],   # sky
    [220,  20,  60],   # person
    [255,   0,   0],   # rider
    [  0,   0, 142],   # car
    [  0,   0,  70],   # truck
    [  0,  60, 100],   # bus
    [  0,  80, 100],   # train
    [  0,   0, 230],   # motorcycle
    [119,  11,  32]    # bicycle
], dtype=np.uint8)

def decode_segmap(segmentation, palette):
    h, w = segmentation.shape
    color_image = np.zeros((h, w, 3), dtype=np.uint8)
    for label in range(len(palette)):
        color_image[segmentation == label] = palette[label]
    return color_image

cap = cv2.VideoCapture(input_video)
if not cap.isOpened():
    print("Error opening the video file!")
    exit()

width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps    = cap.get(cv2.CAP_PROP_FPS)
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter('output_3.mp4', fourcc, fps, (width, height))

frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
frame_idx = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break

    rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    pil_img = Image.fromarray(rgb_frame)

    inputs = feature_extractor(images=pil_img, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    seg_map = outputs.logits.argmax(dim=1)[0].cpu().numpy()
    seg_map_resized = cv2.resize(seg_map.astype(np.uint8), (width, height), interpolation=cv2.INTER_NEAREST)
    seg_color = decode_segmap(seg_map_resized, cityscapes_palette)

    overlay = cv2.addWeighted(frame, 0.5, cv2.cvtColor(seg_color, cv2.COLOR_RGB2BGR), 0.5, 0)

    out.write(overlay)

    frame_idx += 1
    if frame_idx % 10 == 0:
        print(f"Processed {frame_idx}/{frame_count} frames")

cap.release()
out.release()
files.download('output_3.mp4')


Loading SegFormer (Cityscapes) model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return torch.load(checkpoint_file, map_location="cpu")


Model loaded.
Processing video... Total frames: 411
Processed 10/411 frames
Processed 20/411 frames
Processed 30/411 frames
Processed 40/411 frames
Processed 50/411 frames
Processed 60/411 frames
Processed 70/411 frames
Processed 80/411 frames
Processed 90/411 frames
Processed 100/411 frames
Processed 110/411 frames
Processed 120/411 frames
Processed 130/411 frames
Processed 140/411 frames
Processed 150/411 frames
Processed 160/411 frames
Processed 170/411 frames
Processed 180/411 frames
Processed 190/411 frames
Processed 200/411 frames
Processed 210/411 frames
Processed 220/411 frames
Processed 230/411 frames
Processed 240/411 frames
Processed 250/411 frames
Processed 260/411 frames
Processed 270/411 frames
Processed 280/411 frames
Processed 290/411 frames
Processed 300/411 frames
Processed 310/411 frames
Processed 320/411 frames
Processed 330/411 frames
Processed 340/411 frames
Processed 350/411 frames
Processed 360/411 frames
Processed 370/411 frames
Processed 380/411 frames
Process

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>