Define useful boilerplate functions

In [None]:
import torch, torchvision
# Report the installed PyTorch version and whether a CUDA device is usable.
print(torch.__version__, torch.cuda.is_available())

# Turn gradient tracking off globally for every cell that follows.
# The trailing ';' keeps the call's return value out of the cell output.
torch.set_grad_enabled(False);

2.6.0+cu124 False


In [None]:
import torchvision.transforms as T

# Preprocessing pipeline applied to each PIL image before inference:
# resize, convert to a tensor, then per-channel mean/std normalization
# (the standard PyTorch image normalization constants).
transform = T.Compose([
    T.Resize(800),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# for output bounding box post-processing
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner form.

    Args:
        x: tensor whose last dimension has size 4, holding
            ``(x_c, y_c, w, h)``. Any number of leading dimensions is
            accepted (a single box, an ``(N, 4)`` set, a batched set, ...).

    Returns:
        Tensor of the same shape as ``x`` holding
        ``(x_min, y_min, x_max, y_max)`` in the same units as the input.
    """
    # Work along the last dimension so the conversion does not depend on how
    # many leading (batch) dimensions the caller passes; for (N, 4) input this
    # is the same as operating on dim 1.
    x_c, y_c, w, h = x.unbind(-1)
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=-1)

def rescale_bboxes(out_bbox, size):
    """Convert center-format boxes to corner format and scale them by an image size.

    Args:
        out_bbox: tensor of ``(x_c, y_c, w, h)`` boxes, as accepted by
            ``box_cxcywh_to_xyxy``.
        size: ``(width, height)`` pair used as the scale factors
            (e.g. ``PIL.Image.size``).

    Returns:
        Tensor of ``(x_min, y_min, x_max, y_max)`` boxes with x coordinates
        multiplied by ``width`` and y coordinates multiplied by ``height``.
    """
    img_w, img_h = size
    b = box_cxcywh_to_xyxy(out_bbox)
    # Build the scale factors on the same device as the boxes; a plain
    # torch.tensor(...) lives on the CPU and the multiply would raise a
    # device-mismatch error for boxes that are on a GPU.
    scale = torch.tensor([img_w, img_h, img_w, img_h],
                         dtype=torch.float32, device=b.device)
    return b * scale

In [None]:
def filter_bboxes_from_outputs(outputs,
                               threshold=0.7,
                               size=None):
  """Keep the confident predictions of the first batch element.

  Args:
    outputs: mapping with ``'pred_logits'`` and ``'pred_boxes'`` tensors.
      Only batch index 0 is read.
    threshold: a prediction is kept when its highest class probability is
      strictly greater than this value.
    size: ``(width, height)`` the boxes are scaled to. When omitted, the
      function falls back to ``im.size`` of the notebook-level ``im`` so
      existing calls keep working; pass it explicitly to avoid depending on
      that hidden global.

  Returns:
    ``(probas_to_keep, bboxes_scaled)``: the class probabilities of the kept
    predictions and their boxes as returned by ``rescale_bboxes``.
  """
  if size is None:
    # Backward-compatible fallback to the notebook global used originally.
    size = im.size

  # Softmax over the class dimension, then drop the last class column
  # (presumably the "no object" class -- confirm against the model).
  probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
  keep = probas.max(-1).values > threshold

  probas_to_keep = probas[keep]

  # Scale the kept boxes by the target image size.
  bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], size)

  return probas_to_keep, bboxes_scaled

In [None]:
# COCO classes, indexed by class id (91 slots).
# Only the two categories kept by this notebook carry a real name;
# every other id is a 'N/A' placeholder.
CLASSES = ['N/A'] * 91
CLASSES[1] = 'person'
CLASSES[72] = 'tv'

# RGB triples (components in [0, 1]) cycled through when drawing boxes
COLORS = [
    [0.000, 0.447, 0.741],
    [0.850, 0.325, 0.098],
    [0.929, 0.694, 0.125],
    [0.494, 0.184, 0.556],
    [0.466, 0.674, 0.188],
    [0.301, 0.745, 0.933],
]

In [None]:
import matplotlib.pyplot as plt

def plot_results(pil_img, prob=None, boxes=None):
    """Show ``pil_img`` and, when both are given, overlay the detections.

    Args:
        pil_img: image passed to ``plt.imshow``.
        prob: iterable of per-detection class score rows; each row's argmax
            selects the label looked up in ``CLASSES``.
        boxes: object whose ``.tolist()`` yields ``(xmin, ymin, xmax, ymax)``
            rows, one per detection.

    Nothing is overlaid unless both ``prob`` and ``boxes`` are provided.
    """
    plt.figure(figsize=(16, 10))
    plt.imshow(pil_img)
    axes = plt.gca()

    have_detections = prob is not None and boxes is not None
    if have_detections:
        # Repeat the palette so zip() does not run out of colors early.
        palette = COLORS * 100
        for scores, corners, edge_color in zip(prob, boxes.tolist(), palette):
            left, top, right, bottom = corners
            outline = plt.Rectangle((left, top), right - left, bottom - top,
                                    fill=False, color=edge_color, linewidth=3)
            axes.add_patch(outline)

            best = scores.argmax()
            caption = f'{CLASSES[best]}: {scores[best]:0.2f}'
            axes.text(left, top, caption, fontsize=15,
                      bbox=dict(facecolor='yellow', alpha=0.5))

    plt.axis('off')
    plt.show()

Load the pretrained DETR model and run the video demo

In [None]:
# Fetch the pretrained 'detr_resnet50' entry point from the
# facebookresearch/detr hub repository and switch it to evaluation mode
# (Module.eval() returns the module itself, so the call can be chained).
model = torch.hub.load(
    'facebookresearch/detr',
    'detr_resnet50',
    pretrained=True,
).eval()

Downloading: "https://github.com/facebookresearch/detr/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 56.0MB/s]
Downloading: "https://dl.fbaipublicfiles.com/detr/detr-r50-e632da11.pth" to /root/.cache/torch/hub/checkpoints/detr-r50-e632da11.pth
100%|██████████| 159M/159M [00:01<00:00, 91.6MB/s]


In [None]:
# Mount Google Drive at /content/drive; the input video is read from, and the
# output video written to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import cv2
import glob
from PIL import Image, ImageDraw
from tqdm import tqdm

In [None]:
import os
import cv2
import torch
from PIL import Image, ImageDraw
from tqdm import tqdm
import matplotlib.pyplot as plt
from IPython.display import HTML
from base64 import b64encode
import time

# Run identifier (second resolution) embedded in the output file name.
timestamp = time.strftime("%Y%m%d-%H%M%S")

#  Source video and destination of the annotated result
input_video_path = "/content/drive/MyDrive/peddet_vid_1.mp4"  # <-- change to your video path
output_video_path = f"/content/drive/MyDrive/output_detr_{timestamp}.mp4"

#  Scratch directories: sampled frames and their annotated copies
frames_folder = "/content/frames_0.5fps"
annotated_folder = "/content/frames_annotated"

# exist_ok=True makes this cell safe to re-run.
os.makedirs(frames_folder, exist_ok=True)
os.makedirs(annotated_folder, exist_ok=True)


In [None]:
# Sample the input video at 0.5 FPS and store each sampled frame, resized so
# that its shortest side is 800 px, as frame_XXXX.jpg in `frames_folder`.
cap = cv2.VideoCapture(input_video_path)
if not cap.isOpened():
    # Fail here with a clear message instead of silently extracting 0 frames.
    raise IOError(f"Could not open video: {input_video_path}")

frame_count = 0
saved = 0

try:
    fps = cap.get(cv2.CAP_PROP_FPS)
    # 0.5 FPS = every 2 seconds. Clamp to at least 1 so a missing or very low
    # reported FPS cannot make the modulo below divide by zero.
    interval = max(1, int(fps * 2))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % interval == 0:
            # Resize shortest side to 800 while keeping aspect ratio.
            # round() rather than int(): float error can yield 799.999...,
            # which truncation would turn into 799.
            h, w = frame.shape[:2]
            scale = 800 / min(h, w)
            new_size = (round(w * scale), round(h * scale))
            resized = cv2.resize(frame, new_size)
            out_path = os.path.join(frames_folder, f"frame_{saved:04d}.jpg")
            if not cv2.imwrite(out_path, resized):
                raise IOError(f"Could not write frame: {out_path}")
            saved += 1
        frame_count += 1
finally:
    # Release the capture even when reading or writing fails midway.
    cap.release()

print(f"Extracted and resized {saved} frames at 0.5 FPS.")


Extracted and resized 5 frames at 0.5 FPS.


In [None]:
# Run the detector on every extracted frame and save a copy with the
# 'person' detections drawn on it under the same file name.
# Only the .jpg frames are listed: any other directory entry (hidden files,
# checkpoint folders, ...) would make Image.open fail.
frame_paths = sorted(
    name for name in os.listdir(frames_folder)
    if name.lower().endswith(".jpg")
)

for frame_name in tqdm(frame_paths):
    frame_path = os.path.join(frames_folder, frame_name)
    # NOTE: this must stay bound to the notebook-level name `im`;
    # filter_bboxes_from_outputs reads `im.size` when no size is passed.
    with Image.open(frame_path) as src:
        im = src.convert("RGB")
    img = transform(im).unsqueeze(0)

    with torch.no_grad():
        outputs = model(img)

    probas_to_keep, bboxes_scaled = filter_bboxes_from_outputs(outputs, threshold=0.9)

    # Draw bounding boxes (only for 'person')
    draw = ImageDraw.Draw(im)
    for p, (xmin, ymin, xmax, ymax) in zip(probas_to_keep, bboxes_scaled.tolist()):
        cl = p.argmax().item()
        if CLASSES[cl] == 'person':
            draw.rectangle([xmin, ymin, xmax, ymax], outline='red', width=3)
            draw.text((xmin, ymin), f"person {p[cl]:.2f}", fill='red')

    im.save(os.path.join(annotated_folder, frame_name))


100%|██████████| 5/5 [00:36<00:00,  7.30s/it]


In [None]:
# Assemble the annotated frames (in sorted file-name order) into an mp4
# written at 0.5 FPS, matching the rate the frames were sampled at.
annotated_images = sorted(os.listdir(annotated_folder))
if not annotated_images:
    # Without this guard the [0] lookup below fails with a bare IndexError.
    raise ValueError(f"No annotated frames found in: {annotated_folder}")

# The first frame defines the video dimensions.
sample_frame = cv2.imread(os.path.join(annotated_folder, annotated_images[0]))
if sample_frame is None:
    raise ValueError("Failed to load the first frame to get dimensions.")
height, width = sample_frame.shape[:2]

out = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), 0.5, (width, height))
if not out.isOpened():
    # An unopened writer would otherwise drop every frame without an error.
    raise IOError(f"Could not open video writer for: {output_video_path}")

written = 0
try:
    for img_name in annotated_images:
        frame = cv2.imread(os.path.join(annotated_folder, img_name))
        if frame is not None:
            out.write(frame)
            written += 1
        else:
            print(f" Could not read frame: {img_name}")
finally:
    # Finalize the file even if reading or writing a frame raises.
    out.release()

print(f"Video saved to: {output_video_path}")
print(f" Total frames written: {written}")


Video saved to: /content/drive/MyDrive/output_detr_20250613-030434.mp4
 Total frames written: 5


In [None]:
# Sanity check: count the annotated frames and confirm the first one is readable.
annotated_images = sorted(os.listdir(annotated_folder))
print(f"Found {len(annotated_images)} annotated frames.")

if not annotated_images:
    # An empty folder is exactly what this cell should report, not crash on.
    print(" No annotated frames to check.")
else:
    # Check if the first frame can be read
    test_img = cv2.imread(os.path.join(annotated_folder, annotated_images[0]))
    if test_img is None:
        print(" Failed to read the first annotated frame.")
    else:
        print(" First frame shape:", test_img.shape)


Found 5 annotated frames.
 First frame shape: (1422, 800, 3)


In [None]:
# Size of the written video file, converted from bytes to KB; as the last
# expression of the cell, the value is displayed as the cell output.
os.path.getsize(output_video_path) / 1024  # size in KB


222.291015625

In [None]:
# Download the output video from the Colab runtime to the local system.
from google.colab import files
files.download(output_video_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from IPython.display import HTML
from base64 import b64encode
import os
import subprocess

#  Input and output paths
# `save_path` is the video written to Drive earlier in the notebook (same
# timestamped name); `compressed_path` is a local re-encoded copy.
save_path = f"/content/drive/MyDrive/output_detr_{timestamp}.mp4"
compressed_path = "/content/result_compressed.mp4"

#  Re-encode with ffmpeg (H.264, CRF 28), presumably so the browser can play
#  the result inline -- confirm if the codec choice matters.
# The argument list avoids shell quoting problems with the interpolated path,
# and check=True raises on failure instead of falling through to a missing or
# stale compressed file.
subprocess.run(
    ["ffmpeg", "-y", "-i", save_path,
     "-vcodec", "libx264", "-crf", "28", compressed_path],
    check=True,
)

#  Load and encode compressed video
with open(compressed_path, "rb") as f:
    video_data = f.read()
data_url = "data:video/mp4;base64," + b64encode(video_data).decode()

#  Display inline video (must stay the last expression of the cell)
HTML(f"""
<video width="500" controls>
    <source src="{data_url}" type="video/mp4">
    Your browser does not support the video tag.
</video>
""")


In [None]:
#from PIL import Image
#import requests
#im = '/content/drive/MyDrive/peddet_2.jpg'





In [None]:
#from PIL import Image
#im = Image.open('/content/drive/MyDrive/peddet_2.jpg').convert("RGB")

In [None]:
# mean-std normalize the input image (batch-size: 1)
#img = transform(im).unsqueeze(0)

# propagate through the model
#outputs = model(img)

In [None]:
#for threshold in [0.9]:

 # probas_to_keep, bboxes_scaled = filter_bboxes_from_outputs(outputs,threshold=threshold)

  #plot_results(im, probas_to_keep, bboxes_scaled)