# Object Detection su Video con MobileNetV3 SSD

* **Studente**: Fabrizio Soppelsa
* **Insegnamento**: Visione Artificiale, UniPA, A.A. 2025/2026

Applicazione della pipeline MobileNetV3 SSD per object detection su video.

In [1]:
# Import necessari
import torch
import cv2
import numpy as np
from pathlib import Path
from PIL import Image
from IPython.display import Video
from tqdm.notebook import tqdm
import time
from torchvision.models.detection import ssdlite320_mobilenet_v3_large
from mobilenet import mobilenetv3_pipeline

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")



Device: cuda


In [2]:
# Classi COCO (91 classi totali, indice 0 è background)
COCO_CLASSES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

## Video con Animali

Testiamo la detection su un video con conigli, uccelli, farfalle e scoiattoli.

In [3]:
# Visualizza video con animali (Big Buck Bunny - 25s clip)
nature_video = Path('data/mobilenet/videos/nature_sample.mp4')

cap = cv2.VideoCapture(str(nature_video))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
duration = total_frames / fps
cap.release()

print(f"Video: {width}x{height}, {fps} fps, {duration:.1f}s ({total_frames} frames)")

Video(str(nature_video), width=800)

Video: 1280x720, 24 fps, 25.0s (600 frames)


In [4]:
# Processa video con animali (25s clip)
video_input = 'data/mobilenet/videos/nature_sample.mp4'
video_output = 'data/mobilenet/videos/nature_detected.mp4'

cap = cv2.VideoCapture(video_input)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

# Processa tutto il video (25s)
max_frames = total_frames

# Setup device e carica modello UNA VOLTA
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SKIP_FRAMES = 2  # Processa 1 frame ogni 2 (migliore accuratezza)
print(f"Processing 25s clip ({max_frames} frames, every {SKIP_FRAMES}th) on {device}")

# CARICA MODELLO UNA VOLTA (non ad ogni frame!)
detection_model = ssdlite320_mobilenet_v3_large(weights='COCO_V1')
detection_model.to(device).eval()

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter(video_output, fourcc, fps, (width, height))

# Verifica che il writer sia stato creato correttamente
if not writer.isOpened():
    print(f"ERRORE: Impossibile creare video writer per {video_output}")
    cap.release()
    raise RuntimeError(f"Cannot create video writer for {video_output}")
else:
    print(f"Video writer creato: {video_output}")

np.random.seed(42)
colors = [(np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255)) 
          for _ in range(len(COCO_CLASSES))]

frame_count = 0
total_objects = 0
times = []
last_result = None

try:
    for _ in tqdm(range(max_frames), desc="Video animali..."):
        ret, frame = cap.read()
        if not ret:
            print(f"\Frame corrotto a {frame_count}")
            break
        if frame_count >= max_frames:
            break
        
        frame_count += 1
        
        # Processa solo 1 frame ogni SKIP_FRAMES
        if frame_count % SKIP_FRAMES == 1:
            # Resize frame per velocizzare (640x360 invece di originale)
            frame_resized = cv2.resize(frame, (640, 360))
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)
            
            start = time.time()
            results = mobilenetv3_pipeline(pil_image, device=device, conf_threshold=0.6, model=detection_model)
            detection_time = (time.time() - start) * 1000
            times.append(detection_time)
            
            # Scala boxes al frame originale
            scale_x = width / 640
            scale_y = height / 360
            if results['num_detections'] > 0:
                results['boxes'][:, [0, 2]] *= scale_x
                results['boxes'][:, [1, 3]] *= scale_y
            
            last_result = results
        else:
            results = last_result if last_result else {'num_detections': 0, 'boxes': [], 'labels': [], 'scores': []}
        
        num_objects = results['num_detections']
        if frame_count % SKIP_FRAMES == 1:
            total_objects += num_objects
        
        for i in range(num_objects):
            x1, y1, x2, y2 = results['boxes'][i].astype(int)
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(width, x2), min(height, y2)
            
            label_idx = int(results['labels'][i])
            class_name = COCO_CLASSES[label_idx] if label_idx < len(COCO_CLASSES) else f"class_{label_idx}"
            score = results['scores'][i]
            color = colors[label_idx % len(colors)]
            
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            
            label_text = f"{class_name}: {score:.2f}"
            (tw, th), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv2.rectangle(frame, (x1, y1 - th - baseline - 5), (x1 + tw, y1), color, -1)
            cv2.putText(frame, label_text, (x1, y1 - baseline - 2),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
        
        info = f"Frame {frame_count}/{max_frames} | Objects: {num_objects}"
        cv2.putText(frame, info, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
        
        writer.write(frame)

except Exception as e:
    print(f"\n\nERRORE durante il processing al frame {frame_count}: {type(e).__name__}: {e}")
    import traceback
    traceback.print_exc()
finally:
    cap.release()
    writer.release()
    print(f"\nRisorse rilasciate (cap & writer)")

# Verifica che il file esista
if Path(video_output).exists():
    file_size = Path(video_output).stat().st_size / (1024*1024)
    print(f"Video salvato: {video_output} ({file_size:.1f} MB)")
else:
    print(f"ERRORE: Il file {video_output} non è stato creato!")

# Converti in H.264 per compatibilità browser
print("\nConverto a H.264 per ipynbb...")
import subprocess
temp_output = video_output.replace('.mp4', '_temp.mp4')
result = subprocess.run([
    'ffmpeg', '-i', video_output, 
    '-c:v', 'libx264', '-preset', 'fast', '-crf', '23', '-pix_fmt', 'yuv420p',
    temp_output, '-y'
], capture_output=True, text=True)

if result.returncode == 0:
    import os
    os.replace(temp_output, video_output)
    print(f"OK!")
else:
    print(f"Errore di conversione: {result.stderr}")

Processing 25s clip (600 frames, every 2th) on cuda
Video writer creato: data/mobilenet/videos/nature_detected.mp4


Video animali...:   0%|          | 0/600 [00:00<?, ?it/s]


Risorse rilasciate (cap & writer)
Video salvato: data/mobilenet/videos/nature_detected.mp4 (10.7 MB)

Converto a H.264 per ipynbb...
OK!


In [5]:
# Visualizza video processato con animali rilevati
nature_detected = Path('data/mobilenet/videos/nature_detected.mp4')

Video(str(nature_detected), width=800)

## Video con Uccelli (Song birds)

Testiamo su un video più lungo (1 minuto) con scene di uccelli e altri animali.

In [6]:
# Visualizza video uccelli (Big Buck Bunny - 1 minuto)
birds_video = Path('data/mobilenet/videos/song_birds.mp4')

cap = cv2.VideoCapture(str(birds_video))
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
duration = total_frames / fps
cap.release()

print(f"Video: {width}x{height}, {fps:.0f} fps, {duration:.1f}s ({total_frames} frames)")

Video(str(birds_video), width=800)

Video: 360x240, 29 fps, 72.7s (2107 frames)


In [7]:
# Processa video uccelli (1 minuto Big Buck Bunny)
video_input = 'data/mobilenet/videos/song_birds.mp4'
video_output = 'data/mobilenet/videos/song_birds_detected.mp4'

cap = cv2.VideoCapture(video_input)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
max_frames = total_frames

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SKIP_FRAMES = 2
print(f"Birds ({max_frames} frames, every {SKIP_FRAMES}th) on {device}")

# Riusa modello già caricato se disponibile
if 'detection_model' not in locals():
    detection_model = ssdlite320_mobilenet_v3_large(weights='COCO_V1')
    detection_model.to(device).eval()

fourcc = cv2.VideoWriter_fourcc(*'mp4v')
writer = cv2.VideoWriter(video_output, fourcc, fps, (width, height))

if not writer.isOpened():
    print(f"ERRORE: Impossibile creare video writer")
    cap.release()
    raise RuntimeError(f"Cannot create video writer")
else:
    print(f"Video writer creato")

np.random.seed(42)
colors = [(np.random.randint(0, 255), np.random.randint(0, 255), np.random.randint(0, 255)) 
          for _ in range(len(COCO_CLASSES))]

frame_count = 0
total_objects = 0
times = []
last_result = None

try:
    for _ in tqdm(range(max_frames), desc="Uccelli cartoon"):
        ret, frame = cap.read()
        if not ret:
            break
        
        frame_count += 1
        
        if frame_count % SKIP_FRAMES == 1:
            frame_resized = cv2.resize(frame, (640, 360))
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(frame_rgb)
            
            start = time.time()
            results = mobilenetv3_pipeline(pil_image, device=device, conf_threshold=0.6, model=detection_model)
            detection_time = (time.time() - start) * 1000
            times.append(detection_time)
            
            scale_x = width / 640
            scale_y = height / 360
            if results['num_detections'] > 0:
                results['boxes'][:, [0, 2]] *= scale_x
                results['boxes'][:, [1, 3]] *= scale_y
            
            last_result = results
        else:
            results = last_result if last_result else {'num_detections': 0, 'boxes': [], 'labels': [], 'scores': []}
        
        num_objects = results['num_detections']
        if frame_count % SKIP_FRAMES == 1:
            total_objects += num_objects
        
        for i in range(num_objects):
            x1, y1, x2, y2 = results['boxes'][i].astype(int)
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(width, x2), min(height, y2)
            
            label_idx = int(results['labels'][i])
            class_name = COCO_CLASSES[label_idx] if label_idx < len(COCO_CLASSES) else f"class_{label_idx}"
            score = results['scores'][i]
            color = colors[label_idx % len(colors)]
            
            cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
            
            label_text = f"{class_name}: {score:.2f}"
            (tw, th), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
            cv2.rectangle(frame, (x1, y1 - th - baseline - 5), (x1 + tw, y1), color, -1)
            cv2.putText(frame, label_text, (x1, y1 - baseline - 2),
                       cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1)
        
        info = f"Frame {frame_count}/{max_frames} | Objects: {num_objects}"
        cv2.putText(frame, info, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
        
        writer.write(frame)

except Exception as e:
    print(f"\n\nERRORE: {type(e).__name__}: {e}")
finally:
    cap.release()
    writer.release()
    print(f"\nRisorse rilasciate")

if Path(video_output).exists():
    file_size = Path(video_output).stat().st_size / (1024*1024)
    print(f"Video salvato: {file_size:.1f} MB")

# Converti in H.264
print("Converto a H.264...")
import subprocess
temp_output = video_output.replace('.mp4', '_temp.mp4')
result = subprocess.run([
    'ffmpeg', '-i', video_output, 
    '-c:v', 'libx264', '-preset', 'fast', '-crf', '23', '-pix_fmt', 'yuv420p',
    temp_output, '-y'
], capture_output=True, text=True)

if result.returncode == 0:
    import os
    os.replace(temp_output, video_output)
    print(f"OK!")
    if times:
        avg_time = np.mean(times)
        avg_fps = 1000 / avg_time if avg_time > 0 else 0
        print(f"\n{len(times)} frames processati, {total_objects} oggetti | {avg_time:.1f}ms/frame, {avg_fps:.1f} fps")
else:
    print(f"Errore: {result.stderr}")

Birds (2107 frames, every 2th) on cuda
Video writer creato


Uccelli cartoon:   0%|          | 0/2107 [00:00<?, ?it/s]


Risorse rilasciate
Video salvato: 13.0 MB
Converto a H.264...
OK!

1049 frames processati, 402 oggetti | 178.2ms/frame, 5.6 fps


In [8]:
from pathlib import Path
from IPython.display import Video

# Visualizza video processato uccelli
birds_detected = Path('data/mobilenet/videos/song_birds_detected.mp4')

Video(str(birds_detected), width=800)