In [1]:
import torch
from PIL import Image
import torchvision.transforms as T
import matplotlib.pyplot as plt
import cv2
import numpy as np
import os
from scipy.optimize import linear_sum_assignment



In [2]:
# Modules are created in training mode; switch to eval mode right after
# loading so dropout is disabled and inference results are deterministic.
# Module.eval() returns the module itself, so the assignment is unchanged.
model = torch.hub.load('facebookresearch/detr:main', 'detr_resnet50', pretrained=True).eval()

Using cache found in /home/gabriele/.cache/torch/hub/facebookresearch_detr_main


In [3]:
# Class names indexed by the label id predicted by the model: plot_results
# looks names up as CLASSES[argmax of the score vector], and index 1 is
# 'person' (the id detect_pedestrians filters on).
# 'N/A' entries are placeholders that keep the list aligned with the label
# ids — presumably ids unused by the COCO label set; verify against the DETR docs.
CLASSES = [
    'N/A', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A',
    'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse',
    'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack',
    'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis',
    'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove',
    'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich',
    'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A',
    'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A',
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier',
    'toothbrush'
]

In [4]:
# Palette of six colour triples with components in [0, 1]; plot_results
# repeats this list and passes one entry per box as the rectangle colour.
COLORS = [[0.000, 0.447, 0.741], [0.850, 0.325, 0.098], [0.929, 0.694, 0.125],
          [0.494, 0.184, 0.556], [0.466, 0.674, 0.188], [0.301, 0.745, 0.933]]

In [5]:
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (center_x, center_y, width, height) to corner format.

    Args:
        x: Tensor whose last dimension has size 4, laid out as
            (cx, cy, w, h). Any number of leading dimensions is accepted:
            a single box of shape (4,), a set (N, 4), a batch (B, N, 4), ...

    Returns:
        Tensor of the same shape whose last dimension is laid out as
        (x_min, y_min, x_max, y_max).
    """
    # Operate on the last dimension so the helper is not tied to (N, 4) input.
    x_c, y_c, w, h = x.unbind(-1)
    half_w, half_h = 0.5 * w, 0.5 * h
    corners = [x_c - half_w, y_c - half_h, x_c + half_w, y_c + half_h]
    return torch.stack(corners, dim=-1)

# Disabled earlier version of rescale_bboxes, kept as a bare string literal
# (it is never executed). It unpacks `size` as (width, height, _), whereas the
# rescale_bboxes defined later in the notebook unpacks (height, width, _).
'''def rescale_bboxes(out_bbox, size):
    img_w, img_h, _ = size
    b = box_cxcywh_to_xyxy(out_bbox)
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b'''

def plot_results(pil_img, prob, boxes):
    """Show an image with one labelled rectangle per detection.

    Args:
        pil_img: Image handed directly to ``plt.imshow``.
        prob: Iterable of per-detection score vectors; the index of the
            largest score selects the displayed name from ``CLASSES``.
        boxes: Object exposing ``tolist()`` that yields
            ``(xmin, ymin, xmax, ymax)`` rows, one per detection.
    """
    plt.figure(figsize=(16, 10))
    plt.imshow(pil_img)
    axes = plt.gca()

    # The palette is tiled so colours repeat from one detection to the next.
    palette = COLORS * 100
    for scores, corners, colour in zip(prob, boxes.tolist(), palette):
        left, top, right, bottom = corners
        outline = plt.Rectangle((left, top), right - left, bottom - top,
                                fill=False, color=colour, linewidth=3)
        axes.add_patch(outline)

        # Caption: best-scoring class name followed by its score.
        best = scores.argmax()
        caption = f'{CLASSES[best]}: {scores[best]:0.2f}'
        axes.text(left, top, caption, fontsize=15,
                  bbox=dict(facecolor='yellow', alpha=0.5))

    plt.axis('off')
    plt.show()
    

# Disabled generic `detect` function, kept as a bare string literal (it is
# never executed). Because this string is the last expression of its cell,
# the notebook echoes its repr as the cell output.
'''def detect(model, im, transform = None, threshold_confidence = 0.7):
    if transform is None:
        # standard PyTorch mean-std input image normalization
        transform = T.Compose([
        T.Resize(800),
        T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    img = transform(im).unsqueeze(0)

    # demo model only support by default images with aspect ratio between 0.5 and 2
    # if you want to use images with an aspect ratio outside this range
    # rescale your image so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'

    # propagate through the model
    outputs = model(img)

    # keep only predictions with a confidence > threshold_confidence
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > threshold_confidence

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.size)
    return probas[keep], bboxes_scaled'''

"def detect(model, im, transform = None, threshold_confidence = 0.7):\n    if transform is None:\n        # standard PyTorch mean-std input image normalization\n        transform = T.Compose([\n        T.Resize(800),\n        T.ToTensor(),\n        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])\n        ])\n\n    img = transform(im).unsqueeze(0)\n\n    # demo model only support by default images with aspect ratio between 0.5 and 2\n    # if you want to use images with an aspect ratio outside this range\n    # rescale your image so that the maximum size is at most 1333 for best results\n    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'\n\n    # propagate through the model\n    outputs = model(img)\n\n    # keep only predictions with a confidence > threshold_confidence\n    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]\n    keep = probas.max(-1).values > threshold_confidence\n\n    # convert bo

In [6]:
def rescale_bboxes(boxes, size):
    """Convert (cx, cy, w, h) boxes to corner format and scale them to an image.

    Args:
        boxes: Tensor of boxes in (cx, cy, w, h) format, forwarded to
            ``box_cxcywh_to_xyxy``. Presumably normalized to [0, 1] relative
            to the image — confirm against the model output.
        size: Image shape in NumPy order, ``(height, width)`` or
            ``(height, width, channels)``; only the first two entries are used.

    Returns:
        Tensor of (x_min, y_min, x_max, y_max) boxes multiplied by the image
        width (x coordinates) and height (y coordinates).
    """
    # Only height and width matter, so 2-D (grayscale) shapes are accepted too.
    img_h, img_w = size[:2]

    corners = box_cxcywh_to_xyxy(boxes)
    # Build the scale factors on the same device as the boxes so the
    # multiplication also works when the boxes are not on the CPU.
    scale = torch.tensor([img_w, img_h, img_w, img_h],
                         dtype=torch.float32, device=corners.device)
    return corners * scale

In [7]:
# Pedestrian detection (adapted from the professor's generic detect function)
def detect_pedestrians(model, im, transform = None, threshold_confidence = 0.7):
    """Run the detector on one frame and return the boxes classified as 'person'.

    Args:
        model: Detection model returning a dict with ``'pred_logits'`` and
            ``'pred_boxes'``.
        im: Image as an array exposing ``.shape`` (height, width, ...). The
            default transform converts it with ``ToPILImage``, which treats a
            3-channel array as RGB; frames decoded by OpenCV are BGR and
            should be converted by the caller.
        transform: Optional preprocessing callable producing a CHW tensor.
            Defaults to resize-to-800 plus ImageNet mean/std normalization.
        threshold_confidence: Minimum class probability for a prediction to
            be kept.

    Returns:
        Tuple ``(boxes, labels)``: ``boxes`` is a NumPy array of
        ``(x_min, y_min, x_max, y_max)`` rows scaled to ``im``; ``labels`` is
        the tensor of class ids of those same boxes (one per returned row).
    """
    # Label id of 'person' in the CLASSES list.
    person_class_id = 1

    if transform is None:
        # standard PyTorch mean-std input image normalization
        transform = T.Compose([
            T.ToPILImage(),
            T.Resize(800),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])

    img = transform(im).unsqueeze(0)

    # demo model only support by default images with aspect ratio between 0.5 and 2
    # if you want to use images with an aspect ratio outside this range
    # rescale your image so that the maximum size is at most 1333 for best results
    assert img.shape[-2] <= 1600 and img.shape[-1] <= 1600, 'demo model only supports images up to 1600 pixels on each side'

    # Inference only: disabling autograd avoids building a graph (and holding
    # its activations in memory) for every processed frame.
    with torch.no_grad():
        outputs = model(img)

    # Class probabilities for the single image in the batch, without the
    # trailing column (which DETR reserves for "no object").
    probas = outputs['pred_logits'].softmax(-1)[0, :, :-1]
    labels = probas.argmax(-1)

    # keep only confident predictions whose best class is 'person'
    keep = (probas.max(-1).values > threshold_confidence) & (labels == person_class_id)

    # convert boxes from [0; 1] to image scales
    bboxes_scaled = rescale_bboxes(outputs['pred_boxes'][0, keep], im.shape)

    # Return the labels of the kept boxes only, so both outputs are aligned.
    return bboxes_scaled.detach().cpu().numpy(), labels[keep]

In [8]:
class Tracker:
    """Track bounding boxes across frames using Hungarian assignment.

    Every track is a dict with the keys ``'bbox'`` (last matched box as
    ``(x1, y1, x2, y2)``), ``'id'`` (unique integer) and ``'lost'`` (number of
    consecutive updates in which the track had no matching detection).
    """

    def __init__(self, max_lost_frames=30, max_match_cost=None):
        """Create an empty tracker.

        Args:
            max_lost_frames: A track is removed once its ``'lost'`` counter
                exceeds this value.
            max_match_cost: Optional gate on the assignment. When set, an
                assigned (track, detection) pair whose cost is greater than
                this value is rejected: the track counts as lost and the
                detection starts a new track. ``None`` (default) accepts every
                pair chosen by the assignment.
        """
        # List of active tracks
        self.trackers = []

        # Counter used to hand out unique IDs
        self.track_counter = 0

        # How many updates a track may stay unmatched before it is removed
        self.max_lost_frames = max_lost_frames

        # Optional upper bound on the cost of an accepted match
        self.max_match_cost = max_match_cost

    def update_tracker(self, detections):
        """Match detections to existing tracks and update the track list.

        Matched tracks take the detection's box and reset ``'lost'``;
        unmatched tracks have ``'lost'`` incremented; unmatched detections
        become new tracks; tracks lost for too long are removed.

        Args:
            detections: Sized, indexable collection of ``(x1, y1, x2, y2)``
                boxes (for example an ``(N, 4)`` array). May be empty.
        """
        num_tracks = len(self.trackers)
        num_detections = len(detections)

        matches = []
        # The assignment is only meaningful when both sides are non-empty;
        # otherwise every track/detection is simply unmatched.
        if num_tracks and num_detections:
            cost_matrix = np.zeros((num_tracks, num_detections))
            for t_idx, track in enumerate(self.trackers):
                for d_idx, detection in enumerate(detections):
                    cost_matrix[t_idx, d_idx] = self.compute_cost(track['bbox'], detection)

            # Hungarian algorithm: minimum-cost one-to-one pairing
            row_indices, col_indices = linear_sum_assignment(cost_matrix)

            for t_idx, d_idx in zip(row_indices.tolist(), col_indices.tolist()):
                too_costly = (self.max_match_cost is not None
                              and cost_matrix[t_idx, d_idx] > self.max_match_cost)
                if not too_costly:
                    matches.append((t_idx, d_idx))

        matched_tracks = {t_idx for t_idx, _ in matches}
        matched_detections = {d_idx for _, d_idx in matches}

        # Matched tracks follow their detection and are no longer lost
        for t_idx, d_idx in matches:
            self.trackers[t_idx]['bbox'] = detections[d_idx]
            self.trackers[t_idx]['lost'] = 0

        # Unmatched tracks accumulate lost updates
        for t_idx in range(num_tracks):
            if t_idx not in matched_tracks:
                self.trackers[t_idx]['lost'] += 1

        # Unmatched detections start new tracks, in detection order so that
        # ID assignment is deterministic
        for d_idx in range(num_detections):
            if d_idx not in matched_detections:
                self._start_track(detections[d_idx])

        # Drop tracks that have been lost for too long
        self.trackers = [t for t in self.trackers if t['lost'] <= self.max_lost_frames]

    def _start_track(self, bbox):
        """Append a new track for ``bbox`` with the next free ID."""
        self.trackers.append({'bbox': bbox, 'id': self.track_counter, 'lost': 0})
        self.track_counter += 1

    def compute_cost(self, tracker, detection):
        """Return the matching cost between a track box and a detection box.

        The cost is ``1 - IoU + 0.5 * distance`` where ``distance`` is the
        Euclidean distance between the box centers. The distance is expressed
        in the boxes' coordinate units, so with pixel coordinates it usually
        outweighs the IoU term.

        Args:
            tracker: Track box as ``(x1, y1, x2, y2)``.
            detection: Detection box as ``(x1, y1, x2, y2)``.
        """
        t_x1, t_y1, t_x2, t_y2 = tracker
        d_x1, d_y1, d_x2, d_y2 = detection
        iou = self.iou(tracker, detection)
        track_center = np.array([(t_x1 + t_x2) / 2, (t_y1 + t_y2) / 2])
        detection_center = np.array([(d_x1 + d_x2) / 2, (d_y1 + d_y2) / 2])
        dist = np.linalg.norm(track_center - detection_center)
        return 1 - iou + 0.5 * dist

    def iou(self, box1, box2):
        """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

        Returns 0 when the union has zero area.
        """
        x1 = max(box1[0], box2[0])
        y1 = max(box1[1], box2[1])
        x2 = min(box1[2], box2[2])
        y2 = min(box1[3], box2[3])
        # Clamp at zero so disjoint boxes yield an empty intersection
        inter_area = max(0, x2 - x1) * max(0, y2 - y1)
        box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
        box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
        union_area = box1_area + box2_area - inter_area
        return inter_area / union_area if union_area != 0 else 0

In [9]:
def process_video(video_path):
    """Detect and track pedestrians in a video, displaying annotated frames.

    Frames are read with OpenCV, passed to ``detect_pedestrians`` using the
    module-level ``model``, and the resulting tracks are drawn on the frame.
    Press ``q`` in the display window to stop early.

    Args:
        video_path: Path (or other source accepted by ``cv2.VideoCapture``)
            of the video to process.
    """
    cap = cv2.VideoCapture(video_path)
    tracker = Tracker()

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # OpenCV decodes frames as BGR, while the detector's default
            # preprocessing interprets arrays as RGB. Convert a copy for
            # detection and keep the BGR frame for drawing and display.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            detections, _ = detect_pedestrians(im=rgb_frame, model=model)
            tracker.update_tracker(detections)

            for track in tracker.trackers:
                x1, y1, x2, y2 = map(int, track['bbox'])
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                cv2.putText(frame, f'ID: {track["id"]}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            cv2.imshow('Multi-Object Tracking', frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Release the capture and close the window even if detection fails
        # or the run is interrupted.
        cap.release()
        cv2.destroyAllWindows()

In [10]:
def process_image_folder(folder_path, frame_size=(640, 360), detection_interval=5, show=False):
    """Track pedestrians over a folder of frames and return the tracks per frame.

    Image files are processed in sorted path order. Each frame is resized to
    ``frame_size``; the detector (module-level ``model``) runs every
    ``detection_interval`` frames and the most recent detections are reused in
    between.

    Args:
        folder_path: Directory containing the frames (.png/.jpg/.jpeg,
            matched case-insensitively).
        frame_size: ``(width, height)`` passed to ``cv2.resize``. Returned
            boxes are expressed in this resized coordinate system.
        detection_interval: Run the detector on every N-th frame (must not
            be 0).
        show: When True, draw the tracks and display each frame in an OpenCV
            window; pressing ``q`` stops early.

    Returns:
        List with one entry per processed frame; each entry is a list of
        dicts ``{'id', 'bbox', 'lost'}`` describing the tracker state after
        that frame, with ``'bbox'`` as ``[x1, y1, x2, y2]`` floats.

    Raises:
        OSError: If an image file cannot be read.
    """
    tracker = Tracker()
    image_extensions = ('.png', '.jpg', '.jpeg')
    frame_files = sorted(
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.lower().endswith(image_extensions)
    )

    tracks_per_frame = []
    detections = []
    try:
        for frame_count, frame_file in enumerate(frame_files):
            frame = cv2.imread(frame_file)
            # cv2.imread signals failure by returning None instead of raising.
            if frame is None:
                raise OSError(f'Could not read image: {frame_file}')
            frame = cv2.resize(frame, frame_size)

            if frame_count % detection_interval == 0:
                # OpenCV loads images as BGR, while the detector's default
                # preprocessing interprets arrays as RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                detections, _ = detect_pedestrians(im=rgb_frame, model=model)
            tracker.update_tracker(detections)

            # Snapshot the tracker state; boxes are copied to plain floats so
            # later updates cannot alter earlier snapshots.
            tracks_per_frame.append([
                {'id': track['id'],
                 'bbox': [float(v) for v in track['bbox']],
                 'lost': track['lost']}
                for track in tracker.trackers
            ])

            if show:
                for track in tracker.trackers:
                    x1, y1, x2, y2 = map(int, track['bbox'])
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(frame, f'ID: {track["id"]}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

                cv2.imshow('Multi-Object Tracking', frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        if show:
            cv2.destroyAllWindows()

    return tracks_per_frame

In [11]:
# Disabled single-image smoke test, kept as a bare string literal (it is never
# executed; as the cell's last expression its repr is echoed as output).
# NOTE(review): it no longer matches detect_pedestrians, which reads `im.shape`
# and returns (boxes, labels); it also hard-codes an absolute local path.
'''image_path ="/home/ivan/Unipa/Materie/Visione artificiale/Assignments/Assignment 3/dataset/test/MOT17-01-DPM/img1/000001.jpg"
model = torch.hub.load('facebookresearch/detr:main', 'detr_resnet50', pretrained=True)
img = Image.open(image_path)
prob, bboxes_scaled = detect_pedestrians(model, img)
plot_results(img, prob, bboxes_scaled)'''

'image_path ="/home/ivan/Unipa/Materie/Visione artificiale/Assignments/Assignment 3/dataset/test/MOT17-01-DPM/img1/000001.jpg"\nmodel = torch.hub.load(\'facebookresearch/detr:main\', \'detr_resnet50\', pretrained=True)\nimg = Image.open(image_path)\nprob, bboxes_scaled = detect_pedestrians(model, img)\nplot_results(img, prob, bboxes_scaled)'

In [12]:
# process_video('test.mp4')

In [14]:
# Sequence to process, named so it can be changed in one place.
MOT_SEQUENCE_DIR = '../MOT17/test/MOT17-08-FRCNN/img1'

# Bind the call's result to a name instead of leaving the call as the cell's
# last expression: whatever the function returns stays available for
# inspection and is not echoed as cell output.
tracking_results = process_image_folder(MOT_SEQUENCE_DIR, frame_size=(640, 360), detection_interval=5)

KeyboardInterrupt: 