## Imports

In [1]:
import subprocess
subprocess.run(["git", "clone", "https://github.com/ultralytics/yolov5.git"], check=True)
subprocess.run(["pip", "install", "-r", "yolov5/requirements.txt", "--quiet"], check=True)

Cloning into 'yolov5'...

[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: pip install --upgrade pip


CompletedProcess(args=['pip', 'install', '-r', 'yolov5/requirements.txt', '--quiet'], returncode=0)

In [2]:
import cv2
import os
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import yaml
import urllib
import torch.nn as nn
from utils_esteban import * # utils file for custom datasets and models
from torchvision.ops import box_iou, nms
import torch.nn.functional as F
from Levenshtein import distance as levenshtein_distance
import pandas as pd
import numpy as np

In [3]:
import sys
sys.path.append("yolov5")

from models.yolo import Model
from utils.loss import ComputeLoss

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## YOLO model from paper

In [5]:
# loads the trained weights
model = Model('yolov5/models/yolov5s.yaml', ch=3, nc=1).to(device)

state_dict = torch.load("model_weights/paper_based/my_yolov5.pth", map_location=device)
model.load_state_dict(state_dict) 

Overriding model.yaml nc=80 with nc=1

                 from  n    params  module                                  arguments                     
  0                -1  1      3520  models.common.Conv                      [3, 32, 6, 2, 2]              
  1                -1  1     18560  models.common.Conv                      [32, 64, 3, 2]                
  2                -1  1     18816  models.common.C3                        [64, 64, 1]                   
  3                -1  1     73984  models.common.Conv                      [64, 128, 3, 2]               
  4                -1  2    115712  models.common.C3                        [128, 128, 2]                 
  5                -1  1    295424  models.common.Conv                      [128, 256, 3, 2]              
  6                -1  3    625152  models.common.C3                        [256, 256, 3]                 
  7                -1  1   1180672  models.common.Conv                      [256, 512, 3, 2]             

<All keys matched successfully>

In [6]:
%rm -rf yolov5

In [7]:
# creates the dataset and dataloader
# ATTENTION: the dataset is not the same as train, the evaluation is over an OOD dataset
dataset = Data_Yolo("../../CCPD2019/ccpd_base")
limited_dataset = torch.utils.data.Subset(dataset, indices=range(1000))
loader = DataLoader(limited_dataset, batch_size=8, shuffle=False)

In [8]:
def xywh_to_xyxy(boxes):
    """Convert [x_center, y_center, w, h] → [x1, y1, x2, y2]"""
    x_c, y_c, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    x1 = x_c - w / 2
    y1 = y_c - h / 2
    x2 = x_c + w / 2
    y2 = y_c + h / 2
    return torch.stack([x1, y1, x2, y2], dim=1)

def evaluate_yolo(model, dataloader, device, conf_thres=0.25, iou_thres=0.45, iou_eval_thres=0.7, img_size=640):
    model.eval()
    correct_detections = 0
    total_images = 0

    with torch.no_grad():
        for images, _, gt_bboxes in tqdm(dataloader):
            images = images.to(device)
            preds_raw = model(images)[0]

            for i, pred in enumerate(preds_raw):
                # filter by confidence
                pred = pred[pred[:, 4] >= conf_thres]
                if pred.size(0) == 0:
                    total_images += 1
                    continue

                # non-maximum suppression (as in paper)
                boxes_xywh = pred[:, :4]
                scores = pred[:, 4]
                boxes_xyxy = xywh_to_xyxy(boxes_xywh)
                keep = nms(boxes_xyxy, scores, iou_thres)
                pred_boxes = boxes_xyxy[keep].cpu()

                # resizing to ground truth bbox
                gt_box_norm = gt_bboxes[i]
                x1 = gt_box_norm[0].item() * img_size
                y1 = gt_box_norm[1].item() * img_size
                x2 = gt_box_norm[2].item() * img_size
                y2 = gt_box_norm[3].item() * img_size
                gt_box_abs = torch.tensor([[x1, y1, x2, y2]], dtype=torch.float32)

                # IoU
                iou_matrix = box_iou(pred_boxes, gt_box_abs)
                max_iou = iou_matrix.max().item()

                if max_iou >= iou_eval_thres:
                    correct_detections += 1

                total_images += 1


    accuracy = 100.0 * correct_detections / total_images
    print(f"YOLO Detection Accuracy (IoU > {iou_eval_thres}): {accuracy:.2f}%")
    return accuracy


In [9]:
evaluate_yolo(model, loader, device)

100%|██████████| 125/125 [01:40<00:00,  1.24it/s]

YOLO Detection Accuracy (IoU > 0.7): 93.40%





93.4

## PDLPR from paper

In [10]:
model_pdlpr = PDLPRModel()
model_pdlpr.load_state_dict(torch.load("model_weights/paper_based/pdlpr_model_weights.pth", map_location=device))

<All keys matched successfully>

In [11]:
# utils for decoding the labels

provinces = ["皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤", "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学", "O"]
alphabets = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
             'X', 'Y', 'Z', 'O']
ads = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
       'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'O']

full_charset = provinces[:-1] + alphabets[:-1] + ads[:-1]
char_to_idx = {char: idx+1 for idx, char in enumerate(full_charset)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

# decodes plate from the model
def decode_plate_model(indices):
    return ''.join([idx_to_char.get(idx, '') for idx in indices if idx != 0])


In [12]:
def evaluate_yolo_and_pdlpr(model_yolo, model_pdlpr, dataloader_yolo, dataloader_pdlpr, device, conf_thres=0.25, iou_thres=0.45, iou_eval_thres=0.7, img_size=640):
    model_yolo.eval()
    model_pdlpr.eval()

    correct_detections = 0
    total_images = 0

    correct_plates = 0
    total_plates = 0
    avg_levenshtein = 0

    with torch.no_grad():
        for (images_y, gt_texts, gt_bboxes), (images_p, _, _) in zip(tqdm(dataloader_yolo, desc="Evaluating"), dataloader_pdlpr):
            images_y = images_y.to(device)
            preds_raw = model_yolo(images_y)[0]

            for i, pred in enumerate(preds_raw):
                pred = pred[pred[:, 4] >= conf_thres]
                if pred.size(0) == 0:
                    total_images += 1
                    continue
                
                # non-maximum suppression (as in paper)
                boxes_xywh = pred[:, :4]
                scores = pred[:, 4]
                boxes_xyxy = xywh_to_xyxy(boxes_xywh)
                keep = nms(boxes_xyxy, scores, iou_thres)
                pred_boxes = boxes_xyxy[keep].cpu()

                # ground truth bbox in pixels
                gt_box_norm = gt_bboxes[i]
                x1 = gt_box_norm[0].item() * img_size
                y1 = gt_box_norm[1].item() * img_size
                x2 = gt_box_norm[2].item() * img_size
                y2 = gt_box_norm[3].item() * img_size
                gt_box_abs = torch.tensor([[x1, y1, x2, y2]], dtype=torch.float32)

                # IoU
                iou_matrix = box_iou(pred_boxes, gt_box_abs)
                max_iou = iou_matrix.max().item()
                if max_iou >= iou_eval_thres:
                    correct_detections += 1

                    # crop and resize best bbox
                    best_idx = torch.argmax(iou_matrix[:, 0]).item()
                    pred_box = pred_boxes[best_idx].int()
                    x1_, y1_, x2_, y2_ = pred_box.tolist()
                    crop = images_p[i, :, y1_:y2_, x1_:x2_] # cropping the other dataset image

                    if crop.numel() == 0:
                        continue

                    crop_resized = F.interpolate(crop.unsqueeze(0), size=(48, 144), mode='bilinear')

                    # predict plate text
                    output = model_pdlpr(crop_resized.to(device))
                    pred_indices = output.argmax(dim=-1).squeeze().tolist()
                    pred_text = decode_plate_model(pred_indices)
                    gt_text = gt_texts[i]
                    total_plates += 1
                    if pred_text == gt_text:
                        correct_plates += 1
                    avg_levenshtein += levenshtein_distance(pred_text, gt_text) / max(len(gt_text), 1)

                total_images += 1

    detection_accuracy = 100.0 * correct_detections / total_images if total_images > 0 else 0.0
    recognition_accuracy = 100.0 * correct_plates / total_plates if total_plates > 0 else 0.0
    avg_levenshtein /= total_plates if total_plates > 0 else 1 # normalized

    metrics_df = pd.DataFrame({
        "YOLO detection accuracy (IoU > {:.2f})".format(iou_eval_thres): [detection_accuracy],
        "PDLPR recognition accuracy": [recognition_accuracy],
        "Normalized Levenshtein distance": [avg_levenshtein]
    })
    print(metrics_df)
    return detection_accuracy, recognition_accuracy, avg_levenshtein


In [13]:
# creates the dataset and dataloader
# ATTENTION: the dataset is not the same as train, the evaluation is over an OOD dataset
dataset_yolo = Data_Yolo("../../CCPD2019/ccpd_base")
limited_dataset_yolo = torch.utils.data.Subset(dataset_yolo, indices=range(1000))
loader_yolo = DataLoader(limited_dataset, batch_size=8, shuffle=False)

# different dataset because of different transformations
dataset_pdlpr = Data_Yolo("../../CCPD2019/ccpd_base")
limited_dataset_pdlpr = torch.utils.data.Subset(dataset_pdlpr, indices=range(1000))
loader_pdlpr = DataLoader(limited_dataset_pdlpr, batch_size=8, shuffle=False)

In [14]:
evaluate_yolo_and_pdlpr(model, model_pdlpr, loader_yolo, loader_pdlpr, device)

Evaluating: 100%|██████████| 125/125 [02:13<00:00,  1.07s/it]

   YOLO detection accuracy (IoU > 0.70)  PDLPR recognition accuracy  \
0                                  93.4                         0.0   

   Normalized Levenshtein distance  
0                              1.0  





(93.4, 0.0, 1.0)

PDLPR model colapsed even with low CTC loss, as the model is known to work, the most probable reason is that the training is not enough (1000 epochs in paper, 300 epochs done here because of limited ressources), or the weights initialization is problematic. 

93.4% of accuracy obtained on YOLO over an out of distribution dataset.

## Baseline

In [15]:
# creates the dataset and dataloader
# ATTENTION: the dataset is not the same as train, the evaluation is over an OOD dataset
dataset_base = BaselineData("../../CCPD2019/ccpd_base")
limited_dataset_base = torch.utils.data.Subset(dataset_base, indices=range(1000))
loader_base = DataLoader(limited_dataset_base, batch_size=8, shuffle=False)

In [16]:
# utils for seq2seq

# utils for the OCR model
CHARS = [
    "皖", "沪", "津", "渝", "冀", "晋", "蒙", "辽", "吉", "黑", "苏", "浙", "京", "闽", "赣", "鲁", "豫", "鄂", "湘", "粤",
    "桂", "琼", "川", "贵", "云", "藏", "陕", "甘", "青", "宁", "新", "警", "学",
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S',
    'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
]

# needed for the Seq2Seq model
SPECIAL = ['<PAD>', '<BOS>', '<EOS>']
VOCAB = SPECIAL + sorted(set(CHARS))
char2idx = {c: i for i, c in enumerate(VOCAB)}
idx2char = {i: c for c, i in char2idx.items()}

PAD_IDX = char2idx['<PAD>']
BOS_IDX = char2idx['<BOS>']
EOS_IDX = char2idx['<EOS>']
VOCAB_SIZE = len(VOCAB)

In [17]:
model_cnn = BoundingBoxCNN()
model_cnn.load_state_dict(torch.load("model_weights/baseline/bounding_boxes_baseline.pth", map_location=device))

model_seq2seq = LicensePlateSeq2Seq(vocab_size=VOCAB_SIZE, max_len=8)
model_seq2seq.load_state_dict(torch.load("model_weights/baseline/ocr_model.pth", map_location=device))

<All keys matched successfully>

In [18]:
def evaluate_baseline(model_cnn, model_ocr, dataloader, device, iou_eval_thres=0.7):
    model_cnn.eval()
    model_ocr.eval()

    correct_detections = 0
    total_images = 0

    correct_plates = 0
    total_plates = 0
    avg_levenshtein = 0

    with torch.no_grad():
        for images, gt_texts, gt_bboxes in tqdm(dataloader, desc="Evaluating"):
            images = images.to(device)
            preds = model_cnn(images)
            bboxes = preds.squeeze().cpu().numpy()

            if bboxes.ndim == 1:
                bboxes = bboxes[np.newaxis, :]

            H, W = 1160, 720

            for i in range(len(bboxes)):
                x1, y1, x2, y2 = bboxes[i]
                x1, y1, x2, y2 = int(x1 * W), int(y1 * H), int(x2 * W), int(y2 * H)
                pred_boxes = torch.tensor([[x1, y1, x2, y2]], dtype=torch.float32)

                gt_box_norm = gt_bboxes[i]
                x1 = gt_box_norm[0].item() * W
                y1 = gt_box_norm[1].item() * H
                x2 = gt_box_norm[2].item() * W
                y2 = gt_box_norm[3].item() * H
                gt_box_abs = torch.tensor([[x1, y1, x2, y2]], dtype=torch.float32)

                # IoU
                iou_matrix = box_iou(pred_boxes, gt_box_abs)
                max_iou = iou_matrix.max().item()
                if max_iou >= iou_eval_thres:
                    correct_detections += 1

                    # crop and resize best bbox
                    best_idx = torch.argmax(iou_matrix[:, 0]).item()
                    pred_box = pred_boxes[best_idx].int()
                    x1_, y1_, x2_, y2_ = pred_box.tolist()
                    crop = images[i, :, y1_:y2_, x1_:x2_] # cropping the other dataset image

                    if crop.numel() == 0:
                        continue

                    crop_resized = F.interpolate(crop.unsqueeze(0), size=(48, 144), mode='bilinear')

                    # predict plate text
                    output = model_ocr(crop_resized.to(device), teacher_forcing=False)
                    pred_indices_batch = output.argmax(dim=-1).tolist()

                    batch_size = len(pred_indices_batch)
                    for i in range(batch_size):
                        pred_indices = pred_indices_batch[i]
                        pred_text = decode_plate_model(pred_indices)
                        gt_text = gt_texts[i]

                        total_plates += 1

                        if pred_text == gt_text:
                            correct_plates += 1

                        # avoid division by zero
                        max_len = max(len(gt_text), 1)
                        avg_levenshtein += levenshtein_distance(pred_text, gt_text) / max_len
                                

                total_images += 1

    detection_accuracy = 100.0 * correct_detections / total_images if total_images > 0 else 0.0
    recognition_accuracy = 100.0 * correct_plates / total_plates if total_plates > 0 else 0.0
    avg_levenshtein /= total_plates if total_plates > 0 else 1 # normalized

    metrics_df = pd.DataFrame({
        "CNN detection accuracy (IoU > {:.2f})".format(iou_eval_thres): [detection_accuracy],
        "Seq2Seq recognition accuracy": [recognition_accuracy],
        "Normalized Levenshtein distance": [avg_levenshtein]
    })
    print(metrics_df)
    return detection_accuracy, recognition_accuracy, avg_levenshtein


In [19]:
evaluate_baseline(model_cnn, model_seq2seq, loader_base, device)

Evaluating: 100%|██████████| 125/125 [03:29<00:00,  1.68s/it]

   CNN detection accuracy (IoU > 0.70)  Seq2Seq recognition accuracy  \
0                                  5.9                           0.0   

   Normalized Levenshtein distance  
0                         0.861985  





(5.9, 0.0, 0.8619854721549628)

Baseline was already observed to be bad in the training, but obtains 5.9% of accuracy and 0 for recognition, but it can be observed that the Levenshtein distance is not 1, in fact, the model predicts the two first characters of the majority of the plates.