# License Plate OCR Model Testing & Validation

We are benchmarking four distinct approaches here: EasyOCR, PyTesseract, our Custom OCR, and the Refined Custom OCR.

The core of this evaluation is the Intersection over Union (IoU) calculation. This metric tells us how much our predicted bounding box overlaps with the ground truth. If the overlap is over our threshold (standard 0.5), it's a hit (True Positive). Anything else is either a miss (False Negative) or a ghost (False Positive). We're also calculating Mean Average Precision (mAP) across multiple thresholds to see which model has the most "stamina" when we tighten the requirements for accuracy.

Dataset: CatEye-ALPR-v3-3 (pre-cropped license plates)

# Benchmarking Dataset

### Merge the train, test, and valid folders into one folder

In [18]:
import os
import shutil

src_root = 'CatEye-ALPR-v3-3'
dst_root = 'CatEye-ALPR-v3-3-Merged'

subfolders = ['images', 'labels']
splits = ['train', 'test', 'valid']

# Make sure the flat destination layout (<dst_root>/images, <dst_root>/labels) exists.
for subfolder in subfolders:
    os.makedirs(os.path.join(dst_root, subfolder), exist_ok=True)

# Copy every file of every split into the matching merged subfolder.
for split in splits:
    for subfolder in subfolders:
        src_dir = os.path.join(src_root, split, subfolder)
        if not os.path.exists(src_dir):
            # This split does not ship this subfolder; nothing to copy.
            continue

        dst_dir = os.path.join(dst_root, subfolder)
        for fname in os.listdir(src_dir):
            dst_file = os.path.join(dst_dir, fname)
            if os.path.exists(dst_file):
                # A file with this name was already merged; leave it untouched.
                continue
            src_file = os.path.join(src_dir, fname)
            shutil.copy2(src_file, dst_file)

The merged folder is the dataset on which all models are benchmarked.

## Model Initialization and Configuration

In this section, we set up our hardware acceleration and load our pre-trained weights. We are using a standard ImageNet normalization for our custom models to ensure the input distribution matches what they saw during training.

In [4]:
import torch
import ultralytics
import easyocr
import pytesseract
from PIL import Image
import numpy as np
from torchvision import transforms
import time
from collections import defaultdict
import matplotlib.pyplot as plt

# Device configuration: prefer the GPU when one is visible to PyTorch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load custom models.
# SECURITY: weights_only=False makes torch.load use the full pickle
# unpickler, which can execute arbitrary code embedded in the file. Only load
# checkpoints that come from a trusted source.
custom_model = torch.load('models/custom_ocr.pt', map_location=device, weights_only=False)
refined_model = torch.load('models/custom_ocr_refined.pt', map_location=device, weights_only=False)

# Put the networks into inference mode so layers such as dropout and batch
# norm behave deterministically during benchmarking. The isinstance guard
# keeps this cell working if a checkpoint is not a plain nn.Module.
for _loaded_model in (custom_model, refined_model):
    if isinstance(_loaded_model, torch.nn.Module):
        _loaded_model.eval()

# Initialize EasyOCR (English only); use the GPU when available.
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())

# Preprocessing pipeline: PIL image -> float tensor -> per-channel
# normalization with the mean/std values listed below.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

Using CPU. Note: This module is much faster with a GPU.
Downloading detection model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

Downloading recognition model, please wait. This may take several minutes depending upon your network connection.


Progress: |██████████████████████████████████████████████████| 100.0% Complete

## Inference Wrappers

These functions standardize the output from different libraries. No matter the engine, we want a dictionary containing bounding boxes, confidence scores, and the clock time it took to finish.

In [5]:
def get_easyocr_predictions(image_path):
    """
    Run EasyOCR on one image and return its detections in the common format.
    Inputs: image_path (str)
    Returns: dict(boxes: np.ndarray, scores: np.ndarray, inference_time: float)
             boxes are axis-aligned [x1, y1, x2, y2] rows; an image with no
             detections yields a (0, 4) boxes array and an empty scores array.
    """
    t0 = time.time()
    detections = reader.readtext(image_path)
    elapsed = time.time() - t0

    rects = []
    scores = []
    for corners, _text, confidence in detections:
        # EasyOCR reports a polygon; collapse it to its enclosing rectangle.
        xs = [point[0] for point in corners]
        ys = [point[1] for point in corners]
        rects.append([min(xs), min(ys), max(xs), max(ys)])
        scores.append(confidence)

    if rects:
        box_array = np.array(rects)
    else:
        box_array = np.array([]).reshape(0, 4)

    if scores:
        score_array = np.array(scores)
    else:
        score_array = np.array([])

    return {
        'boxes': box_array,
        'scores': score_array,
        'inference_time': elapsed
    }

def get_pytesseract_predictions(image_path):
    """
    Wraps Tesseract OCR engine.
    Inputs: image_path (str)
    Returns: dict(boxes: np.ndarray, scores: np.ndarray, inference_time: float)
             boxes are [x1, y1, x2, y2] rows, scores are confidences scaled
             to 0-1. Only entries with a confidence above zero are kept.
    """
    start = time.time()
    # Context manager so the image file handle is released after OCR.
    with Image.open(image_path) as img:
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    inference_time = time.time() - start

    boxes = []
    confidences = []
    columns = zip(data['conf'], data['left'], data['top'], data['width'], data['height'])
    for raw_conf, x, y, w, h in columns:
        # The confidence may arrive as an int, a float, or a numeric string
        # such as '96.06'; float() accepts all of them, whereas int() raises
        # on fractional strings and strings cannot be divided by 100.
        conf = float(raw_conf)
        if conf > 0:
            boxes.append([x, y, x + w, y + h])
            confidences.append(conf / 100.0)

    return {
        'boxes': np.array(boxes) if boxes else np.array([]).reshape(0, 4),
        'scores': np.array(confidences) if confidences else np.array([]),
        'inference_time': inference_time
    }

def get_custom_model_predictions(model, image_path, conf_threshold=0.5):
    """
    Inference for custom PyTorch detection models.
    Inputs: model (nn.Module), image_path (str), conf_threshold (float)
    Returns: dict(boxes: np.ndarray, scores: np.ndarray, inference_time: float)
             Only detections with score >= conf_threshold are returned.

    The model output may be either a dict whose 'boxes'/'scores' entries are
    indexed by batch position, or a sequence of per-image dicts; the first
    (only) image of the batch is read in both cases.
    """
    start = time.time()
    # Context manager so the file handle is closed once the pixels are decoded.
    with Image.open(image_path) as raw_img:
        img = raw_img.convert('RGB')
    img_tensor = transform(img).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(img_tensor)

    # CUDA kernels run asynchronously: without waiting for them the timer
    # would stop before the forward pass has actually finished on the GPU.
    if img_tensor.is_cuda:
        torch.cuda.synchronize()
    inference_time = time.time() - start

    if isinstance(output, dict):
        boxes = output['boxes'][0].cpu().numpy()
        scores = output['scores'][0].cpu().numpy()
    else:
        boxes = output[0]['boxes'].cpu().numpy()
        scores = output[0]['scores'].cpu().numpy()

    # Drop low-confidence detections.
    mask = scores >= conf_threshold
    boxes = boxes[mask]
    scores = scores[mask]

    return {
        'boxes': boxes,
        'scores': scores,
        'inference_time': inference_time
    }

## Metric Calculation Logic
This is the scorecard. We are calculating IoU for overlap, standard Precision/Recall, and Mean Average Precision (mAP) to ensure our models aren't just getting lucky.

In [6]:
def calculate_iou(box1, box2):
    """
    Intersection over Union of two axis-aligned boxes given as [x1, y1, x2, y2].
    Inputs: box1 (list/np.array), box2 (list/np.array)
    Returns: float (IoU score); 0 when the union has no positive area
    """
    # Size of the overlapping rectangle, clamped at zero for disjoint boxes.
    overlap_w = max(0, min(box1[2], box2[2]) - max(box1[0], box2[0]))
    overlap_h = max(0, min(box1[3], box2[3]) - max(box1[1], box2[1]))
    intersection = overlap_w * overlap_h

    first_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    second_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = first_area + second_area - intersection

    if not union > 0:
        return 0
    return intersection / union

def calculate_metrics(pred_boxes, pred_scores, gt_boxes, iou_threshold=0.5):
    """
    Precision/recall bookkeeping for one image.
    Inputs: pred_boxes (np.array), pred_scores (np.array), gt_boxes (np.array), iou_threshold (float)
    Returns: dict(precision: float, recall: float, tp: int, fp: int, fn: int)

    Predictions are visited from highest to lowest score; each one is paired
    with the still-unmatched ground-truth box it overlaps most, and counts as
    a true positive when that IoU reaches iou_threshold.
    """
    n_pred = len(pred_boxes)
    n_gt = len(gt_boxes)

    # Nothing predicted: every ground-truth box is a miss.
    if n_pred == 0:
        empty_recall = 0.0 if n_gt > 0 else 1.0
        return {'precision': 0.0, 'recall': empty_recall, 'tp': 0, 'fp': 0, 'fn': n_gt}

    # Nothing to find: every prediction is a false alarm.
    if n_gt == 0:
        return {'precision': 0.0, 'recall': 1.0, 'tp': 0, 'fp': n_pred, 'fn': 0}

    claimed = set()
    hits = 0
    false_alarms = 0

    for pred_idx in np.argsort(pred_scores)[::-1]:
        candidate = pred_boxes[pred_idx]
        best_overlap = 0
        best_gt = -1

        for gt_idx, gt_box in enumerate(gt_boxes):
            if gt_idx not in claimed:
                overlap = calculate_iou(candidate, gt_box)
                if overlap > best_overlap:
                    best_overlap = overlap
                    best_gt = gt_idx

        if best_overlap >= iou_threshold:
            hits += 1
            claimed.add(best_gt)
        else:
            false_alarms += 1

    missed = n_gt - len(claimed)
    n_judged = hits + false_alarms
    n_relevant = hits + missed
    precision = hits / n_judged if n_judged > 0 else 0
    recall = hits / n_relevant if n_relevant > 0 else 0

    return {'precision': precision, 'recall': recall, 'tp': hits, 'fp': false_alarms, 'fn': missed}

def calculate_map(pred_boxes, pred_scores, gt_boxes, iou_thresholds=(0.5, 0.75)):
    """
    Computes mAP across specified IoU thresholds.
    Inputs: pred_boxes (np.array), pred_scores (np.array), gt_boxes (np.array),
            iou_thresholds (sequence of float)
    Returns: float (mAP score), the mean of the per-threshold average precisions

    For each threshold, predictions are ranked by descending score. A
    prediction is a true positive when its best-overlapping ground-truth box
    reaches the threshold and has not been claimed by a higher-ranked
    prediction; otherwise it is a false positive. AP is the area under the
    precision envelope of the resulting precision/recall curve. When there
    are no predictions or no ground-truth boxes every AP is 0.0.
    """
    nothing_to_score = len(pred_boxes) == 0 or len(gt_boxes) == 0

    best_matches = []
    if not nothing_to_score:
        # The ranking and each prediction's best-overlapping ground-truth box
        # do not depend on the IoU threshold, so compute them only once.
        sorted_indices = np.argsort(pred_scores)[::-1]
        sorted_boxes = np.asarray(pred_boxes)[sorted_indices]
        for pred_box in sorted_boxes:
            max_iou, max_gt_idx = 0, -1
            for gt_idx, gt_box in enumerate(gt_boxes):
                iou = calculate_iou(pred_box, gt_box)
                if iou > max_iou:
                    max_iou, max_gt_idx = iou, gt_idx
            best_matches.append((max_iou, max_gt_idx))

    aps = []
    for iou_thresh in iou_thresholds:
        if nothing_to_score:
            aps.append(0.0)
            continue

        matched_gt = [False] * len(gt_boxes)
        tp = np.zeros(len(best_matches))
        fp = np.zeros(len(best_matches))

        for pred_idx, (max_iou, max_gt_idx) in enumerate(best_matches):
            # max_gt_idx stays -1 when the prediction overlaps nothing; the
            # explicit check keeps that from indexing the last ground truth.
            if max_gt_idx >= 0 and max_iou >= iou_thresh and not matched_gt[max_gt_idx]:
                tp[pred_idx] = 1
                matched_gt[max_gt_idx] = True
            else:
                fp[pred_idx] = 1

        tp_cumsum, fp_cumsum = np.cumsum(tp), np.cumsum(fp)
        recalls = tp_cumsum / len(gt_boxes)
        # Every prediction is either TP or FP, so the denominator is >= 1.
        precisions = tp_cumsum / (tp_cumsum + fp_cumsum)

        # Sentinels so the curve spans recall 0..1.
        recalls = np.concatenate(([0], recalls, [1]))
        precisions = np.concatenate(([0], precisions, [0]))

        # Precision envelope: make precision non-increasing in recall.
        for i in range(len(precisions) - 1, 0, -1):
            precisions[i - 1] = max(precisions[i - 1], precisions[i])

        # Sum rectangle areas at the points where recall changes.
        indices = np.where(recalls[1:] != recalls[:-1])[0]
        ap = np.sum((recalls[indices + 1] - recalls[indices]) * precisions[indices + 1])
        aps.append(ap)

    return np.mean(aps)

## Benchmarking Execution
We'll iterate through the merged dataset and gather the stats for every model.

In [None]:
import os
import torch
from PIL import Image
from torch.utils.data import Dataset, Subset
from tqdm.auto import tqdm

class LicensePlateDataset(Dataset):
    """
    Custom Dataset for License Plate Detection benchmarking.
    Inputs: 
        root_dir (str): Path to merged dataset folder (must contain
            'images' and 'labels' subfolders)
        transform (callable): Preprocessing transforms
    Returns: 
        image (Tensor), target (dict with 'boxes' key)

    Ground-truth boxes are parsed once at construction time and stored as
    float tensors of shape (N, 4) in absolute-pixel [x1, y1, x2, y2] form;
    an image without labels gets an empty (0, 4) tensor.
    """
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        self.image_dir = os.path.join(root_dir, 'images')
        self.label_dir = os.path.join(root_dir, 'labels')

        # Sorted so sample order is reproducible across runs.
        self.imgs = [os.path.join(self.image_dir, f) for f in sorted(os.listdir(self.image_dir))
                     if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

        self.targets = []
        self._load_targets()

    @staticmethod
    def _parse_yolo_labels(label_path, img_w, img_h):
        """
        Read one YOLO label file and return its boxes as a list of
        [x1, y1, x2, y2] rows in absolute pixels.

        Blank lines are skipped. A non-blank line that does not have exactly
        five numeric fields raises ValueError.
        """
        boxes = []
        with open(label_path, 'r') as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate empty / trailing blank lines.
                    continue
                # YOLO: class, x_center, y_center, width, height (normalized)
                _, x_c, y_c, bw, bh = map(float, fields)

                # Scale to absolute pixels
                boxes.append([
                    (x_c - bw / 2) * img_w,
                    (y_c - bh / 2) * img_h,
                    (x_c + bw / 2) * img_w,
                    (y_c + bh / 2) * img_h,
                ])
        return boxes

    def _load_targets(self):
        """
        Parses YOLO format labels for every image into self.targets.
        Image dimensions are read inside a context manager so each file is
        closed immediately after its size is known.
        """
        for img_path in tqdm(self.imgs, desc="Preparing Dataset Metadata"):
            with Image.open(img_path) as img:
                w, h = img.size

            stem = os.path.splitext(os.path.basename(img_path))[0]
            label_path = os.path.join(self.label_dir, stem + '.txt')

            boxes = []
            if os.path.exists(label_path):
                boxes = self._parse_yolo_labels(label_path, w, h)

            # reshape keeps the (N, 4) layout even when there are no boxes.
            self.targets.append({'boxes': torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4)})

    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, idx):
        img_path = self.imgs[idx]
        # convert() returns a fully loaded copy, so the file can be closed.
        with Image.open(img_path) as raw_img:
            img = raw_img.convert("RGB")
        target = self.targets[idx]

        if self.transform:
            img = self.transform(img)

        return img, target

# Build the dataset over the merged folder and wrap all of it in a Subset,
# so later cells can address samples through `.dataset` and `.indices`.
full_dataset = LicensePlateDataset(
    root_dir='CatEye-ALPR-v3-3-Merged',
    transform=transform,
)
indices = [*range(len(full_dataset))]
test_dataset = Subset(full_dataset, indices)

KeyboardInterrupt: 

In [7]:
def benchmark_model(model_name, predict_fn, test_data, iou_threshold=0.5):
    """
    Evaluate one model over every (image_path, gt_boxes) pair in test_data.
    Inputs: model_name (str), predict_fn (callable), test_data (list), iou_threshold (float)
    Returns: dict (aggregated metrics): per-image means of precision, recall,
             mAP and inference time, plus summed TP/FP/FN counts
    """
    per_image = []
    timings = []
    map_scores = []

    for image_path, gt_boxes in test_data:
        prediction = predict_fn(image_path)
        boxes = prediction['boxes']
        scores = prediction['scores']

        image_metrics = calculate_metrics(boxes, scores, gt_boxes, iou_threshold)
        image_map = calculate_map(boxes, scores, gt_boxes)

        per_image.append(image_metrics)
        timings.append(prediction['inference_time'])
        map_scores.append(image_map)

    def _total(key):
        # Sum one integer counter over all evaluated images.
        return sum(entry[key] for entry in per_image)

    summary = {'model': model_name}
    summary['avg_precision'] = np.mean([entry['precision'] for entry in per_image])
    summary['avg_recall'] = np.mean([entry['recall'] for entry in per_image])
    summary['avg_map'] = np.mean(map_scores)
    summary['avg_inference_time'] = np.mean(timings)
    summary['total_tp'] = _total('tp')
    summary['total_fp'] = _total('fp')
    summary['total_fn'] = _total('fn')
    return summary

# Preparation of test data: pair each image path with its ground-truth boxes
# (as a NumPy array), following the Subset's index mapping.
test_data = [
    (
        test_dataset.dataset.imgs[source_idx],
        test_dataset.dataset.targets[source_idx]['boxes'].numpy(),
    )
    for source_idx in test_dataset.indices
]

# Execute benchmarks: one entry per engine, run in this order.
benchmark_plan = [
    ('EasyOCR', get_easyocr_predictions),
    ('Pytesseract', get_pytesseract_predictions),
    ('Custom OCR', lambda x: get_custom_model_predictions(custom_model, x)),
    ('Custom OCR Refined', lambda x: get_custom_model_predictions(refined_model, x)),
]

results = []
for engine_name, engine_fn in benchmark_plan:
    # Append as we go so finished results survive a failure in a later engine.
    results.append(benchmark_model(engine_name, engine_fn, test_data))

NameError: name 'test_dataset' is not defined