## Imports

In [1]:
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import os
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
import numpy as np
import torch.nn.functional as F
from torchvision.transforms import Resize
from torchvision.transforms.functional import to_pil_image

## Data

In [3]:
# lookup tables for decoding the labels: a character's position in a table is its label index

# plate letters without 'I'; 'O' only appears as the trailing entry appended to each table below
_PLATE_LETTERS = list("ABCDEFGHJKLMNPQRSTUVWXYZ")

provinces = list("皖沪津渝冀晋蒙辽吉黑苏浙京闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新警学") + ["O"]
alphabets = _PLATE_LETTERS + ["O"]
ads = _PLATE_LETTERS + list("0123456789") + ["O"]

# decodes the plate from the file name
def decode_plate(label_str):
    """Turn an underscore-separated string of label indices into the plate text.

    The first index is looked up in `provinces`, the second in `alphabets`
    and every remaining index in `ads`; the characters are joined in order.
    """
    codes = [int(token) for token in label_str.split('_')]
    head = provinces[codes[0]] + alphabets[codes[1]]
    return head + ''.join(ads[code] for code in codes[2:])


In [4]:
# torch dataset
class LicensePlateCCPDDataset(Dataset):
    """Images of a CCPD folder, with the annotations parsed from each file name.

    Every item is a tuple ``(image, plate_text, bbox)``:
      * ``image``: float32 RGB tensor in channel-first layout, scaled to [0, 1].
      * ``plate_text``: the string returned by ``decode_plate`` for the label field.
      * ``bbox``: float32 tensor ``[x1, y1, x2, y2]``, x divided by the image width
        and y divided by the image height.

    File names are split on '-': field 2 holds the box as ``x1~y1_x2~y2`` and
    field 4 holds the underscore-separated character indices.
    """

    def __init__(self, image_dir):
        self.image_dir = image_dir
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> file mapping identical across runs and machines.
        self.image_files = sorted(f for f in os.listdir(image_dir) if f.endswith('.jpg'))

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        filename = self.image_files[idx]
        path = os.path.join(self.image_dir, filename)

        # image reading; cv2.imread returns None (it does not raise) when the
        # file cannot be read, so fail here with the offending path
        image = cv2.imread(path)
        if image is None:
            raise OSError(f"could not read image: {path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1) / 255.0

        # bounding box from file name
        parts = filename.split('-')
        top_left, bottom_right = parts[2].split('_')
        x1, y1 = map(int, top_left.split('~'))
        x2, y2 = map(int, bottom_right.split('~'))

        _, img_height, img_width = image.shape

        # normalize the bounding box by the size of this image
        bbox = torch.tensor(
            [x1 / img_width, y1 / img_height, x2 / img_width, y2 / img_height],
            dtype=torch.float32,
        )

        # decodes the plate
        plate_text = decode_plate(parts[4])

        return image, plate_text, bbox

In [6]:
# builds the dataset over the Kaggle image folder and a shuffled loader with batches of 32
CCPD_WEATHER_DIR = "/kaggle/input/ccpd-weather/ccpd_weather"
dataset = LicensePlateCCPDDataset(image_dir=CCPD_WEATHER_DIR)
loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

## Bounding box model

In [7]:
# CNN model for the bounding box
class BoundingBoxCNN(nn.Module):
    """Small convolutional regressor: image batch in, 4 sigmoid-squashed values per image out."""

    def __init__(self):
        super().__init__()

        # convolutions: three conv -> ReLU -> 2x2 max-pool stages (32, 64, 128 channels)
        # followed by a global average pool
        stages = []
        in_channels = 3
        for out_channels in (32, 64, 128):
            stages.extend([
                nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
                nn.ReLU(),
                nn.MaxPool2d(2),
            ])
            in_channels = out_channels
        stages.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.features = nn.Sequential(*stages)

        # fully connected layers; the final sigmoid keeps the 4 outputs within [0, 1]
        self.regressor = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, 4),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Run the convolutional features, then the regression head."""
        return self.regressor(self.features(x))

In [8]:
# picks the device, then builds the model, the loss and the optimizer used for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BoundingBoxCNN()
criterion = nn.MSELoss()
optimizer = Adam(model.parameters(), lr=1e-3)

# last expression of the cell: moves the model to the device and displays its structure
model.to(device)

BoundingBoxCNN(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): AdaptiveAvgPool2d(output_size=(1, 1))
  )
  (regressor): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=128, out_features=256, bias=True)
    (2): ReLU()
    (3): Linear(in_features=256, out_features=4, bias=True)
    (4): Sigmoid()
  )
)

In [None]:
# training loop: 20 passes over the loader, printing the mean per-batch MSE after each pass
for epoch in range(20):
    model.train()
    batch_losses = []
    for images, _, bboxes in tqdm(loader):
        images = images.to(device)
        bboxes = bboxes.to(device)

        optimizer.zero_grad()
        loss = criterion(model(images), bboxes)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())

    running_loss = sum(batch_losses)
    print(f"Epoch {epoch+1}, MSE Loss: {running_loss/len(loader):.4f}")

100%|██████████| 313/313 [08:17<00:00,  1.59s/it]


Epoch 1, MSE Loss: 0.0062


 10%|▉         | 31/313 [00:41<06:16,  1.34s/it]

In [None]:
# saves the weights (state_dict only) of the current model
bbox_weights_path = "bounding_boxes_baseline.pth"
torch.save(model.state_dict(), bbox_weights_path)

In [None]:
# inference on an image

# image reading and processing
image_path = "/kaggle/input/ccpd-weather/ccpd_weather/0088-0_1-284~433_435~482-434~481_284~482_285~434_435~433-0_12_22_2_30_28_33-125-27.jpg"
image_bgr = cv2.imread(image_path)
if image_bgr is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise OSError(f"could not read image: {image_path}")
img_height, img_width = image_bgr.shape[:2]

image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
image = torch.tensor(image, dtype=torch.float32).permute(2, 0, 1) / 255.0
image = image.unsqueeze(0).to(device)  # adds the batch dimension

# inference
model.eval()
with torch.no_grad():
    output = model(image)

# bounding box processing: the training targets were x / width and y / height,
# so the prediction is scaled back with the size of this image (not a fixed 720x1160)
bbox = output.squeeze().cpu().numpy()

x1, y1, x2, y2 = bbox
x1, y1, x2, y2 = int(x1*img_width), int(y1*img_height), int(x2*img_width), int(y2*img_height)
print(x1, x2, y1, y2)  # note the order: both x values first, then both y values
img_copy = image_bgr.copy()

# plot of the image with the bounding box
cv2.rectangle(img_copy, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=2)
img_rgb = cv2.cvtColor(img_copy, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(8, 6))
plt.imshow(img_rgb)
plt.axis('off')
plt.show()

The model runs, but it underfits because the network is not complex enough: the result is essentially the mean of all bounding boxes.

## OCR model

In [None]:
# utils for the OCR model: the characters a plate can contain, grouped by kind
_PROVINCE_CHARS = "皖沪津渝冀晋蒙辽吉黑苏浙京闽赣鲁豫鄂湘粤桂琼川贵云藏陕甘青宁新警学"
_LETTER_CHARS = "ABCDEFGHJKLMNPQRSTUVWXYZ"
_DIGIT_CHARS = "0123456789"
CHARS = list(_PROVINCE_CHARS + _LETTER_CHARS + _DIGIT_CHARS)

# needed for the Seq2Seq model: the special tokens take the first ids,
# followed by the characters in sorted order
SPECIAL = ['<PAD>', '<BOS>', '<EOS>']
VOCAB = SPECIAL + sorted(set(CHARS))
char2idx = {token: position for position, token in enumerate(VOCAB)}
idx2char = dict(enumerate(VOCAB))

PAD_IDX, BOS_IDX, EOS_IDX = (char2idx[token] for token in SPECIAL)
VOCAB_SIZE = len(VOCAB)

In [None]:
# encodes sequences with the labels for Seq2Seq
def encode_sequences(seqs, char2idx, max_len):
    """Encode strings as a LongTensor of token ids, one row of `max_len` ids per string.

    Each row is <BOS>, the id of every character, then <EOS>. Characters that
    are missing from `char2idx` are encoded as <PAD>. Rows longer than
    `max_len` are cut on the right (which can drop <EOS>); shorter rows are
    right-padded with <PAD>.
    """
    bos, eos, pad = (char2idx[token] for token in ('<BOS>', '<EOS>', '<PAD>'))

    def encode_one(text):
        # truncate first, then pad up to max_len
        ids = [bos, *(char2idx.get(ch, pad) for ch in text), eos][:max_len]
        return ids + [pad] * (max_len - len(ids))

    return torch.tensor([encode_one(text) for text in seqs], dtype=torch.long)

In [None]:
# OCR model
class LicensePlateSeq2Seq(nn.Module):
    """CNN encoder plus single-layer LSTM decoder producing `max_len` token logits per image.

    Args:
        vocab_size: number of tokens, special tokens included.
        embed_dim: size of the token embeddings.
        hidden_dim: size of the LSTM hidden state.
        max_len: number of decoding steps performed by `forward`.
        pad_idx: id of the padding token; when None the notebook-level PAD_IDX is used.
        bos_idx: id of the start token; when None the notebook-level BOS_IDX is used.
    """

    def __init__(self, vocab_size, embed_dim=256, hidden_dim=512, max_len=10, pad_idx=None, bos_idx=None):
        super().__init__()
        self.max_len = max_len
        # explicit ids make the class usable without the notebook globals;
        # the globals remain the default so existing calls behave as before
        self.pad_idx = PAD_IDX if pad_idx is None else pad_idx
        self.bos_idx = BOS_IDX if bos_idx is None else bos_idx

        # CNNs as encoders (the size comments are for a 48x144 input)
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 24x72
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(), nn.MaxPool2d(2),  # 12x36
            nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d((1, 1))  # 1x1
        )
        self.flatten = nn.Flatten()
        self.fc_enc = nn.Linear(256, hidden_dim)

        # LSTM for the text
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=self.pad_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=1, batch_first=True)
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, img, target_seq=None, teacher_forcing=True):
        """Decode `max_len` steps and return the logits stacked as [B, max_len, vocab_size].

        Step 0 is always fed the start token. With `teacher_forcing=True` the
        token fed to step t+1 is `target_seq[:, t]`; otherwise it is the argmax
        of the logits of step t (greedy decoding, for inference).

        Raises:
            ValueError: if `teacher_forcing` is True and `target_seq` is None or
                has fewer than `max_len` columns.
        """
        if teacher_forcing:
            # fail early with a clear message instead of a TypeError/IndexError inside the loop
            if target_seq is None:
                raise ValueError(
                    "teacher_forcing=True requires target_seq; pass teacher_forcing=False for inference"
                )
            if target_seq.size(1) < self.max_len:
                raise ValueError(
                    f"target_seq has {target_seq.size(1)} columns but max_len is {self.max_len}"
                )

        # encoder
        feat = self.encoder(img)
        feat = self.flatten(feat)
        # initial states for the LSTM: hidden state from the image features, zero cell state
        h0 = torch.tanh(self.fc_enc(feat))
        h0 = h0.unsqueeze(0)
        c0 = torch.zeros_like(h0)

        B = img.size(0)
        outputs = []
        input_token = torch.full((B, 1), self.bos_idx, dtype=torch.long, device=img.device)  # [B, 1]

        # generates the tokens through LSTM, one step at a time
        for t in range(self.max_len):
            embed = self.embedding(input_token)
            out, (h0, c0) = self.lstm(embed, (h0, c0))
            logits = self.output(out.squeeze(1))
            outputs.append(logits.unsqueeze(1))

            # set to true in training to help, uses the true sequence
            if teacher_forcing:
                input_token = target_seq[:, t].unsqueeze(1)
            else:
                input_token = logits.argmax(1).unsqueeze(1) # only for inference

        return torch.cat(outputs, dim=1)


In [None]:
# initializes the OCR model on the device, its loss (padding positions are ignored) and its optimizer
model = LicensePlateSeq2Seq(vocab_size=VOCAB_SIZE, max_len=8)
model = model.to(device)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
resize_crop = Resize((48, 144)) # image size considered (biased from the paper)

# training loop
for epoch in range(20):
    model.train()
    total_loss = 0

    for img, label_str, bbox in tqdm(loader):
        B = img.size(0)
        # the boxes were normalized by the image width/height, so they are scaled back
        # with the actual size of this batch instead of a hard-coded 720x1160
        img_height, img_width = img.shape[-2:]
        cropped_imgs = []

        for i in range(B):
            # invert the normalization; round() instead of int() because the float32
            # round trip can land just below the original integer pixel coordinate
            x1 = round(bbox[i][0].item() * img_width)
            y1 = round(bbox[i][1].item() * img_height)
            x2 = round(bbox[i][2].item() * img_width)
            y2 = round(bbox[i][3].item() * img_height)

            # crops and resizes the image
            crop_tensor = img[i][:, y1:y2, x1:x2]
            resized = resize_crop(crop_tensor)
            cropped_imgs.append(resized)

        # recreates a tensor, translates the labels; the target length follows the
        # model's own number of decoding steps so logits and targets always line up
        # NOTE(review): <BOS> + 7 plate characters + <EOS> is 9 tokens, so a length of 8
        # truncates <EOS> for 7-character plates; confirm whether that is intended.
        images = torch.stack(cropped_imgs).to(device)
        target_seq = encode_sequences(label_str, char2idx, max_len=model.max_len).to(device)

        # forward pass with teacher forcing
        output = model(images, target_seq=target_seq, teacher_forcing=True)       # [B, max_len, C]
        loss = criterion(output.reshape(-1, VOCAB_SIZE), target_seq.reshape(-1))  # [B*max_len, C] vs [B*max_len]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"[Epoch {epoch}] Loss: {total_loss / len(loader):.4f}")


In [None]:
# saves the weights (state_dict only) of the current model
ocr_weights_path = "ocr_model.pth"
torch.save(model.state_dict(), ocr_weights_path)

In [None]:
# inference for one image
model.eval()

img, label_str, bbox = next(iter(loader))
img0 = img[1]  # for just one image
label = label_str[1]
bbox0 = bbox[1]

# crops and resize: the box is scaled back with the actual image size (not a fixed
# 720x1160); round() because the float32 round trip can land just below the integer
img_height, img_width = img0.shape[-2:]
x1 = round(bbox0[0].item() * img_width)
y1 = round(bbox0[1].item() * img_height)
x2 = round(bbox0[2].item() * img_width)
y2 = round(bbox0[3].item() * img_height)

crop_tensor = img0[:, y1:y2, x1:x2]
crop_resized = resize_crop(crop_tensor)
image = crop_resized.unsqueeze(0).to(device)  # adds the batch dimension

# inference (greedy decoding, no teacher forcing)
with torch.no_grad():
    output = model(image, teacher_forcing=False)
    pred_indices = output.argmax(2).squeeze(0).tolist()

# decoding: stop at the first <EOS>, skip <PAD>/<BOS>, keep everything else
pred_chars = []
for idx in pred_indices:
    char = idx2char.get(idx, '')
    if char == '<EOS>':
        break
    if char not in ['<PAD>', '<BOS>']:
        pred_chars.append(char)
pred_text = ''.join(pred_chars)

# plot
print(pred_text)
pil_img = to_pil_image(crop_tensor)
plt.imshow(pil_img)
plt.axis("off")
plt.show()


The model collapses because the data is heavily imbalanced.