In [18]:
import os
import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from collections import Counter
from tqdm import tqdm

In [25]:
# Folders holding the full captcha images for each split.
train_path = "data/final_yolo_dataset_v3/images/train"
val_path = "data/final_yolo_dataset_v3/images/val"

# Root folder that receives the per-character crops; created up front.
char_dataset_root = "data/char_dataset"
os.makedirs(char_dataset_root, exist_ok=True)

# Label alphabet: the lowercase letters followed by the ten digits.
import string
CHARSET = "".join((string.ascii_lowercase, string.digits))
# Two-way lookup between a character and its class index (its position in CHARSET).
idx_to_char = dict(enumerate(CHARSET))
char_to_idx = {symbol: position for position, symbol in idx_to_char.items()}
NUM_CLASSES = len(CHARSET)

In [22]:
#segmentation function (preprocessing)
def segment_characters(image, min_area=50, min_height=10, max_aspect_ratio=3.5):
    """
    Splits a captcha image into candidate character crops.

    Pipeline: grayscale -> 3x3 median blur -> Otsu global threshold (inverted,
    so pixels at or below the threshold become the 255 foreground) -> 3x3
    morphological opening -> one dilation -> external contours -> padded
    bounding boxes.

    image: numpy image, either colour in BGR channel order or an
           already-grayscale 2-D array
    min_area: boxes whose w*h is below this are dropped as noise
    min_height: boxes shorter than this (in pixels) are dropped
    max_aspect_ratio: boxes with w/h above this are dropped (likely merged
                      characters or lines)
    returns: list of (x, y, w, h, crop) tuples sorted left->right by x, where
             the coordinates describe the padded box (clipped to the image)
             and crop is the matching region of the grayscale image
    raises: ValueError if image is None (e.g. cv2.imread could not read the file)
    """
    if image is None:
        raise ValueError("segment_characters: image is None (could the file not be read?)")

    # accept grayscale input as-is; only colour images need converting
    if image.ndim == 2:
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # median blur to reduce salt-and-pepper noise
    blurred = cv2.medianBlur(gray, 3)

    # Otsu picks one global threshold for the whole image (the 0 passed as
    # threshold is ignored when THRESH_OTSU is set)
    _, binary = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # opening removes small specks, the dilation then thickens what is left
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opened = cv2.morphologyEx(binary, cv2.MORPH_OPEN, kernel, iterations=1)
    dilated = cv2.dilate(opened, kernel, iterations=1)

    # outer contours only: holes inside characters are not reported separately
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    boxes = []
    h_img, w_img = gray.shape

    for c in contours:
        x, y, w, h = cv2.boundingRect(c)

        # filter small noise boxes and extreme aspect ratios
        if w * h < min_area:
            continue
        if h < min_height:
            continue
        if w / h > max_aspect_ratio:
            continue

        # expand the box a little for context (5% of width, 10% of height,
        # at least one pixel), clipped to the image bounds
        pad_x = max(1, int(0.05 * w))
        pad_y = max(1, int(0.1 * h))
        x1 = max(0, x - pad_x)
        y1 = max(0, y - pad_y)
        x2 = min(w_img, x + w + pad_x)
        y2 = min(h_img, y + h + pad_y)

        crop = gray[y1:y2, x1:x2]
        boxes.append((x1, y1, x2 - x1, y2 - y1, crop))

    # sort boxes left->right (by x)
    boxes.sort(key=lambda b: b[0])
    return boxes

In [23]:
def build_char_dataset_from_captchas(captcha_folder, out_root):
    """
    Processes all captcha images in captcha_folder, segments them,
    and writes each character crop into folder out_root/<char>/

    The ground-truth text of a captcha is the part of its file name before
    the first '-'. Crops are assigned to characters left->right:
      * same number of boxes as characters: mapped 1:1
      * more boxes than characters: the N widest boxes are kept
      * otherwise: the captcha is counted as unmatched and a copy is written
        to out_root/_unmatched/ for manual inspection
    Files that cv2.imread cannot read are skipped and counted as unmatched.

    captcha_folder: directory containing .png/.jpg/.jpeg captcha images
    out_root: directory that receives one sub-folder per character
    returns: None (prints a one-line summary)
    """
    os.makedirs(out_root, exist_ok=True)
    files = [f for f in os.listdir(captcha_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    unmatched_dir = os.path.join(out_root, "_unmatched")

    unmatched = 0
    total = 0

    for fname in files:
        total += 1
        path = os.path.join(captcha_folder, fname)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None instead of raising when a file cannot be
            # decoded; skip it rather than crash the whole build
            unmatched += 1
            continue
        boxes = segment_characters(img)

        # ground-truth label string (part before '-')
        label_str = fname.split('-')[0]
        n_chars = len(label_str)

        if len(boxes) == n_chars:
            # exactly one box per character: map 1:1
            selected = boxes
        elif len(boxes) > n_chars and n_chars > 0:
            # heuristic: keep the N widest boxes (likely characters), then
            # restore left->right order
            widest = sorted(boxes, key=lambda b: b[2], reverse=True)[:n_chars]
            selected = sorted(widest, key=lambda b: b[0])
        else:
            # fewer boxes than characters: skip, but keep the original captcha
            # for manual inspection
            unmatched += 1
            os.makedirs(unmatched_dir, exist_ok=True)
            cv2.imwrite(os.path.join(unmatched_dir, fname), img)
            continue

        for i, (ch, box) in enumerate(zip(label_str, selected)):
            # resize crop to the fixed size expected by the classifier
            crop_resized = cv2.resize(box[4], (28, 28))
            ch_dir = os.path.join(out_root, ch)
            os.makedirs(ch_dir, exist_ok=True)
            # name = originalfile_index.png
            out_name = os.path.join(ch_dir, f"{fname.replace('.','_')}_{i}.png")
            cv2.imwrite(out_name, crop_resized)

    print(f"Processed {total} captchas; unmatched/skipped: {unmatched}")

# Generate the per-character crop folders for both splits (train first, then val).
for split_name, captcha_dir in (("train", train_path), ("val", val_path)):
    build_char_dataset_from_captchas(captcha_dir, os.path.join(char_dataset_root, split_name))

Processed 5600 captchas; unmatched/skipped: 3320
Processed 1400 captchas; unmatched/skipped: 828


In [24]:
class CharFolderDataset(Dataset):
    """
    Dataset of single-character images stored one folder per character.

    root_dir contains sub-folders named by character; every .png/.jpg/.jpeg
    file inside such a folder is one sample labelled with the folder name.
    Non-directories and the "_unmatched" folder are ignored.

    root_dir: directory to scan
    transform: optional callable applied to the (1, H, W) float32 image tensor
               (values in 0-1) before it is returned
    """

    def __init__(self, root_dir, transform=None):
        self.samples = []  # list of (image path, character) pairs
        self.transform = transform
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order identical from run to run
        for ch in sorted(os.listdir(root_dir)):
            ch_dir = os.path.join(root_dir, ch)
            if not os.path.isdir(ch_dir) or ch == "_unmatched":
                continue
            for fname in sorted(os.listdir(ch_dir)):
                if fname.lower().endswith(('.png', '.jpg', '.jpeg')):
                    self.samples.append((os.path.join(ch_dir, fname), ch))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Returns (image tensor of shape (1, H, W), label index as a long tensor).

        raises: RuntimeError if the image file cannot be read
                KeyError if the folder name is not a key of char_to_idx
        """
        path, ch = self.samples[idx]
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None instead of raising on unreadable files
            raise RuntimeError(f"could not read image: {path}")
        img = img.astype("float32") / 255.0  # normalize to 0-1
        img = np.expand_dims(img, 0)  # (C=1,H,W)
        tensor = torch.tensor(img, dtype=torch.float32)
        if self.transform is not None:
            tensor = self.transform(tensor)
        label = char_to_idx[ch]
        return tensor, torch.tensor(label, dtype=torch.long)

# Per-split crop folders, the datasets built on them, and their loaders.
train_char_root, val_char_root = (
    os.path.join(char_dataset_root, split) for split in ("train", "val")
)

train_char_ds = CharFolderDataset(train_char_root)
val_char_ds = CharFolderDataset(val_char_root)

# Report how many character crops each split contains.
print("train char samples:", len(train_char_ds))
print("val char samples:", len(val_char_ds))

# Both loaders use batches of 64; only the training loader shuffles.
train_loader = DataLoader(dataset=train_char_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_char_ds, batch_size=64, shuffle=False)


train char samples: 13097
val char samples: 3296


In [26]:
class CharCNN(nn.Module):
    """
    Small convolutional classifier for single-channel character images.

    Two conv + ReLU + 2x2 max-pool stages feed a two-layer fully connected
    head. The first linear layer takes 64 * 7 * 7 features, which is what the
    conv stack produces for 28x28 inputs.

    num_classes: size of the output layer (one logit per class)
    """

    def __init__(self, num_classes):
        super().__init__()
        feature_layers = [
            nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),    # 28x28 -> 14x14
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),    # 14x14 -> 7x7
        ]
        head_layers = [
            nn.Linear(in_features=64 * 7 * 7, out_features=128),
            nn.ReLU(),
            nn.Linear(in_features=128, out_features=num_classes),
        ]
        self.conv = nn.Sequential(*feature_layers)
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Maps a (batch, 1, 28, 28) tensor to (batch, num_classes) logits."""
        features = self.conv(x)
        # flatten everything except the batch dimension
        flat = features.view(features.size(0), -1)
        return self.fc(flat)

# Seed torch's RNG so the weight initialisation and the shuffled batch order
# are repeatable from one "Restart & Run All" to the next (some GPU kernels
# may still be nondeterministic).
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CharCNN(NUM_CLASSES).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# training loop
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    running = 0.0
    for imgs, labels in train_loader:
        imgs = imgs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        logits = model(imgs)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight by batch size so the epoch
        # figure is a per-sample mean even when the last batch is smaller
        running += loss.item() * imgs.size(0)
    n_train = len(train_loader.dataset)
    # guarded like val_acc below so an empty dataset does not divide by zero
    train_loss = running / n_train if n_train > 0 else 0.0

    # validation char accuracy
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(device)
            labels = labels.to(device)
            logits = model(imgs)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    val_acc = 100.0 * correct / total if total>0 else 0.0
    print(f"Epoch {epoch+1}/{EPOCHS} - TrainLoss {train_loss:.4f} - ValCharAcc {val_acc:.2f}%")


Epoch 1/10 - TrainLoss 3.0793 - ValCharAcc 41.02%
Epoch 2/10 - TrainLoss 2.1217 - ValCharAcc 54.40%
Epoch 3/10 - TrainLoss 1.8492 - ValCharAcc 57.52%
Epoch 4/10 - TrainLoss 1.7120 - ValCharAcc 58.80%
Epoch 5/10 - TrainLoss 1.6161 - ValCharAcc 59.28%
Epoch 6/10 - TrainLoss 1.5360 - ValCharAcc 61.95%
Epoch 7/10 - TrainLoss 1.4704 - ValCharAcc 63.47%
Epoch 8/10 - TrainLoss 1.4069 - ValCharAcc 63.29%
Epoch 9/10 - TrainLoss 1.3507 - ValCharAcc 64.56%
Epoch 10/10 - TrainLoss 1.3112 - ValCharAcc 64.99%


In [27]:
def predict_chars_from_crops(crops, model, device):
    """
    Classifies character crops one at a time.

    crops: iterable of grayscale crops (numpy arrays)
    model: classifier returning one logit per class; put into eval mode here
    device: device the input tensors are moved to before the forward pass
    returns: list of predicted characters, one per crop, in the same order
    """
    model.eval()

    def classify(crop):
        # same 28x28 size and 0-1 scaling as used for training
        scaled = cv2.resize(crop, (28, 28)).astype("float32") / 255.0
        batch = torch.tensor(scaled)[None, None].to(device)  # (1,1,H,W)
        best_class = model(batch).argmax(dim=1).item()
        return idx_to_char[best_class]

    with torch.no_grad():
        return [classify(crop) for crop in crops]

# run over validation captchas
# NOTE(review): the ground-truth length (taken from the file name) is used
# below to choose which crops to keep, so these figures are optimistic
# compared with inference on unlabeled captchas.
total_chars = 0
correct_chars = 0
total_captchas = 0
correct_captchas = 0

val_files = [f for f in os.listdir(val_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

for fname in val_files:
    total_captchas += 1
    path = os.path.join(val_path, fname)
    img = cv2.imread(path)
    # cv2.imread returns None for unreadable files; treat that as "no
    # characters found" so the captcha counts as a failure instead of
    # crashing the whole evaluation
    boxes = segment_characters(img) if img is not None else []  # (x,y,w,h,crop) sorted left->right

    label_str = fname.split('-')[0]
    n_chars = len(label_str)

    # too many boxes: keep the N widest, then restore left->right order.
    # too few boxes: keep what we have; the captcha can then never count as
    # correct and only the first len(boxes) characters are scored below.
    if len(boxes) > n_chars > 0:
        widest = sorted(boxes, key=lambda b: b[2], reverse=True)[:n_chars]
        selected = sorted(widest, key=lambda b: b[0])
    else:
        selected = boxes
    crops = [b[4] for b in selected]

    # predict each crop
    pred_chars = predict_chars_from_crops(crops, model, device)

    # character-level stats: align by index, only up to min(len(pred), len(gt)),
    # so characters without a matching crop are not counted at all
    n_compare = min(len(pred_chars), n_chars)
    for i in range(n_compare):
        total_chars += 1
        if pred_chars[i] == label_str[i]:
            correct_chars += 1

    # captcha-level: require same length and all equal
    if len(pred_chars) == n_chars and "".join(pred_chars) == label_str:
        correct_captchas += 1

# results
char_acc = 100.0 * correct_chars / total_chars if total_chars>0 else 0.0
captcha_acc = 100.0 * correct_captchas / total_captchas if total_captchas>0 else 0.0

print(f"Character-level accuracy: {char_acc:.2f}% ({correct_chars}/{total_chars})")
print(f"Captcha-level accuracy: {captcha_acc:.2f}% ({correct_captchas}/{total_captchas})")


Character-level accuracy: 48.45% (3094/6386)
Captcha-level accuracy: 10.00% (140/1400)
