In [1]:
import numpy as np
import os
import logging

from pathlib import Path
from collections import Counter

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as transforms

from PIL import Image

import cv2

from tqdm import tqdm

import albumentations as A

In [2]:
data_dir1 = Path("/home/ishan/Python proj/Captcha reader/Datasets/DS1")
data_dir2 = Path("/home/ishan/Python proj/Captcha reader/Datasets/DS2")
data_dir3 = Path("/home/ishan/Python proj/Captcha reader/Datasets/DS3/archive/CAPTCHA/synthesized_data")
data_dir4 = Path("/home/ishan/Python proj/Captcha reader/Datasets/DS4/archive")
data_dir5 = Path("/home/ishan/Python proj/Captcha reader/Datasets/DS5/archive/captcha_images_v2")
data_dir6 = Path("/home/ishan/Python proj/Captcha reader/Datasets/DS6/imgs/archive/comprasnet_imagensacerto")
data_dir7 = Path("/home/ishan/Python proj/Captcha reader/Datasets/DS7/archive (6)/Captcha/Captcha")

images1 = sorted([str(p) for p in data_dir1.glob("*.jpg")])
labels1 = [Path(p).stem for p in images1]

images2 = sorted([str(p) for p in data_dir2.glob("*.jpg")])
labels2 = [Path(p).stem for p in images2]

images3 = sorted([str(p) for p in data_dir3.glob("*.png")])
labels3 = [Path(p).stem for p in images3]

images4 = sorted([str(p) for p in data_dir4.glob("*.jpg")])
labels4 = [Path(p).stem for p in images4]

images5 = sorted([str(p) for p in data_dir5.glob("*.png")])
labels5 = [Path(p).stem for p in images5]

images6 = sorted([str(p) for p in data_dir6.glob("*.png")])
labels6 = [Path(p).stem for p in images6]

images7 = sorted([str(p) for p in data_dir7.glob("*.png")])
labels7 = [Path(p).stem for p in images7]

images = images1 + images2 + images3 + images4 + images5 + images6 + images7
labels = labels1 + labels2 + labels3 + labels4 + labels5 + labels6 + labels7

characters = sorted(set(char for label in labels for char in label))
print("Number of images found: ", len(images))
print("Number of labels found: ", len(labels))
print("Number of unique characters: ", len(characters))
print("Characters present: ", characters)

batch_size = 128

img_width = 150
img_height = 40

max_length = max([len(label) for label in labels])

Number of images found:  357160
Number of labels found:  357160
Number of unique characters:  63
Characters present:  ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [3]:
logging.basicConfig(
    filename="/home/ishan/Python proj/Captcha reader/Model/CTC on 200K images, val = 0.1%/training_log.txt",         # file to save logs
    level=logging.INFO,              # or DEBUG for more verbosity
    filemode="a", 
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(images, labels, test_size=0.1, random_state=42)
char_to_num = {ch: i for i, ch in enumerate(characters)}
char_to_num['_'] = len(characters)
num_to_char = {i: ch for ch, i in char_to_num.items()}

class CaptchaDataset(Dataset):
    def __init__(self, images, labels, char_to_num, img_width, img_height):
        self.images = images
        self.labels = labels
        self.char_to_num = char_to_num
        self.img_width = img_width
        self.img_height = img_height
        self.aug = A.Compose([A.Normalize(always_apply = True)])

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]

        # Preprocess image
        image = cv2.imread(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, (self.img_width, self.img_height), interpolation=cv2.INTER_LANCZOS4)
        image = np.array(image)
        image = self.aug(image=image)['image']
        image = np.transpose(image, (2, 0, 1))

        # Encode label
        label = [self.char_to_num[c] for c in label]
        label = np.array(label)
        target_len = len(label)


        return {
            "image": torch.tensor(image, dtype=torch.float),
            "label": torch.tensor(label, dtype=torch.long),
            "target_length": target_len
        }


In [5]:
def captcha_collate_fn(batch):
    images = []
    labels = []
    target_lengths = []

    for item in batch:
        images.append(item["image"].unsqueeze(0))  # add batch dim
        labels.extend(item["label"])               # flatten all labels
        target_lengths.append(item["target_length"])

    images = torch.cat(images, dim=0)  # (B, C, H, W)
    labels = torch.tensor(labels, dtype=torch.long)  # 1D tensor of all labels
    target_lengths = torch.tensor(target_lengths, dtype=torch.long)

    return {
        "image": images,
        "label": labels,
        "target_length": target_lengths
    }


In [6]:
class captcha_CNN(nn.Module):
    def __init__(self, num_classes, img_channels, hidden_size=256):
        super(captcha_CNN, self).__init__()
        self.conv1 = nn.Conv2d(img_channels, 64, kernel_size=3, stride=1, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
        self.conv2a = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
        self.bn2a = nn.BatchNorm2d(128)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
        self.bn3 = nn.BatchNorm2d(256)

        self.conv4 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.pool3 = nn.MaxPool2d((2, 1), (2, 1))

        self.conv5 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
        self.bn5 = nn.BatchNorm2d(512)

        self.conv6 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
        self.conv6a = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
        self.pool4 = nn.MaxPool2d((2, 1), (2, 1))

        self.conv7 = nn.Conv2d(512, 512, kernel_size=2, stride=1, padding=0)

        self.rnn = nn.LSTM(
            input_size=512,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.5
        )

        self.dropout = nn.Dropout(0.3)

        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = self.dropout(x)

        x = F.relu(self.conv2(x))
        x = self.conv2a(x)
        x = F.relu(self.bn2a(x))
        x = self.pool2(x)
        x = self.dropout(x)

        x = F.relu(self.bn3(self.conv3(x)))

        x = F.relu(self.conv4(x))
        x = self.pool3(x)

        x = F.relu(self.bn5(self.conv5(x)))

        x = F.relu(self.conv6(x))
        x = F.relu(self.conv6a(x))
        x = self.pool4(x)

        x = F.relu(self.conv7(x))

        B, C, H, W = x.size()
        assert H == 1, "Height should be 1 after convolution layers"

        x = x.squeeze(2).permute(0, 2, 1)  # (B, W, C)

        x, _ = self.rnn(x)  # (B, W, hidden_size * 2)

        x = self.fc(x)  # (B, W, num_classes)
        return x




In [7]:
def load_last_checkpoint(model, optimizer, checkpoint_dir, device='cuda'):
    """
    Loads the last saved checkpoint (by epoch number) from a directory.
    
    Args:
        model: PyTorch model
        optimizer: Optimizer
        checkpoint_dir: Directory containing checkpoint files
        device: Device to map model to ('cuda' or 'cpu')

    Returns:
        model, optimizer, start_epoch, loss_at_checkpoint
    """
    if not os.path.exists(checkpoint_dir):
        print("⚠️ No checkpoint directory found. Starting fresh.")
        return model, optimizer, 0, None

    checkpoints = [f for f in os.listdir(checkpoint_dir) if f.endswith(".pth")]
    if not checkpoints:
        print("⚠️ No checkpoints found in directory. Starting fresh.")
        return model, optimizer, 0, None

    # Sort and pick the last one
    checkpoints.sort(key=lambda x: int(x.split("epoch_")[-1].split(".")[0]))
    last_checkpoint = checkpoints[-1]
    checkpoint_path = os.path.join(checkpoint_dir, last_checkpoint)

    print(f"✅ Loading last checkpoint: {checkpoint_path}")
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # Load model + optimizer
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    start_epoch = checkpoint['epoch'] + 1  # resume from next epoch
    last_loss = checkpoint.get('loss', None)

    return model, optimizer, start_epoch, last_loss


In [8]:
def train_model_with_visualization(model, train_loader, val_loader, criterion, optimizer,  num_epochs, start_epoch=0, device="cuda"):
    model = model.to(device)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.1)

    epoch_train_losses = []
    epoch_val_losses = []

    for epoch in range(start_epoch, num_epochs):
        # ---------------- TRAIN ----------------
        model.train()
        running_train_loss = 0.0
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")

        for batch in train_bar:
            images = batch["image"].to(device)
            labels = batch["label"].to(device)
            target_lengths = batch["target_length"].to(device)

            optimizer.zero_grad()
            outputs = model(images)  # (B, W, C)
            B, W, C = outputs.size()

            log_probs = F.log_softmax(outputs, dim=2).permute(1, 0, 2)  # (W, B, C)
            input_lengths = torch.full((B,), W, dtype=torch.long, device=device)

            loss = criterion(log_probs, labels, input_lengths, target_lengths)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            running_train_loss += loss.item()
            train_bar.set_postfix({"CTC Loss": f"{loss.item():.6f}"})

        avg_train_loss = running_train_loss / len(train_loader)
        epoch_train_losses.append(avg_train_loss)

        # ---------------- VALIDATION ----------------
        model.eval()
        running_val_loss = 0.0
        val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")

        with torch.no_grad():
            for batch_idx, batch in enumerate(val_bar):
                images = batch["image"].to(device)
                labels = batch["label"]
                target_lengths = batch["target_length"].to(device)

                outputs = model(images)
                B, W, C = outputs.size()
                input_lengths = torch.full((B,), W, dtype=torch.long, device=device)
                log_probs = F.log_softmax(outputs, dim=2).permute(1, 0, 2)

                loss = criterion(log_probs, labels.to(device), input_lengths, target_lengths)
                running_val_loss += loss.item()
                val_bar.set_postfix({'CTC Loss': f'{loss.item():.6f}'})

                # Decode some predictions
                if batch_idx == 0:
                    preds = log_probs.permute(1, 0, 2).argmax(dim=2).cpu()  # (B, T)
                    decoded_texts = []
                    for pred in preds:
                        prev = -1
                        seq = []
                        for p in pred:
                            p = p.item()
                            if p != prev and p != 0:  # skip blanks (assuming blank=0)
                                seq.append(num_to_char[p])
                            prev = p
                        decoded_texts.append("".join(seq))

                    # Ground truth
                    true_texts = []
                    idx = 0
                    for length in target_lengths:
                        l = labels[idx: idx + length].tolist()
                        true_texts.append("".join([num_to_char[c] for c in l]))
                        idx += length

                    print("\nSample predictions:")
                    for t, p in zip(true_texts[:5], decoded_texts[:5]):
                        print(f"GT: {t} | Pred: {p}")

        avg_val_loss = running_val_loss / len(val_loader)
        epoch_val_losses.append(avg_val_loss)
        scheduler.step(avg_val_loss)
        lr = optimizer.param_groups[0]['lr']

        msg = (f"Epoch [{epoch+1}/{num_epochs}] "
        f"Train Loss: {avg_train_loss:.4f} | "
        f"Val Loss: {avg_val_loss:.4f} | "
        f"LR: {lr:.6f}")
        logging.info(msg)
        print(msg)


        if (epoch + 1) % 5 == 0 or avg_val_loss == min(epoch_val_losses):
            checkpoint_path = f"/home/ishan/Python proj/Captcha reader/Model/CTC on 200K images, val = 0.1%/ctc_model_epoch_{epoch+1}.pth"
            torch.save({
                'epoch': epoch,  # current epoch
                'model_state_dict': model.state_dict(),  # model weights
                'optimizer_state_dict': optimizer.state_dict(),  # optimizer state
                'loss': loss.item(),  # last batch loss
            }, checkpoint_path)
    print("Training complete.")
    return epoch_train_losses, epoch_val_losses



In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
model = captcha_CNN(num_classes=len(characters) + 1, img_channels=3).to(device)
ctc_loss = nn.CTCLoss(blank=len(characters), zero_infinity=True)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

train_dataset = CaptchaDataset(X_train, Y_train, char_to_num, img_width, img_height)
val_dataset = CaptchaDataset(X_test, Y_test, char_to_num, img_width, img_height)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=captcha_collate_fn
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=captcha_collate_fn
)

Using device: cuda


  self.aug = A.Compose([A.Normalize(always_apply = True)])


In [10]:
checkpoint_dir = "/home/ishan/Python proj/Captcha reader/Model/CTC on 200K images, val = 0.1%"

model, optimizer, start_epoch, last_loss = load_last_checkpoint(
    model, optimizer, checkpoint_dir, device="cuda"
)

print(f"Resuming training from epoch {start_epoch}, last loss = {last_loss}")


⚠️ No checkpoints found in directory. Starting fresh.
Resuming training from epoch 0, last loss = None


In [11]:
train_losses, val_losses = train_model_with_visualization(
    model,
    train_loader,
    val_loader,
    ctc_loss,
    optimizer,
    start_epoch=start_epoch,
    num_epochs=1000,
    device='cuda'
)

Epoch 1/1000 [Train]: 100%|██████████| 2512/2512 [07:44<00:00,  5.41it/s, CTC Loss=3.721050]
Epoch 1/1000 [Val]:   1%|          | 3/280 [00:00<00:26, 10.29it/s, CTC Loss=3.588468]


Sample predictions:
GT: D6cbdr | Pred: Ydpnh_
GT: D8cNGH | Pred: DwN_
GT: glrU4 | Pred: 4wNn_
GT: 7RvnfJSc_2792 | Pred: 1j_
GT: HYmO_9602 | Pred: mW_


Epoch 1/1000 [Val]: 100%|██████████| 280/280 [00:26<00:00, 10.75it/s, CTC Loss=3.939458]


Epoch [1/1000] Train Loss: 3.9905 | Val Loss: 3.6623 | LR: 0.000100


Epoch 2/1000 [Train]: 100%|██████████| 2512/2512 [07:15<00:00,  5.77it/s, CTC Loss=1.945286]
Epoch 2/1000 [Val]:   1%|          | 2/280 [00:00<00:25, 10.78it/s, CTC Loss=1.799905]


Sample predictions:
GT: RxSjY | Pred: RXSxY_
GT: 56HnO | Pred: 56H6Q_
GT: kzd6F | Pred: kZd6F_
GT: wn7dnb_7104 | Pred: wnd_2
GT: t06EZ | Pred: tD6EZ_


Epoch 2/1000 [Val]: 100%|██████████| 280/280 [00:24<00:00, 11.49it/s, CTC Loss=1.464478]


Epoch [2/1000] Train Loss: 2.7678 | Val Loss: 1.6713 | LR: 0.000100


Epoch 3/1000 [Train]: 100%|██████████| 2512/2512 [07:09<00:00,  5.85it/s, CTC Loss=0.760922]
Epoch 3/1000 [Val]:   1%|          | 2/280 [00:00<00:25, 11.07it/s, CTC Loss=0.833766]


Sample predictions:
GT: ehU49p | Pred: ehU49p_
GT: ctBW9U | Pred: ciBW9U_
GT: Hrkqi | Pred: Hrkqi_
GT: qaphM | Pred: qaphM_
GT: dCEw1m | Pred: dCEw1m_


Epoch 3/1000 [Val]: 100%|██████████| 280/280 [00:24<00:00, 11.64it/s, CTC Loss=0.761804]


Epoch [3/1000] Train Loss: 1.1403 | Val Loss: 0.7488 | LR: 0.000100


Epoch 4/1000 [Train]: 100%|██████████| 2512/2512 [07:16<00:00,  5.75it/s, CTC Loss=0.556492]
Epoch 4/1000 [Val]:   1%|          | 3/280 [00:00<00:24, 11.41it/s, CTC Loss=0.432633]


Sample predictions:
GT: 5t52b | Pred: 5t5_2b_
GT: t4Ih1t_6520 | Pred: t4Ih1t_7_
GT: eL5lO | Pred: aL5_lO_
GT: MSXcI_1951 | Pred: MSX_cI_7_6
GT: oOBz5 | Pred: oOBz5_


Epoch 4/1000 [Val]: 100%|██████████| 280/280 [00:23<00:00, 11.88it/s, CTC Loss=0.366630]


Epoch [4/1000] Train Loss: 0.6044 | Val Loss: 0.4446 | LR: 0.000100


Epoch 5/1000 [Train]: 100%|██████████| 2512/2512 [07:14<00:00,  5.78it/s, CTC Loss=0.282414]
Epoch 5/1000 [Val]:   1%|          | 2/280 [00:00<00:24, 11.39it/s, CTC Loss=0.320197]


Sample predictions:
GT: GQMdG | Pred: GQM_dG_
GT: wdKUF | Pred: wdK_UF_
GT: nXVIXP_4916 | Pred: nXV_IXP_2_2
GT: zR2KL | Pred: zR2_Kl_
GT: hg62O | Pred: hg6_2O_


Epoch 5/1000 [Val]: 100%|██████████| 280/280 [00:23<00:00, 11.77it/s, CTC Loss=0.229069]


Epoch [5/1000] Train Loss: 0.4015 | Val Loss: 0.3367 | LR: 0.000100


Epoch 6/1000 [Train]: 100%|██████████| 2512/2512 [07:10<00:00,  5.83it/s, CTC Loss=0.250559]
Epoch 6/1000 [Val]:   1%|          | 2/280 [00:00<00:24, 11.56it/s, CTC Loss=0.252278]


Sample predictions:
GT: Dspa1i | Pred: Dspa1i_
GT: 9EFfR | Pred: 9EF_Fr_
GT: D3bWhK7K_2083 | Pred: D3bWh7K_8
GT: P57acT | Pred: P57acT_
GT: Seppm | Pred: Sep_pm_


Epoch 6/1000 [Val]: 100%|██████████| 280/280 [00:23<00:00, 12.14it/s, CTC Loss=0.178653]


Epoch [6/1000] Train Loss: 0.3208 | Val Loss: 0.2848 | LR: 0.000100


Epoch 7/1000 [Train]: 100%|██████████| 2512/2512 [06:58<00:00,  6.00it/s, CTC Loss=0.316409]
Epoch 7/1000 [Val]:   1%|          | 2/280 [00:00<00:24, 11.46it/s, CTC Loss=0.328112]


Sample predictions:
GT: vXt9r_647 | Pred: Vxt9r_61_6
GT: idOFv | Pred: idOFv_
GT: xrZIjs_6822 | Pred: xrzI_js_6_6
GT: eUnzm | Pred: eUn_zm_
GT: w8ymT | Pred: w8y_mT_


Epoch 7/1000 [Val]: 100%|██████████| 280/280 [00:22<00:00, 12.39it/s, CTC Loss=0.152595]


Epoch [7/1000] Train Loss: 0.2703 | Val Loss: 0.2517 | LR: 0.000100


Epoch 8/1000 [Train]: 100%|██████████| 2512/2512 [07:12<00:00,  5.81it/s, CTC Loss=0.243366]
Epoch 8/1000 [Val]:   1%|          | 2/280 [00:00<00:24, 11.21it/s, CTC Loss=0.225698]


Sample predictions:
GT: zyiYD | Pred: zyi_YD_
GT: it45d | Pred: it45d_
GT: wVRal | Pred: wVRal_
GT: g3fxp6_1482 | Pred: g3_fxp6_7_4
GT: fNcD_7839 | Pred: fNc_D_74_4


Epoch 8/1000 [Val]: 100%|██████████| 280/280 [00:23<00:00, 11.84it/s, CTC Loss=0.090582]


Epoch [8/1000] Train Loss: 0.2358 | Val Loss: 0.2312 | LR: 0.000100


Epoch 9/1000 [Train]: 100%|██████████| 2512/2512 [07:10<00:00,  5.84it/s, CTC Loss=0.285175]
Epoch 9/1000 [Val]:   1%|          | 2/280 [00:00<00:24, 11.47it/s, CTC Loss=0.251989]


Sample predictions:
GT: s3fMb | Pred: s3fMb_
GT: FsJ6S | Pred: FsJ_6S_
GT: asMUV | Pred: asM_UV_
GT: JgDqd | Pred: JgD_qd_
GT: 74PFv | Pred: 74P_FV_


Epoch 9/1000 [Val]: 100%|██████████| 280/280 [00:22<00:00, 12.32it/s, CTC Loss=0.226466]


Epoch [9/1000] Train Loss: 0.2118 | Val Loss: 0.2190 | LR: 0.000100


Epoch 10/1000 [Train]: 100%|██████████| 2512/2512 [06:58<00:00,  6.01it/s, CTC Loss=0.192436]
Epoch 10/1000 [Val]:   1%|          | 2/280 [00:00<00:26, 10.68it/s, CTC Loss=0.208765]


Sample predictions:
GT: DamZA | Pred: DamZA_
GT: r2aqD | Pred: r2a_qD_
GT: eVTfn1 | Pred: eVTfn1_
GT: AHrT_441 | Pred: AHr_T_5_8
GT: CC15I | Pred: C_15I_


Epoch 10/1000 [Val]: 100%|██████████| 280/280 [00:22<00:00, 12.35it/s, CTC Loss=0.435227]


Epoch [10/1000] Train Loss: 0.1943 | Val Loss: 0.2082 | LR: 0.000100


Epoch 11/1000 [Train]: 100%|██████████| 2512/2512 [07:01<00:00,  5.96it/s, CTC Loss=0.170006]
Epoch 11/1000 [Val]:   1%|          | 2/280 [00:00<00:25, 10.70it/s, CTC Loss=0.210566]


Sample predictions:
GT: 6WVSBi | Pred: 6W_VSBi_
GT: 9v24r | Pred: 9v2_4R_
GT: zanbc4 | Pred: zanbc4_
GT: 37iMg | Pred: 37i_Mg_
GT: pa7J1 | Pred: pa7_1l_


Epoch 11/1000 [Val]: 100%|██████████| 280/280 [00:23<00:00, 11.80it/s, CTC Loss=0.266810]


Epoch [11/1000] Train Loss: 0.1813 | Val Loss: 0.2036 | LR: 0.000100


Epoch 12/1000 [Train]:   5%|▌         | 128/2512 [00:21<06:47,  5.86it/s, CTC Loss=0.162085]


KeyboardInterrupt: 