In [1]:
import glob
import os

import albumentations
import numpy as np
import torch
from PIL import Image
from PIL import ImageFile
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing
from torch import nn
from torch.nn import functional as F
from tqdm import tqdm

In [2]:
# Download the captcha image archive. -nc (no-clobber) skips the download when
# the zip is already present, so re-running the notebook does not create
# duplicate copies such as captcha_images_v2.zip.1.
!wget -nc https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip

In [3]:
# Extract the archive quietly. -n never overwrites files that already exist, so
# re-running this cell does not stop on an overwrite prompt.
!unzip -qq -n captcha_images_v2.zip

### Dataloader

In [4]:
# Global Pillow switch: let images whose file data is truncated be loaded
# instead of raising while decoding. Affects every image opened afterwards.
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [5]:
class ClassificationDataset:
    """Indexable dataset that loads an image from disk and pairs it with its targets.

    Each item is a dict with two entries:
        "images":  float tensor in channel-first layout (3, height, width),
                   normalised per channel with the mean/std defined below.
        "targets": long tensor built from ``targets[item]``.

    Args:
        image_paths: sequence of image file paths.
        targets: sequence (same order as ``image_paths``) of per-image targets.
        resize: optional ``(height, width)`` to resize every image to; when
            ``None`` images keep their original size.
    """

    def __init__(self, image_paths, targets, resize=None):
        # resize = (height, width)
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize

        # Per-channel normalisation statistics (these look like the commonly
        # used ImageNet values).
        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        self.aug = albumentations.Compose(
            [
                albumentations.Normalize(
                    mean, std, max_pixel_value=255.0, always_apply=True
                )
            ]
        )

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load, optionally resize, normalise and tensorise the sample at ``item``."""
        # Open through a context manager so the underlying file handle is
        # released deterministically, even if decoding raises. ``convert``
        # returns an independent, fully loaded image that outlives the handle.
        with Image.open(self.image_paths[item]) as handle:
            image = handle.convert("RGB")
        targets = self.targets[item]

        if self.resize is not None:
            # PIL expects (width, height) whereas ``resize`` is (height, width).
            image = image.resize(
                (self.resize[1], self.resize[0]), resample=Image.BILINEAR
            )

        image = np.array(image)
        augmented = self.aug(image=image)
        image = augmented["image"]
        # HWC -> CHW, the layout the convolutional layers expect.
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return {
            "images": torch.tensor(image, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.long),
        }

### Model

In [6]:
class CaptchaModel(nn.Module):
    """Convolutional + recurrent network trained with CTC loss to read captchas.

    Two conv/max-pool stages extract features; every column of the resulting
    feature map becomes one timestep of a sequence that is fed through a
    2-layer bidirectional GRU and projected to ``num_chars + 1`` scores per
    timestep (the extra class, index 0, is the CTC blank).

    Args:
        num_chars: number of distinct characters (excluding the CTC blank).
        image_height: height in pixels of the input images. It determines the
            size of the per-timestep feature vector; the default of 75 gives
            the original 1152 input features of ``linear_1``.

    Raises:
        ValueError: if ``image_height`` is too small to survive both poolings.
    """

    def __init__(self, num_chars, image_height=75):
        super(CaptchaModel, self).__init__()
        self.conv_1 = nn.Conv2d(3, 128, kernel_size=(3, 6), padding=(1, 1))
        self.pool_1 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv_2 = nn.Conv2d(128, 64, kernel_size=(3, 6), padding=(1, 1))
        self.pool_2 = nn.MaxPool2d(kernel_size=(2, 2))

        # The convolutions keep the height (kernel 3, padding 1) and each
        # pooling halves it (rounding down), so the flattened feature size per
        # timestep is 64 channels * pooled height.
        pooled_height = image_height // 2 // 2
        if pooled_height < 1:
            raise ValueError(
                f"image_height must be at least 4, got {image_height}"
            )
        self.linear_1 = nn.Linear(64 * pooled_height, 64)
        self.drop_1 = nn.Dropout(0.2)
        # NOTE: despite the attribute name this is a GRU; the name is kept so
        # existing checkpoints / state_dict keys stay loadable.
        self.lstm = nn.GRU(64, 32, bidirectional=True, num_layers=2, dropout=0.25, batch_first=True)
        self.output = nn.Linear(64, num_chars + 1)

    def forward(self, images, targets=None):
        """Compute per-timestep class scores and, when targets are given, the CTC loss.

        Args:
            images: float tensor of shape (batch, 3, height, width).
            targets: optional integer tensor of shape (batch, target_length)
                with values >= 1 (0 is reserved for the CTC blank). Every row
                is treated as having the full ``target_length``.

        Returns:
            ``(scores, loss)`` where ``scores`` are the raw (un-normalised)
            outputs of shape (timesteps, batch, num_chars + 1) and ``loss`` is
            the scalar CTC loss, or ``None`` when ``targets`` is ``None``.
        """
        bs, _, _, _ = images.size()
        x = F.relu(self.conv_1(images))
        x = self.pool_1(x)
        x = F.relu(self.conv_2(x))
        x = self.pool_2(x)
        # (batch, channels, height, width) -> (batch, width, channels, height):
        # the width axis becomes the sequence/time axis.
        x = x.permute(0, 3, 1, 2)
        x = x.view(bs, x.size(1), -1)
        x = F.relu(self.linear_1(x))
        x = self.drop_1(x)
        x, _ = self.lstm(x)
        x = self.output(x)
        # CTC expects time-major input: (timesteps, batch, classes).
        x = x.permute(1, 0, 2)

        if targets is not None:
            log_probs = F.log_softmax(x, 2)
            # All sequences use every timestep and every target position.
            input_lengths = torch.full(
                size=(bs,), fill_value=log_probs.size(0), dtype=torch.int32
            )
            target_lengths = torch.full(
                size=(bs,), fill_value=targets.size(1), dtype=torch.int32
            )
            # Functional form avoids building a new loss module on every call;
            # defaults (mean reduction) match nn.CTCLoss(blank=0).
            loss = F.ctc_loss(
                log_probs, targets, input_lengths, target_lengths, blank=0
            )
            return x, loss

        return x, None

### Train

In [7]:
# Training configuration.
DATA_DIR = "captcha_images_v2"  # folder holding the captcha .png files
BATCH_SIZE = 2
IMAGE_WIDTH = 100  # images are resized to IMAGE_HEIGHT x IMAGE_WIDTH
IMAGE_HEIGHT = 75
NUM_WORKERS = 8  # DataLoader worker processes
EPOCHS = 10
# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
def train_fn(model, data_loader, optimizer):
    """Run one optimisation pass over ``data_loader``.

    Every batch is a dict of tensors that is moved to ``DEVICE`` and passed to
    the model as keyword arguments; the model must return ``(_, loss)``.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    progress = tqdm(data_loader, total=len(data_loader))
    for batch in progress:
        # Move each tensor of the batch onto the configured device.
        on_device = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
        optimizer.zero_grad()
        _, batch_loss = model(**on_device)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    return running_loss / len(data_loader)


def eval_fn(model, data_loader):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Every batch is a dict of tensors that is moved to ``DEVICE`` and passed to
    the model as keyword arguments; the model must return ``(preds, loss)``.

    Returns:
        ``(fin_preds, mean_loss)`` where ``fin_preds`` is a list with the
        model's prediction tensor for each batch (in loader order) and
        ``mean_loss`` is the mean of the per-batch losses.
    """
    model.eval()
    fin_loss = 0
    fin_preds = []
    tk0 = tqdm(data_loader, total=len(data_loader))
    # No gradients are needed for evaluation; disabling autograd avoids
    # building graphs that the collected predictions would otherwise keep
    # alive for the whole pass.
    with torch.no_grad():
        for data in tk0:
            for key, value in data.items():
                data[key] = value.to(DEVICE)
            batch_preds, loss = model(**data)
            fin_loss += loss.item()
            fin_preds.append(batch_preds)
    return fin_preds, fin_loss / len(data_loader)



def remove_duplicates(x):
    """Collapse runs of identical consecutive characters into a single one.

    Inputs shorter than two items are returned unchanged; otherwise a new
    string is built in which no character equals the one right before it.
    """
    if len(x) < 2:
        return x
    collapsed = ""
    for symbol in x:
        # Keep the symbol when it starts the output or differs from the
        # character most recently kept.
        if collapsed == "" or symbol != collapsed[-1]:
            collapsed = collapsed + symbol
    return collapsed


def decode_predictions(preds, encoder):
    preds = preds.permute(1, 0, 2)
    preds = torch.softmax(preds, 2)
    preds = torch.argmax(preds, 2)
    preds = preds.detach().cpu().numpy()
    cap_preds = []
    for j in range(preds.shape[0]):
        temp = []
        for k in preds[j, :]:
            k = k - 1
            if k == -1:
                temp.append("§")
            else:
                p = encoder.inverse_transform([k])[0]
                temp.append(p)
        tp = "".join(temp).replace("§", "")
        cap_preds.append(remove_duplicates(tp))
    return cap_preds


In [9]:
def run_training():
    image_files = glob.glob(os.path.join(DATA_DIR, "*.png"))
    targets_orig = [x.split("/")[-1][:-4] for x in image_files]
    targets = [[c for c in x] for x in targets_orig]
    targets_flat = [c for clist in targets for c in clist]

    lbl_enc = preprocessing.LabelEncoder()
    lbl_enc.fit(targets_flat)
    targets_enc = [lbl_enc.transform(x) for x in targets]
    targets_enc = np.array(targets_enc)
    targets_enc = targets_enc + 1

    (
        train_imgs,
        test_imgs,
        train_targets,
        test_targets,
        _,
        test_targets_orig,
    ) = model_selection.train_test_split(
        image_files, targets_enc, targets_orig, test_size=0.1, random_state=42
    )

    train_dataset = ClassificationDataset(
        image_paths=train_imgs,
        targets=train_targets,
        resize=(IMAGE_HEIGHT, IMAGE_WIDTH),
    )
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        shuffle=True,
    )
    test_dataset = ClassificationDataset(
        image_paths=test_imgs,
        targets=test_targets,
        resize=(IMAGE_HEIGHT, IMAGE_WIDTH),
    )
    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        shuffle=False,
    )

    model = CaptchaModel(num_chars=len(lbl_enc.classes_))
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, factor=0.8, patience=5, verbose=True
    )
    for epoch in range(EPOCHS):
        train_loss = train_fn(model, train_loader, optimizer)
        valid_preds, test_loss = eval_fn(model, test_loader)
        valid_captcha_preds = []
        for vp in valid_preds:
            current_preds = decode_predictions(vp, lbl_enc)
            valid_captcha_preds.extend(current_preds)
        combined = list(zip(test_targets_orig, valid_captcha_preds))
        print(combined[:10])
        test_dup_rem = [remove_duplicates(c) for c in test_targets_orig]
        accuracy = metrics.accuracy_score(test_dup_rem, valid_captcha_preds)
        print(
            f"Epoch={epoch}, Train Loss={train_loss}, Test Loss={test_loss} Accuracy={accuracy}"
        )
        scheduler.step(test_loss)

In [10]:
# Train for EPOCHS epochs; sample predictions, losses and accuracy are printed
# after every epoch.
run_training()

100%|██████████| 468/468 [00:03<00:00, 122.23it/s]
100%|██████████| 52/52 [00:00<00:00, 85.87it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', ''), ('43xfe', ''), ('x347n', ''), ('33f7m', ''), ('mn5c4', ''), ('c6f8g', ''), ('77387', ''), ('3xcgg', ''), ('25w53', ''), ('gd8fb', '')]
Epoch=0, Train Loss=3.4861614795831533, Test Loss=3.2792163628798265 Accuracy=0.0


100%|██████████| 468/468 [00:03<00:00, 119.52it/s]
100%|██████████| 52/52 [00:00<00:00, 78.12it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', ''), ('43xfe', ''), ('x347n', ''), ('33f7m', ''), ('mn5c4', ''), ('c6f8g', ''), ('77387', ''), ('3xcgg', ''), ('25w53', ''), ('gd8fb', '')]
Epoch=1, Train Loss=3.2418808101588845, Test Loss=3.21640598315459 Accuracy=0.0


100%|██████████| 468/468 [00:03<00:00, 119.04it/s]
100%|██████████| 52/52 [00:00<00:00, 88.02it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', ''), ('43xfe', ''), ('x347n', ''), ('33f7m', ''), ('mn5c4', ''), ('c6f8g', ''), ('77387', ''), ('3xcgg', ''), ('25w53', ''), ('gd8fb', '')]
Epoch=2, Train Loss=3.057123135297726, Test Loss=2.6681850048211904 Accuracy=0.0


100%|██████████| 468/468 [00:03<00:00, 121.57it/s]
100%|██████████| 52/52 [00:00<00:00, 84.08it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', ''), ('43xfe', '43f'), ('x347n', '347'), ('33f7m', '3f'), ('mn5c4', '54'), ('c6f8g', 'f8g'), ('77387', '587'), ('3xcgg', '5g'), ('25w53', '25'), ('gd8fb', 'g8f')]
Epoch=3, Train Loss=2.241186129486459, Test Loss=1.5904359783117588 Accuracy=0.009615384615384616


100%|██████████| 468/468 [00:03<00:00, 120.95it/s]
100%|██████████| 52/52 [00:00<00:00, 84.37it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', 'gen'), ('43xfe', '43fe'), ('x347n', 'n347n'), ('33f7m', '3f7n'), ('mn5c4', 'n5c4'), ('c6f8g', 'cf8g'), ('77387', '7587'), ('3xcgg', '3ncg'), ('25w53', '25n53'), ('gd8fb', 'gd8fb')]
Epoch=4, Train Loss=1.227692259491509, Test Loss=0.7275623197738941 Accuracy=0.28846153846153844


100%|██████████| 468/468 [00:03<00:00, 123.89it/s]
100%|██████████| 52/52 [00:00<00:00, 97.28it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', 'ygen'), ('43xfe', '43xfe'), ('x347n', 'x347n'), ('33f7m', '3f7n'), ('mn5c4', 'n5c4'), ('c6f8g', 'cf8g'), ('77387', '7387'), ('3xcgg', '5xcg'), ('25w53', '25w53'), ('gd8fb', 'gd8fb')]
Epoch=5, Train Loss=0.7195539942854999, Test Loss=0.5107996243123825 Accuracy=0.5096153846153846


100%|██████████| 468/468 [00:03<00:00, 130.23it/s]
100%|██████████| 52/52 [00:00<00:00, 91.28it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', 'ygen'), ('43xfe', '43xfe'), ('x347n', 'x347n'), ('33f7m', '3f7m'), ('mn5c4', 'n5c4'), ('c6f8g', 'e6f8g'), ('77387', '7387'), ('3xcgg', '3xcg'), ('25w53', '25w53'), ('gd8fb', 'gd8fb')]
Epoch=6, Train Loss=0.48422338390070147, Test Loss=0.2518737767464839 Accuracy=0.7596153846153846


100%|██████████| 468/468 [00:03<00:00, 128.64it/s]
100%|██████████| 52/52 [00:00<00:00, 94.32it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', 'ygen'), ('43xfe', '43xfe'), ('x347n', 'x347n'), ('33f7m', '3f7m'), ('mn5c4', 'n5c4'), ('c6f8g', 'c6f8g'), ('77387', '7387'), ('3xcgg', '3xcg'), ('25w53', '25w53'), ('gd8fb', 'gd8fb')]
Epoch=7, Train Loss=0.31348752652286976, Test Loss=0.17322492019201702 Accuracy=0.7692307692307693


100%|██████████| 468/468 [00:03<00:00, 127.78it/s]
100%|██████████| 52/52 [00:00<00:00, 91.63it/s]
  0%|          | 0/468 [00:00<?, ?it/s]

[('ygenn', 'ygen'), ('43xfe', '43xfe'), ('x347n', 'x347n'), ('33f7m', '3f7m'), ('mn5c4', 'mn5c4'), ('c6f8g', 'c6f8g'), ('77387', '7387'), ('3xcgg', '3xcg'), ('25w53', '25w53'), ('gd8fb', 'gd8fb')]
Epoch=8, Train Loss=0.25010242597319376, Test Loss=0.13368436701309222 Accuracy=0.8173076923076923


100%|██████████| 468/468 [00:03<00:00, 132.23it/s]
100%|██████████| 52/52 [00:00<00:00, 94.63it/s]


[('ygenn', 'ygen'), ('43xfe', '43xfe'), ('x347n', 'x347n'), ('33f7m', '3f7m'), ('mn5c4', 'mn5c4'), ('c6f8g', 'c6f8g'), ('77387', '7387'), ('3xcgg', '3xcg'), ('25w53', '25w53'), ('gd8fb', 'gd8fb')]
Epoch=9, Train Loss=0.1863732427979509, Test Loss=0.09827832032281619 Accuracy=0.9038461538461539
