Generative captcha model with no noise and a smaller letter dictionary

In [None]:
import random
import string
import os
import torch.nn as nn
import torch
import torch.nn.functional as F
import pyro
import numpy as np
import pyro.optim as optim
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO, TraceGraph_ELBO
from PIL import Image
from claptchagen.claptcha import Claptcha
from torch.distributions import constraints
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import matplotlib.pyplot as plt

In [None]:
captcha_folder = 'generated_captchas' # folder to save generated captchas
captchaHeight = 32 # captcha image height in pixels
captchaWidth = 100 # captcha image width in pixels
captchaMarginX = 4 # x margin handed to Claptcha
captchaMarginY = 4 # y margin handed to Claptcha
batch_size = 8 # number of captchas per training mini batch
char_dict = "abc" # dictionary only has 3 letters
# Only use the GPU when one is actually available, so that the notebook also
# runs on CPU-only machines instead of failing on the first .cuda() call.
USE_CUDA = torch.cuda.is_available()
MAX_N = 4 # the max number of letters in a captcha
smoke_test = False # set to True for a quick end-to-end run on a tiny budget
num_steps = 10000 if not smoke_test else 10 # number of passes over the training set
TrainingSample = 6000 if not smoke_test else 100 # number of examples will be generated for training

In [None]:
def randomString():
    """
    Build a random captcha text.

    The length k is drawn with random.randint from 1 to MAX_N (both ends
    included); each of the k letters is then picked with random.choice
    from char_dict. Returns the letters joined into one string.
    """
    length = random.randint(1, MAX_N)
    letters = [random.choice(char_dict) for _ in range(length)]
    return "".join(letters)

def generate_random_captcha(n, save=False):
    """
    Generate n random captchas, grouped by the number of letters they show.

    n: number of captchas to generate
    save: when True, every captcha is also written to
          <captcha_folder>/<text>_<index>.png

    Returns a list of MAX_N lists. The list at index k - 1 holds the
    (text, image) pairs whose text has k letters; image is the first channel
    of the rendered captcha as a numpy array.
    """
    # Claptcha object with randomString as the text source, FreeSans as font,
    # of size captchaWidth x captchaHeight px, bilinear resampling, no noise.
    c = Claptcha(randomString, "fonts/FreeSans.ttf", (captchaWidth, captchaHeight), (captchaMarginX, captchaMarginY),
             resample=Image.BILINEAR, noise=0)
    if save:
        # make sure the target folder exists before the first file is written
        os.makedirs(captcha_folder, exist_ok=True)

    captcha_generated = [[] for _ in range(MAX_N)]
    for i in range(n):
        # Render exactly once per sample. The text source is a callable, so a
        # second render (as a separate write() followed by .image would do)
        # presumably draws a different string, and the file on disk would no
        # longer match the sample kept in memory.
        text, image = c.image
        if save:
            # assumes the rendered image is a PIL image (it is converted with
            # np.array below) -- the format is taken from the .png suffix
            image.save(os.path.join(captcha_folder, '{}_{}.png'.format(text, i)))
        pixels = np.array(image)[:, :, 0] # the generator is gray scale, only keep one channel is enough
        captcha_generated[len(text) - 1].append((text, pixels))
    return captcha_generated
        
# Training set: TrainingSample random captchas, bucketed by number of letters.
captcha_generated = generate_random_captcha(n=TrainingSample, save=False)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
c (32, 100)
a (32, 100)
a (32, 100)
c (32, 100)
a (32, 100)
b (32, 100)
a (32, 100)
a (32, 100)
a (32, 100)
a (32, 100)
c (32, 100)
a (32, 100)
c (32, 100)
b (32, 100)
a (32, 100)
c (32, 100)
a (32, 100)
b (32, 100)
c (32, 100)
c (32, 100)
a (32, 100)
c (32, 100)
c (32, 100)
c (32, 100)
c (32, 100)
a (32, 100)
c (32, 100)
a (32, 100)
b (32, 100)
b (32, 100)
a (32, 100)
c (32, 100)
c (32, 100)
c (32, 100)
c (32, 100)
a (32, 100)
c (32, 100)
a (32, 100)
c (32, 100)
c (32, 100)
a (32, 100)
a (32, 100)
b (32, 100)
a (32, 100)
b (32, 100)
b (32, 100)
b (32, 100)
b (32, 100)
c (32, 100)
a (32, 100)
b (32, 100)
a (32, 100)
c (32, 100)
b (32, 100)
b (32, 100)
a (32, 100)
b (32, 100)
b (32, 100)
a (32, 100)
a (32, 100)
c (32, 100)
c (32, 100)
c (32, 100)
a (32, 100)
a (32, 100)
b (32, 100)
a (32, 100)
a (32, 100)
a (32, 100)
b (32, 100)
b (32, 100)
a (32, 100)
a (32, 100)
c (32, 100)
b (32, 100)
c (32, 100)
b (32, 100)
c (32, 100)

In [None]:
def render_image(chars, fonts="fonts/FreeSans.ttf", size=(captchaWidth, captchaHeight), 
                 margin=(captchaMarginX, captchaMarginY), resample=Image.BILINEAR, noise=0.3, use_cuda=False):
    """
    Render the text <chars> as a captcha image and return it as a tensor.

    chars, fonts, size, margin, resample and noise are handed to Claptcha
    unchanged. Only the first channel of the rendered picture is kept and
    its values are divided by 255. With use_cuda=True the tensor is moved
    to the GPU before it is returned.
    """
    captcha = Claptcha(chars, fonts, size, margin, resample=resample, noise=noise)
    _text, picture = captcha.image

    # the generator is gray scale, so the first channel carries everything
    first_channel = np.array(picture)[:, :, 0]
    scaled = torch.from_numpy(np.divide(first_channel, 255))
    return scaled.cuda() if use_cuda else scaled

In [None]:
class CaptchaDataset(Dataset):
    """Dataset over (label, image) captcha pairs."""

    def __init__(self, raw_captchas, transform=None):
        # raw_captchas: indexable collection whose items hold the label at
        #               position 0 and the image at position 1
        # transform:    optional callable applied to the image tensor
        self.transform = transform
        self.raw_captchas = raw_captchas

    def __len__(self):
        """Number of stored captchas."""
        return len(self.raw_captchas)

    def __getitem__(self, idx):
        """Return (label, image); the image is divided by 255 and cast to float."""
        sample = self.raw_captchas[idx]
        label, pixels = sample[0], sample[1]

        image = torch.from_numpy(np.divide(pixels, 255)).float()

        if not self.transform:
            return label, image
        return label, self.transform(image)

In [None]:
def make_loarders(BATCH_SIZE, raw_samples):
    """
    Build one shuffling DataLoader per non-empty group in raw_samples.

    Each group is wrapped in a CaptchaDataset. Empty groups are skipped, and
    drop_last=True discards a trailing batch smaller than BATCH_SIZE.
    Returns the loaders as a list, in the order of the groups.
    """
    return [
        DataLoader(CaptchaDataset(group), batch_size=BATCH_SIZE,
                   shuffle=True, num_workers=0, drop_last=True)
        for group in raw_samples
        if group
    ]

def make_batches(dataloaders):
    """
    Collect every mini batch from every loader into one list and shuffle it.

    A batch always comes from a single loader, so all samples in the same
    mini batch share the same ground truth number of letters (N).
    """
    pooled = [batch for loader in dataloaders for batch in loader]
    # the list is shuffled twice, exactly as before, so the random stream
    # is consumed in the same way
    for _ in range(2):
        random.shuffle(pooled)
    return pooled

# One training DataLoader per captcha length that has at least one sample.
TrainLoaders = make_loarders(batch_size, captcha_generated)

Define the networks

In [None]:
class NumNet(nn.Module):
    def __init__(self, img_size, out_size = 3):
        """
        Network to predict the number of letters in a captcha image

        img_size: (height, width) of the input images
        out_size: number of classes, i.e. how many different letter counts
                  can be predicted
        """
        super(NumNet, self).__init__()
        n_pixels = img_size[0] * img_size[1]
        self.neural_net = nn.Sequential(
            nn.Linear(n_pixels, n_pixels * 2),
            nn.ReLU(),
            nn.Linear(n_pixels * 2, 256),
            nn.ReLU(),
            nn.Linear(256, out_size),
            # The output is handed to Categorical as its positional `probs`
            # argument, so it has to be a probability vector, not a vector of
            # log-probabilities. The explicit dim also avoids the
            # "implicit dimension choice" deprecation warning.
            nn.Softmax(dim=1))

    def forward(self, img):
        """
        img: batch of images, first dimension is the batch

        Returns a (batch, out_size) tensor; every row is non-negative and
        sums to 1.
        """
        # flatten everything except the batch dimension
        img = torch.reshape(img, (img.shape[0], -1))
        prob = self.neural_net(img)
        return prob

In [None]:
class NoiseNet(nn.Module):
    def __init__(self, img_size, out_size = 1):
        """
        Network to predict noise for a captcha image
        noise should be inside of [0, 1]
        NOT USED

        Two independent branches (fc1 -> fc2 and fc3 -> fc4) each map the
        flattened image to out_size values.
        """
        super(NoiseNet, self).__init__()
        pixels = img_size[0] * img_size[1]
        hidden = pixels * 2
        self.fc1 = nn.Linear(pixels, hidden)
        self.fc2 = nn.Linear(hidden, out_size)
        self.fc3 = nn.Linear(pixels, hidden)
        self.fc4 = nn.Linear(hidden, out_size)
        self.softplus = nn.Softplus()

    def forward(self, img):
        """
        Flatten img completely (no batch dimension is kept) and return the
        softplus outputs of the two branches as (alpha, beta).
        """
        flat = img.reshape(-1)

        alpha = self.softplus(self.fc2(F.relu(self.fc1(flat))))
        beta = self.softplus(self.fc4(F.relu(self.fc3(flat))))

        return alpha, beta

In [None]:
class CharNetSingle(nn.Module):
    def __init__(self, img_size, output_size, hidden_size=512, N_num_class=10, input_size=1024, num_layers=1):
        """
        Network to predict characters in the captcha
        Constructed from a series of conv and linear layers (f_observe) applied to the input image
        and a LSTM to predict each letter, given the predicted number of letters N

        img_size:    (height, width) of the input images
        output_size: number of different characters that can be predicted
        hidden_size: hidden size of the LSTM
        N_num_class: largest supported number of letters; size of the one-hot
                     codes for N and for the letter position
        input_size:  LSTM input size; forward() feeds 1024 image features plus
                     two one-hot codes, so it has to be 1024 + 2 * N_num_class
        num_layers:  number of LSTM layers

        The size arguments may be plain ints or 0-dim integer tensors.
        """
        super(CharNetSingle, self).__init__()
        self.img_size = img_size
        # Callers may pass 0-dim tensors here; nn.LSTM, range() and F.one_hot
        # are only guaranteed to work with plain Python ints.
        self.N_num_class = int(N_num_class)
        self.hidden_size = int(hidden_size)
        self.num_layers = int(num_layers)
        self.input_size = int(input_size)
        self.rnn = nn.LSTM(self.input_size, self.hidden_size, self.num_layers)

        # one output layer per letter position
        self.linear_layers = nn.ModuleList([nn.Linear(self.hidden_size, output_size) for _ in range(self.N_num_class)])

        self.dropout = nn.Dropout()
        self.conv1 = nn.Conv2d(1, 32, 3)
        self.conv2 = nn.Conv2d(32, 64, 3)
        self.pool = nn.MaxPool2d(2, 2)
        # Size of the flattened conv output, derived from img_size instead of
        # being hard-coded (8832 for 32 x 100 images).
        self.flat_features = 64 * self._reduced(img_size[0]) * self._reduced(img_size[1])
        self.pfc1 = nn.Linear(self.flat_features, 2048)
        self.pfc2 = nn.Linear(2048, 1024)
        self.softplus = nn.Softplus()
        # learnable initial hidden and cell state, shared by the whole batch
        self.h_0 = nn.Parameter(torch.zeros(self.num_layers, 1, self.hidden_size))
        self.c_0 = nn.Parameter(torch.zeros(self.num_layers, 1, self.hidden_size))

    @staticmethod
    def _reduced(size):
        """Length of one image axis after the two (3x3 conv, 2x2 max-pool) stages."""
        for _ in range(2):
            size = (size - 2) // 2
        return size

    def forward(self, img, N):
        """
        img: batch of images, first dimension is the batch
        N:   number of letters to predict for every image of the batch
             (int or 0-dim integer tensor)

        Returns a (batch, N, output_size) tensor of per-position character
        probabilities (each row along the last dimension sums to 1), ready
        to be used as the `probs` of a Categorical distribution.
        """
        BATCH_SIZE = img.shape[0]
        n = int(N)
        # create the helper tensors next to the input instead of relying on a
        # global CUDA switch
        device = img.device

        # transform the index of each letter into one-hot format as well as the predicted N
        positions = torch.arange(n, device=device)
        i_onehot = F.one_hot(positions, num_classes=self.N_num_class).float()
        i_onehot = i_onehot.reshape(n, 1, self.N_num_class).repeat(1, BATCH_SIZE, 1)
        count = torch.tensor(n - 1, device=device)
        N_onehot = F.one_hot(count, num_classes=self.N_num_class).float().repeat(n, BATCH_SIZE, 1)

        # conv + fc layers that map the image to a 1024-d feature vector
        img = torch.reshape(img, (BATCH_SIZE, 1, self.img_size[0], self.img_size[1]))
        img = self.pool(F.relu(self.conv1(img)))
        img = self.pool(F.relu(self.conv2(img)))

        img = torch.reshape(img, (BATCH_SIZE, self.flat_features))

        img = F.relu(self.pfc1(img))
        img = F.relu(self.pfc2(img))

        # the same image features are presented at every letter position
        img = img.repeat(n, 1, 1)

        x = torch.cat((img, N_onehot, i_onehot), dim=2)

        h_0_contig = self.h_0.expand(self.num_layers, BATCH_SIZE, self.hidden_size).contiguous()
        # the initial cell state comes from c_0 (it used to be built from h_0,
        # which left c_0 unused and untrained)
        c_0_contig = self.c_0.expand(self.num_layers, BATCH_SIZE, self.hidden_size).contiguous()
        outputs, hn = self.rnn(x, (h_0_contig, c_0_contig))
        outputs = self.dropout(outputs)

        outputs = torch.stack([self.linear_layers[i](outputs[i]) for i in range(outputs.shape[0])])

        assert outputs.shape[0] == n
        assert outputs.shape[1] == BATCH_SIZE

        # probabilities, not log-probabilities: callers pass the result as
        # Categorical's positional `probs` argument
        outputs = F.softmax(outputs, dim=2)

        # we want the first dim to be the batch size
        outputs = torch.transpose(outputs, 0, 1)

        return outputs

In [None]:
class CaptchaModel(nn.Module):
    """
    Generative captcha model together with its amortised guide.

    model(): draws the number of letters N and the letters themselves from
             uniform priors, renders the text with render_image and compares
             the rendering with the observed images.
    guide(): proposes N with numNet and the letters with charNetSingle.
    """
    def __init__(self, use_cuda=False):
        super().__init__()
        # all admissible numbers of letters: 1 .. MAX_N
        self.num_char_domain = torch.arange(1, MAX_N + 1)
        print(self.num_char_domain)
        if use_cuda:
            self.num_char_domain = self.num_char_domain.cuda()
        self.char_dict = char_dict

        # plain int, so that the layer sizes below are ints and not tensors
        max_chars = int(self.num_char_domain.max())

        self.numNet = NumNet((captchaHeight, captchaWidth), len(self.num_char_domain))
        #self.noiseNet = NoiseNet((captchaHeight, captchaWidth), 1)
        self.rnn_hidden_size = 512
        self.rnn_num_layer = 2
        self.charNetSingle = CharNetSingle(
            (captchaHeight, captchaWidth),
            len(self.char_dict),
            N_num_class=max_chars,
            # 1024 image features + one-hot code of N + one-hot code of the position
            input_size=1024 + 2 * max_chars,
            hidden_size=self.rnn_hidden_size,
            num_layers=self.rnn_num_layer)
        if use_cuda:
            self.cuda()
        self.use_cuda = use_cuda

    def model(self, baseline_image):
        """
        Generative model.

        baseline_image: batch of observed captcha images, first dimension is
                        the batch. A single N is drawn for the whole batch.
        """
        pyro.module("captchasolver", self)

        n_images = baseline_image.shape[0]
        n_counts = len(self.num_char_domain)
        n_letters = len(self.char_dict)
        # num_char_domain lives on the GPU exactly when use_cuda is set
        device = self.num_char_domain.device

        # prior of p(N): uniform over num_char_domain
        num_p = torch.full((n_counts,), 1 / n_counts, device=device)

        N_index = pyro.sample("num_char", dist.Categorical(num_p))
        N = N_index + self.num_char_domain[0]
        if self.use_cuda:
            N = N.cuda()

        with pyro.plate("data", n_images):

            # prior of p(c_i): uniform over char_dict for every letter.
            # The first dimension follows the actual batch, not the global
            # batch_size, so batches of any size match the plate and the guide.
            num_c = torch.full((n_images, int(N), n_letters), 1 / n_letters, device=device)

            # to_event(1): the N letters of one captcha form a single event
            c = pyro.sample("chars", dist.Categorical(num_c).to_event(1))

            # render every sampled text without noise
            rendered_images = torch.stack([
                render_image("".join(self.char_dict[k] for k in row), noise=0., use_cuda=self.use_cuda)
                for row in c.tolist()
            ])

            # very small fixed standard deviation of the pixel likelihood
            sigma = torch.tensor(0.0000001, device=device)

            pyro.sample("captcha", dist.Normal(rendered_images, sigma).to_event(2), obs=baseline_image)

    def guide(self, baseline_image):
        """
        Variational guide; samples the same sites as model() ("num_char" and
        "chars") from distributions computed by the two networks.
        """
        pyro.module("captchasolver", self)

        num_p = self.numNet(baseline_image)
        num_p = torch.mean(num_p, dim=0) # take the mean of predicted Ns for a batch
                                         # we want a single N (i.e. one N for each batch)
        # the network output is used as Categorical's positional `probs` argument
        N_index = pyro.sample("num_char", dist.Categorical(num_p))
        N = N_index + self.num_char_domain[0]
        if self.use_cuda:
            N = N.cuda()

        with pyro.plate("data", baseline_image.shape[0]):

            charP = self.charNetSingle(baseline_image, N)
            c = pyro.sample("chars", dist.Categorical(charP).to_event(1))


Hyperparameters

In [None]:
# Model/guide pair plus the SVI object that trains them with Adam.
captchaModel = CaptchaModel(use_cuda=USE_CUDA)
model, guide = captchaModel.model, captchaModel.guide
learning_rate = 1e-4
optimizer = optim.Adam({"lr": learning_rate})
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())
# windowed means of the scaled training loss, appended to by inference()
loss_list = list()



tensor([1, 2, 3, 4])


Optimization and testing functions

In [None]:
def test_cycle(use_cuda):
    """
    Run the three evaluation passes with both networks in eval mode.

    The networks are switched back to train mode even when an evaluation
    pass raises or is interrupted, so that a later training step never runs
    silently in eval mode.
    """
    networks = (captchaModel.numNet, captchaModel.charNetSingle)
    for net in networks:
        net.eval()

    try:
        # test on the train dataset
        test(use_train=True, verbose=True, use_cuda=use_cuda)
        # test on a new generated dataset
        test(1000, use_train=False, verbose=True, use_cuda=use_cuda)
        # we may want to see the predicted and groundtruth
        # (test() prints the per-sample lines when verbose is False)
        test(10, use_train=False, verbose=False, use_cuda=use_cuda)
    finally:
        for net in networks:
            net.train()

def optimize(use_cuda=False):
    """
    Run num_steps passes of inference(). After every 5th pass the mean loss
    of the last 5 passes is printed and test_cycle() is run.
    """
    report_every = 5
    running_loss = 0
    print("Optimizing...")
    for step in range(1, num_steps + 1):
        running_loss += inference(step, use_cuda)

        # nothing to report between two reporting steps
        if step % report_every != 0:
            continue

        print("at {} step loss is {}".format(step, running_loss / report_every))
        running_loss = 0
        test_cycle(use_cuda=use_cuda)


def inference(t, use_cuda=False):
    """
    One pass over all training mini batches.

    Every svi.step loss is divided by TrainingSample * 1000. The scaled
    losses are summed into the return value; in addition their mean over
    each window of batches is appended to the global loss_list.

    t: index of the pass, only used in the printed message
    """
    scale = TrainingSample * 1000
    epoch_loss = 0
    window = []
    for batch_idx, batch in enumerate(make_batches(TrainLoaders)):

        # a batch is (labels, images); only the images are needed here
        img = batch[1].cuda() if use_cuda else batch[1]

        step_loss = svi.step(img) / scale
        epoch_loss += step_loss
        window.append(step_loss)

        # close the window on every 10th batch index (except index 0)
        if batch_idx > 0 and batch_idx % 10 == 0:
            # compute the mean of losses, for making a plot
            loss_list.append(np.mean(np.array(window)))
            window = []

    print("loss at {} is {}".format(t, epoch_loss))
    return epoch_loss

In [None]:
def test(n = 0, use_train=False, verbose=False, use_cuda=False):
    """
    Evaluate the two guide networks and print a summary line.

    n:         number of freshly generated captchas to test on
               (ignored when use_train is True)
    use_train: test on the training captchas instead of new ones
    verbose:   NOTE the flag is inverted with respect to its name: the
               per-sample lines are printed when verbose is False. This is
               kept as is because test_cycle() relies on it.
    use_cuda:  move the images (and N) to the GPU

    A sample counts as correct when the predicted number of letters and all
    letters match. An empty test set is reported with accuracies of 0.0.
    """
    if use_train:
        raw_samples = captcha_generated
    else:
        raw_samples = generate_random_captcha(n, save=False)
    TestLoaders = make_loarders(BATCH_SIZE=1, raw_samples=raw_samples)

    total_correct = 0
    char_correct = 0
    total_char = 0
    all_batches = make_batches(TestLoaders)

    # evaluation only: no need to build autograd graphs
    with torch.no_grad():
        for sample in all_batches:

            # loaders use a batch size of 1, so take the only label
            label = sample[0][0]
            img = sample[1]

            if use_cuda:
                img = img.cuda()

            # sample the number of letters
            num_p = captchaModel.numNet(img)
            N_index = dist.Categorical(num_p[0]).sample()
            N = N_index + captchaModel.num_char_domain[0]

            if use_cuda:
                N = N.cuda()

            charP = captchaModel.charNetSingle(img, N)[0] # size (N, len(char_dict))

            # sample one letter per position
            chars = ""
            for i in range(int(N)):
                c_index = int(dist.Categorical(charP[i]).sample())
                chars += captchaModel.char_dict[c_index]

            # position-wise matches; zip stops at the shorter string
            correct = sum(1 for p_char, t_char in zip(chars, label) if p_char == t_char)

            if not verbose:
                print("N_predicted:", int(N), "Actual N:", len(label), "Predicted Text:", chars, "Actual Text:", label, "Correct:", correct)
            if correct == len(label) and int(N) == len(label):
                total_correct += 1
            char_correct += correct
            total_char += len(label)

    num_test_samples = len(all_batches)
    # guard against an empty test set (e.g. the default n = 0)
    accuracy = total_correct / num_test_samples if num_test_samples else 0.0
    char_accuracy = char_correct / total_char if total_char else 0.0
    print("use_train =", use_train, "Total correct:", total_correct, "accuracy:{}/{}=".format(total_correct, num_test_samples), accuracy, "char_accuracy:{}/{}=".format(char_correct, total_char), char_accuracy)


In [None]:
# Train, with periodic evaluation via test_cycle().
optimize(use_cuda=USE_CUDA)

Optimizing...


  input = module(input)


loss at 1 is 14151975248596.16
loss at 2 is 14053647043300.506
loss at 3 is 13965947429900.705
loss at 4 is 13867707186020.354
loss at 5 is 14054502605690.3
at 5 step loss is 14018755902701.604
use_train = True Total correct: 85 accuracy:85/6000= 0.014166666666666666 char_accuracy:4504/15089= 0.29849559281595867
use_train = False Total correct: 9 accuracy:9/1000= 0.009 char_accuracy:698/2540= 0.2748031496062992
N_predicted: 4 Actual N: 3 Predicted Text: ccaa Actual Text: abc Correct: 0
N_predicted: 4 Actual N: 3 Predicted Text: ccca Actual Text: bac Correct: 1
N_predicted: 3 Actual N: 4 Predicted Text: ccc Actual Text: aaca Correct: 1
N_predicted: 4 Actual N: 1 Predicted Text: ccca Actual Text: a Correct: 0
N_predicted: 4 Actual N: 2 Predicted Text: aacc Actual Text: ac Correct: 1
N_predicted: 3 Actual N: 3 Predicted Text: cac Actual Text: bab Correct: 1
N_predicted: 4 Actual N: 4 Predicted Text: caac Actual Text: cbac Correct: 3
N_predicted: 2 Actual N: 4 Predicted Text: cc Actual Tex

In [None]:
# Plot the windowed mean training losses collected in loss_list.
fig, ax = plt.subplots()
ax.plot(loss_list)
ax.set_title("loss")
plt.show()