In [91]:
import sys
sys.path.insert(1, '/home/furqan/.pyenv/versions/3.8.5/lib/python3.8/site-packages')

import albumentations
import torch

import os
import glob
import numpy as np
from sklearn import model_selection

from PIL import Image
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


In [109]:
class ClassificationDataset:
    def __init__(self, image_paths, targets, resize=None):
        # resize = (height, width)
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize

        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
         # albumentations is relatd to augmenting the image, such as resizing, rotating and normalizing etc.
        self.aug = albumentations.Compose(
            [
                albumentations.Normalize(
                    mean, std, max_pixel_value=255.0, always_apply=True
                )
            ]
        )

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        image = Image.open(self.image_paths[item]).convert("RGB")
        targets = self.targets[item]

        if self.resize is not None:
            image = image.resize(
                (self.resize[1], self.resize[0]), resample=Image.BILINEAR
            )

        image = np.array(image)
        augmented = self.aug(image=image)
        image = augmented["image"]
        # transpose is used to bring the channels dim first and width and height later. 
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return {
            "images": torch.tensor(image, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.long),
        }

In [74]:
dataset_path = "/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2"
image_files = glob.glob(os.path.join(dataset_path, "*.png"))

print(image_files)
# dataset = ClassificationDataset(image_path=dataset_path, )

['/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/nf2n8.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/537nf.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/defyx.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/m3b5p.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/dyxnc.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/fyfbn.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/3cpwb.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/nxx25.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/n5x2n.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/wye85.png', '/home/furqan/Desktop/python_work/My_udemy_course/Chap1_OCR/captcha_images_v2/2w4y7.png', '/home/fu

In [75]:
for i in image_files:
    j = i.split("/")
    print(j[-1][:-4])

nf2n8
537nf
defyx
m3b5p
dyxnc
fyfbn
3cpwb
nxx25
n5x2n
wye85
2w4y7
xe8xm
6cm6m
8npe3
wc2bd
68x48
pcm7f
36nx4
f2m8n
mxnw4
f858x
8g4yp
3nfdn
fp3wy
bd3b7
w75w8
dpbyd
7m8px
28348
865wm
b35f6
5nm6d
7fde7
p8wwf
yyn57
mgw3n
2npg6
4n2yg
c6745
x76mn
d378n
b2g8e
bpwd7
xxw44
n7ebx
3ym7f
f2fge
n6xc5
mxyxw
xyncc
ddmyg
yf424
nfbg8
7dgc2
6dd2y
5bgp2
5nxnn
nbfx5
662bw
cewnm
m2576
nc4yg
dn2ym
ef4np
33p4e
x6pdb
gnbn4
268g2
23n88
88y52
728n8
b5dn4
e84n2
b84xc
ecd4w
e6b7y
wd2gb
f35xp
mg5nn
b5nmm
exycn
7g3nf
8n62n
3x5fm
e7nx4
6bxwg
y5n6d
x3fwf
c7nn8
xfn6n
3ygde
n464c
e46yw
8bbw8
deep5
4gb3f
pyefb
5nggg
5nnff
bnc5f
6wb76
gfp54
m3wfw
34fxm
dn26n
658xe
6e2dg
mn5c4
7wyp4
bny23
mc35n
5p3mm
5n3w4
p2x7x
p2m6n
nf7bn
n3bm6
x38fn
dd5w5
ypp8f
264m5
wgnwp
gy8xb
fy2nd
nnn5p
yy824
xf5g7
pnmxf
428b6
3n2b4
3d7bd
6fg8c
be6np
x362g
23mdg
p57fn
bbymy
6n5fd
gc83b
pg4bf
xw465
bw5nf
ppx77
7e2y7
47m2b
d7nn3
xyyyw
g3ex3
ndecc
dmx8p
8xef7
8cm46
5ng6e
57b27
c8n8c
ny3dw
een23
7yf62
8d4wm
nd5wg
wm746
4w76g
fywb8
bm3p8
gfxcc
wnpec
ppwy

In [456]:
targets_orig = [i.split("/")[-1][:-4] for i in image_files]

In [457]:
targets_orig

['nf2n8',
 '537nf',
 'defyx',
 'm3b5p',
 'dyxnc',
 'fyfbn',
 '3cpwb',
 'nxx25',
 'n5x2n',
 'wye85',
 '2w4y7',
 'xe8xm',
 '6cm6m',
 '8npe3',
 'wc2bd',
 '68x48',
 'pcm7f',
 '36nx4',
 'f2m8n',
 'mxnw4',
 'f858x',
 '8g4yp',
 '3nfdn',
 'fp3wy',
 'bd3b7',
 'w75w8',
 'dpbyd',
 '7m8px',
 '28348',
 '865wm',
 'b35f6',
 '5nm6d',
 '7fde7',
 'p8wwf',
 'yyn57',
 'mgw3n',
 '2npg6',
 '4n2yg',
 'c6745',
 'x76mn',
 'd378n',
 'b2g8e',
 'bpwd7',
 'xxw44',
 'n7ebx',
 '3ym7f',
 'f2fge',
 'n6xc5',
 'mxyxw',
 'xyncc',
 'ddmyg',
 'yf424',
 'nfbg8',
 '7dgc2',
 '6dd2y',
 '5bgp2',
 '5nxnn',
 'nbfx5',
 '662bw',
 'cewnm',
 'm2576',
 'nc4yg',
 'dn2ym',
 'ef4np',
 '33p4e',
 'x6pdb',
 'gnbn4',
 '268g2',
 '23n88',
 '88y52',
 '728n8',
 'b5dn4',
 'e84n2',
 'b84xc',
 'ecd4w',
 'e6b7y',
 'wd2gb',
 'f35xp',
 'mg5nn',
 'b5nmm',
 'exycn',
 '7g3nf',
 '8n62n',
 '3x5fm',
 'e7nx4',
 '6bxwg',
 'y5n6d',
 'x3fwf',
 'c7nn8',
 'xfn6n',
 '3ygde',
 'n464c',
 'e46yw',
 '8bbw8',
 'deep5',
 '4gb3f',
 'pyefb',
 '5nggg',
 '5nnff',
 'bnc5f',


In [458]:
targets = [[c for c in x] for x in targets_orig]

In [459]:
targets

[['n', 'f', '2', 'n', '8'],
 ['5', '3', '7', 'n', 'f'],
 ['d', 'e', 'f', 'y', 'x'],
 ['m', '3', 'b', '5', 'p'],
 ['d', 'y', 'x', 'n', 'c'],
 ['f', 'y', 'f', 'b', 'n'],
 ['3', 'c', 'p', 'w', 'b'],
 ['n', 'x', 'x', '2', '5'],
 ['n', '5', 'x', '2', 'n'],
 ['w', 'y', 'e', '8', '5'],
 ['2', 'w', '4', 'y', '7'],
 ['x', 'e', '8', 'x', 'm'],
 ['6', 'c', 'm', '6', 'm'],
 ['8', 'n', 'p', 'e', '3'],
 ['w', 'c', '2', 'b', 'd'],
 ['6', '8', 'x', '4', '8'],
 ['p', 'c', 'm', '7', 'f'],
 ['3', '6', 'n', 'x', '4'],
 ['f', '2', 'm', '8', 'n'],
 ['m', 'x', 'n', 'w', '4'],
 ['f', '8', '5', '8', 'x'],
 ['8', 'g', '4', 'y', 'p'],
 ['3', 'n', 'f', 'd', 'n'],
 ['f', 'p', '3', 'w', 'y'],
 ['b', 'd', '3', 'b', '7'],
 ['w', '7', '5', 'w', '8'],
 ['d', 'p', 'b', 'y', 'd'],
 ['7', 'm', '8', 'p', 'x'],
 ['2', '8', '3', '4', '8'],
 ['8', '6', '5', 'w', 'm'],
 ['b', '3', '5', 'f', '6'],
 ['5', 'n', 'm', '6', 'd'],
 ['7', 'f', 'd', 'e', '7'],
 ['p', '8', 'w', 'w', 'f'],
 ['y', 'y', 'n', '5', '7'],
 ['m', 'g', 'w', '3'

In [460]:
kk  = []
for i in targets_orig:
    for j in i:
        kk.append(j)

In [461]:
kk

['n',
 'f',
 '2',
 'n',
 '8',
 '5',
 '3',
 '7',
 'n',
 'f',
 'd',
 'e',
 'f',
 'y',
 'x',
 'm',
 '3',
 'b',
 '5',
 'p',
 'd',
 'y',
 'x',
 'n',
 'c',
 'f',
 'y',
 'f',
 'b',
 'n',
 '3',
 'c',
 'p',
 'w',
 'b',
 'n',
 'x',
 'x',
 '2',
 '5',
 'n',
 '5',
 'x',
 '2',
 'n',
 'w',
 'y',
 'e',
 '8',
 '5',
 '2',
 'w',
 '4',
 'y',
 '7',
 'x',
 'e',
 '8',
 'x',
 'm',
 '6',
 'c',
 'm',
 '6',
 'm',
 '8',
 'n',
 'p',
 'e',
 '3',
 'w',
 'c',
 '2',
 'b',
 'd',
 '6',
 '8',
 'x',
 '4',
 '8',
 'p',
 'c',
 'm',
 '7',
 'f',
 '3',
 '6',
 'n',
 'x',
 '4',
 'f',
 '2',
 'm',
 '8',
 'n',
 'm',
 'x',
 'n',
 'w',
 '4',
 'f',
 '8',
 '5',
 '8',
 'x',
 '8',
 'g',
 '4',
 'y',
 'p',
 '3',
 'n',
 'f',
 'd',
 'n',
 'f',
 'p',
 '3',
 'w',
 'y',
 'b',
 'd',
 '3',
 'b',
 '7',
 'w',
 '7',
 '5',
 'w',
 '8',
 'd',
 'p',
 'b',
 'y',
 'd',
 '7',
 'm',
 '8',
 'p',
 'x',
 '2',
 '8',
 '3',
 '4',
 '8',
 '8',
 '6',
 '5',
 'w',
 'm',
 'b',
 '3',
 '5',
 'f',
 '6',
 '5',
 'n',
 'm',
 '6',
 'd',
 '7',
 'f',
 'd',
 'e',
 '7',
 'p',
 '8'

In [462]:
###########   OR ###### 
# try below code.


targets_flat = [c for clist in targets for c in clist]

In [463]:
targets_flat

['n',
 'f',
 '2',
 'n',
 '8',
 '5',
 '3',
 '7',
 'n',
 'f',
 'd',
 'e',
 'f',
 'y',
 'x',
 'm',
 '3',
 'b',
 '5',
 'p',
 'd',
 'y',
 'x',
 'n',
 'c',
 'f',
 'y',
 'f',
 'b',
 'n',
 '3',
 'c',
 'p',
 'w',
 'b',
 'n',
 'x',
 'x',
 '2',
 '5',
 'n',
 '5',
 'x',
 '2',
 'n',
 'w',
 'y',
 'e',
 '8',
 '5',
 '2',
 'w',
 '4',
 'y',
 '7',
 'x',
 'e',
 '8',
 'x',
 'm',
 '6',
 'c',
 'm',
 '6',
 'm',
 '8',
 'n',
 'p',
 'e',
 '3',
 'w',
 'c',
 '2',
 'b',
 'd',
 '6',
 '8',
 'x',
 '4',
 '8',
 'p',
 'c',
 'm',
 '7',
 'f',
 '3',
 '6',
 'n',
 'x',
 '4',
 'f',
 '2',
 'm',
 '8',
 'n',
 'm',
 'x',
 'n',
 'w',
 '4',
 'f',
 '8',
 '5',
 '8',
 'x',
 '8',
 'g',
 '4',
 'y',
 'p',
 '3',
 'n',
 'f',
 'd',
 'n',
 'f',
 'p',
 '3',
 'w',
 'y',
 'b',
 'd',
 '3',
 'b',
 '7',
 'w',
 '7',
 '5',
 'w',
 '8',
 'd',
 'p',
 'b',
 'y',
 'd',
 '7',
 'm',
 '8',
 'p',
 'x',
 '2',
 '8',
 '3',
 '4',
 '8',
 '8',
 '6',
 '5',
 'w',
 'm',
 'b',
 '3',
 '5',
 'f',
 '6',
 '5',
 'n',
 'm',
 '6',
 'd',
 '7',
 'f',
 'd',
 'e',
 '7',
 'p',
 '8'

In [464]:
from sklearn import preprocessing 

lbl_enc = preprocessing.LabelEncoder()  
# here we r encoding the labels. 
lbl_enc.fit(targets_flat)

LabelEncoder()

In [465]:
targets_enc = [lbl_enc.transform(x) for x in targets]
# targets_enc = np.array(targets_enc) + 1

In [466]:
targets_enc

[array([14, 11,  0, 14,  6]),
 array([ 3,  1,  5, 14, 11]),
 array([ 9, 10, 11, 18, 17]),
 array([13,  1,  7,  3, 15]),
 array([ 9, 18, 17, 14,  8]),
 array([11, 18, 11,  7, 14]),
 array([ 1,  8, 15, 16,  7]),
 array([14, 17, 17,  0,  3]),
 array([14,  3, 17,  0, 14]),
 array([16, 18, 10,  6,  3]),
 array([ 0, 16,  2, 18,  5]),
 array([17, 10,  6, 17, 13]),
 array([ 4,  8, 13,  4, 13]),
 array([ 6, 14, 15, 10,  1]),
 array([16,  8,  0,  7,  9]),
 array([ 4,  6, 17,  2,  6]),
 array([15,  8, 13,  5, 11]),
 array([ 1,  4, 14, 17,  2]),
 array([11,  0, 13,  6, 14]),
 array([13, 17, 14, 16,  2]),
 array([11,  6,  3,  6, 17]),
 array([ 6, 12,  2, 18, 15]),
 array([ 1, 14, 11,  9, 14]),
 array([11, 15,  1, 16, 18]),
 array([7, 9, 1, 7, 5]),
 array([16,  5,  3, 16,  6]),
 array([ 9, 15,  7, 18,  9]),
 array([ 5, 13,  6, 15, 17]),
 array([0, 6, 1, 2, 6]),
 array([ 6,  4,  3, 16, 13]),
 array([ 7,  1,  3, 11,  4]),
 array([ 3, 14, 13,  4,  9]),
 array([ 5, 11,  9, 10,  5]),
 array([15,  6, 16, 

In [467]:
targets_enc_1 = np.array(targets_enc) + 1

In [468]:
targets_enc_1

array([[15, 12,  1, 15,  7],
       [ 4,  2,  6, 15, 12],
       [10, 11, 12, 19, 18],
       ...,
       [14,  1,  2,  8, 16],
       [12,  6,  9, 11, 19],
       [ 6, 10,  3,  3, 14]])

In [469]:
# print(len(lbl_enc.classes_))    ### 19 unique classes hain.

In [470]:
train_imgs, test_imgs, train_targets, test_targets, _, test_orig_targets = model_selection.train_test_split(image_files, targets_enc, targets_orig, test_size=0.1, random_state=42)
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [471]:
len(train_imgs)
# len(image_files)
# len(targets_enc)
# len(targets_orig)
len(test_imgs)
# len(test_imgs)
# len(test_orig_targets)
# len(test_orig_targets)

104

In [472]:
train_dataset = ClassificationDataset(image_paths=train_imgs, targets=train_targets, resize = (75, 300)) # resize = (height, width)

In [473]:
BATCH_SIZE = 8 
IMAGE_WIDTH = 300
IMAGE_HEIGHT = 75
NUM_WORKERS = 8
EPOCHS = 30

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = BATCH_SIZE,
                                           num_workers = NUM_WORKERS, shuffle = True)



In [474]:
test_dataset = ClassificationDataset(test_imgs, test_targets, resize= (75, 300))

In [475]:
test_dataset = ClassificationDataset(
        image_paths = test_imgs,
        targets = test_targets, 
        resize = (IMAGE_HEIGHT, IMAGE_WIDTH),
    )

In [476]:
test_loader= torch.utils.data.DataLoader(test_dataset, batch_size= BATCH_SIZE, num_workers= NUM_WORKERS, shuffle=False)

In [477]:
train_dataset[935]["images"].shape

torch.Size([3, 75, 300])

In [478]:
train_dataset[935]["targets"].shape

torch.Size([5])

# Modeling

In [509]:
import torch
from torch import nn
from torch.nn import functional as F

In [566]:
class CaptchaModel(nn.Module):
    def __init__(self, num_chars):
        super(CaptchaModel, self).__init__()
        self.conv1 = nn.Conv2d(3, 128, kernel_size=(3, 6), padding=(1, 1))
        self.pool1 = nn.MaxPool2d(kernel_size= (2, 2))
        
        self.conv2 = nn.Conv2d(128, 64, kernel_size=(3, 6), padding=(1, 1))
        self.pool2 = nn.MaxPool2d(kernel_size= (2, 2))
        
        self.linear1 = nn.Linear(768, 64)  # linear1 ka jo 1st parameter hai wo dependent hai relu k output par. Yani relu krny k baad jo image ki shape hogi hme linear ka first dim us hisaab sy rkhna pary ga. 
        self.drop1 = nn.Dropout(0.2)
        
        self.gru = nn.GRU(64, 32, bidirectional=True, num_layers = 2, dropout=0.25, batch_first = True)
        self.output = nn.Linear(64, num_chars + 1)
        
    def forward(self, images, targets=None):  # images mean yha training data pass hoga. 
        bs, _,_, _ = images.size()
        x = F.relu(self.conv1(images))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = x.permute(0, 3, 1, 2) # permute is used to manupulate dimensions in pytorch. 
        x = x.view(bs, x.size(1), -1)  # -1 mean, baki dimensions ko mila ka ak column bna do. so like it is used to reduce 4 dimensions into 2 dimensions. 
        # (1, 47, 768)
        x = F.relu(self.linear1(x))
        x = self.drop1(x)
        x, _ = self.gru(x) # yha dash ki jga par hidden state output return hota hai. 1st parameter me actual output return hota hai which u can feed into next layer. 
        x = self.output(x)
        x = x.permute(1, 0, 2)
        
        if targets is not None:
            log_probs = F.log_softmax(x, 2)
            input_lengths = torch.full(
                size=(bs,), fill_value=log_probs.size(0), dtype=torch.int32  # log_probs.size(0) : 0
            )  # there is only one value which is 47
            
            target_lengths = torch.full(
                size=(bs,), fill_value=targets.size(1), dtype=torch.int32
            )
            
            loss = nn.CTCLoss(blank=0)(
                log_probs, targets, input_lengths, target
            )
            return target_lengths
        return x
            

In [567]:
img = torch.rand((1, 3, 50, 200))
img.size()
# cm(img, torch.rand((1, 5)))

torch.Size([1, 3, 50, 200])

In [568]:
target = torch.randint(1, 20, (1, 5))
target
# target = torch.rand((1, 5))
# target

tensor([[ 4,  8,  1,  4, 10]])

In [569]:
cm = CaptchaModel(19)  # 19 is the length of target. i.e. there are 19 unique letters. 

In [570]:
cm(img, target)   # yha par return hamesha forward k function me jo cheez return ho rhi hogi. VO return hoga. 

tensor([5], dtype=torch.int32)

In [540]:
# torch.Size([1, 47, 64])

In [344]:
# torch.Size([47, 1, 20])