# Utilities

In [1]:
#Import some libraries
from __future__ import absolute_import, division, print_function, unicode_literals
import pathlib
import os
import fnmatch
import cv2
import numpy as np
import string
import time
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, LSTM, Reshape, BatchNormalization, Input, Conv2D, MaxPool2D, Lambda, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.activations import relu, sigmoid, softmax
import tensorflow.keras.backend as K
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import CSVLogger, TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
print(tf.__version__)


2.3.0


In [2]:
# Install kaggle
! pip install -q kaggle
#Upload json file
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [3]:
#Download dataset
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download -d fournierp/captcha-version-2-images

Downloading captcha-version-2-images.zip to /content
 52% 9.00M/17.4M [00:02<00:02, 3.52MB/s]
100% 17.4M/17.4M [00:02<00:00, 8.05MB/s]


In [4]:
# Unzip data
! mkdir /data
! unzip /content/captcha-version-2-images.zip -d /data

Archive:  /content/captcha-version-2-images.zip
  inflating: /data/samples/226md.png  
  inflating: /data/samples/22d5n.png  
  inflating: /data/samples/2356g.png  
  inflating: /data/samples/23mdg.png  
  inflating: /data/samples/23n88.png  
  inflating: /data/samples/243mm.png  
  inflating: /data/samples/244e2.png  
  inflating: /data/samples/245y5.png  
  inflating: /data/samples/24f6w.png  
  inflating: /data/samples/24pew.png  
  inflating: /data/samples/25257.png  
  inflating: /data/samples/253dc.png  
  inflating: /data/samples/25egp.png  
  inflating: /data/samples/25m6p.png  
  inflating: /data/samples/25p2m.png  
  inflating: /data/samples/25w53.png  
  inflating: /data/samples/264m5.png  
  inflating: /data/samples/268g2.png  
  inflating: /data/samples/28348.png  
  inflating: /data/samples/28x47.png  
  inflating: /data/samples/2b827.png  
  inflating: /data/samples/2bg48.png  
  inflating: /data/samples/2cegf.png  
  inflating: /data/samples/2cg58.png  
  inflating: /da

In [23]:
# Getting paths of all images
DATA_DIR = '/data/samples/samples'
# os.listdir returns entries in arbitrary order. Sorting pins the order so
# that the seeded train/test split performed later picks the same files on
# every run and on every filesystem.
all_image_path = [os.path.join(DATA_DIR, x) for x in sorted(os.listdir(DATA_DIR))]
len(all_image_path)

1070

In [24]:
# Map every image path to its label: the file name without its extension
dict_path_label = {
    str(image_path): os.path.basename(os.path.splitext(image_path)[0])
    for image_path in all_image_path
}

# The alphabet is the sorted set of characters that occur in any label
all_characters = sorted(set().union(*dict_path_label.values()))
print('Total number of characters is: ' + str(len(all_characters)))
print('All characters include: ' + "".join(all_characters))

Total number of characters is: 19
All characters include: 2345678bcdefgmnpwxy


In [25]:
# Match characters to index
def encode_to_labels(txt, characters=None):
    """Convert a label string into a list of alphabet indices.

    Args:
        txt: Label text to encode, one character at a time.
        characters: Ordered sequence of known characters; a character's
            position in this sequence is its code. Defaults to the
            module-level ``all_characters``.

    Returns:
        List of integer indices in the order of ``txt``. Characters that are
        not part of the alphabet are reported on stdout and skipped, so the
        result can be shorter than ``txt``.
    """
    if characters is None:
        characters = all_characters
    dig_lst = []
    for char in txt:
        try:
            dig_lst.append(characters.index(char))
        except ValueError:
            # Only "character not in alphabet" is expected here; any other
            # error (including KeyboardInterrupt) must propagate.
            print("No found in all_characters :", char)
    return dig_lst

# Smoke test: 'o' is not in the alphabet, so it is reported and skipped
encode_to_labels('2324obn')

No found in all_characters : o


[0, 1, 0, 2, 7, 14]

In [26]:
# Get the widths and heights of the captcha pictures
widths = []
heights = []
for path in all_image_path:
    # cv2.imread reports an unreadable or missing file by returning None
    # rather than raising, so check for that explicitly instead of hiding
    # every possible error behind a bare except.
    img = cv2.imread(path)
    if img is None:
        print(path)
        continue
    # With imread's default flags the array is (height, width, channels)
    height, width = img.shape[:2]
    heights.append(height)
    widths.append(width)

In [27]:
# Summarise the range of image sizes collected above
min_height, max_height = min(heights), max(heights)
min_width, max_width = min(widths), max(widths)
print(min_height, max_height, min_width, max_width)

50 50 200 200


# Preprocessing

In [28]:
# Hold out a fraction of the images for validation; the fixed random_state
# makes the split repeatable for a given ordering of all_image_path
test_size = 0.1
train_image_path, test_image_path = train_test_split(
    all_image_path,
    test_size=test_size,
    random_state=42,
)

In [30]:
# INPUT PIPELINE FOR THE IMAGES
# Length of the prediction sequence handed to the CTC loss for every sample.
# It has to equal the number of time steps the network emits: an input width
# of 72 halved by three 2x2 poolings gives 9.
TIME_STEPS = 9

# OpenCV sizes are (width, height); the network input is 24 rows x 72 columns.
# Defined once here because it does not change between images.
dim = (72, 24)

# lists for training dataset
training_img = []
training_txt = []
train_input_length = []
train_label_length = []
orig_txt = []

i = 0  # stays 0 (and defined) when there are no training images
for i, path in enumerate(train_image_path, start=1):
    # Read input image and preprocess
    raw = cv2.imread(path)
    if raw is None:
        # imread returns None instead of raising; stop here and name the file
        # rather than failing with an opaque error inside cvtColor.
        raise ValueError("Could not read image: {}".format(path))
    img = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
    img = np.expand_dims(img, axis=2)  # add the channel axis -> (24, 72, 1)
    img = img / 255.                   # scale pixel values to [0, 1]

    # Getting the label for corresponding image
    label = dict_path_label[path]
    orig_txt.append(label)

    # Word to digit
    encoded = encode_to_labels(label)
    training_txt.append(encoded)
    # Use the encoded length: encode_to_labels skips unknown characters, and
    # the CTC label length must describe the sequence that is actually fed in.
    train_label_length.append(len(encoded))

    train_input_length.append(TIME_STEPS)
    training_img.append(img)

    if i % 500 == 0:
        print("has processed trained {} files".format(i))

has processed trained 500 files


In [31]:
#Preprocessing validation set
# OpenCV sizes are (width, height); the network input is 24 rows x 72 columns.
# Defined once here because it does not change between images.
dim = (72, 24)

valid_img = []
valid_txt = []
valid_input_length = []
valid_label_length = []
valid_orig_txt = []

i = 0  # stays 0 (and defined) when there are no validation images
for i, path in enumerate(test_image_path, start=1):
    # Read input image and preprocess
    raw = cv2.imread(path)
    if raw is None:
        # imread returns None instead of raising; stop here and name the file
        # rather than failing with an opaque error inside cvtColor.
        raise ValueError("Could not read image: {}".format(path))
    img = cv2.cvtColor(raw, cv2.COLOR_BGR2GRAY)
    img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
    img = np.expand_dims(img, axis=2)  # add the channel axis -> (24, 72, 1)
    img = img / 255.                   # scale pixel values to [0, 1]

    # Getting the label for corresponding image
    label = dict_path_label[path]
    valid_orig_txt.append(label)

    # Word to digit
    encoded = encode_to_labels(label)
    valid_txt.append(encoded)
    # Use the encoded length: encode_to_labels skips unknown characters, and
    # the CTC label length must describe the sequence that is actually fed in.
    valid_label_length.append(len(encoded))

    valid_input_length.append(TIME_STEPS)
    valid_img.append(img)

    if i % 500 == 0:
        print("has processed test {} files".format(i))

In [32]:
# Right-pad every encoded label with zeros to one common length
max_label_len = TIME_STEPS
train_padded_txt, valid_padded_txt = (
    pad_sequences(encoded, maxlen=max_label_len, padding='post', value=0)
    for encoded in (training_txt, valid_txt)
)

In [33]:
# Turn the Python lists built above into numpy arrays (training, then validation)
training_img, train_input_length, train_label_length = (
    np.array(values)
    for values in (training_img, train_input_length, train_label_length)
)

valid_img, valid_input_length, valid_label_length = (
    np.array(values)
    for values in (valid_img, valid_input_length, valid_label_length)
)

# Model building

In [34]:
# Model building: a convolutional feature extractor followed by two
# bidirectional LSTMs and a per-time-step softmax. Shapes in the comments
# omit the batch axis and are (height, width, channels).
inputs = Input(shape=(24,72,1))  # grayscale image, 24 rows x 72 columns
conv_1 = Conv2D(64, (3,3), activation = 'relu', padding='same')(inputs)
pool_1 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_1)  # -> (12, 36, 64)
conv_2 = Conv2D(128, (3,3), activation = 'relu', padding='same')(pool_1)
pool_2 = MaxPool2D(pool_size=(2, 2), strides=2)(conv_2)  # -> (6, 18, 128)
conv_3 = Conv2D(256, (3,3), activation = 'relu', padding='same')(pool_2)
pool_3 = MaxPool2D(pool_size=(2, 2))(conv_3)  # -> (3, 9, 256)
batch_norm_3 = BatchNormalization()(pool_3)
conv_4 = Conv2D(256, (3,3), activation = 'relu', padding='same')(batch_norm_3)
batch_norm_5 = BatchNormalization()(conv_4)
conv_6 = Conv2D(512, (3,3), activation = 'relu', padding='same')(batch_norm_5)
batch_norm_6 = BatchNormalization()(conv_6)
# Pool only along the height so the 3 remaining rows collapse to 1 while the
# 9 columns are kept as the sequence axis -> (1, 9, 512)
pool_6 = MaxPool2D(pool_size=(3, 1))(batch_norm_6)
# Drop the singleton height axis -> (9, 512): 9 time steps of 512 features
squeezed = Lambda(lambda x: K.squeeze(x, 1))(pool_6)
# return_sequences=True keeps one output per time step, as CTC requires
blstm_1 = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.2))(squeezed)
blstm_2 = Bidirectional(LSTM(256, return_sequences=True, dropout = 0.2))(blstm_1)
# One class per known character plus one extra class for the CTC blank
outputs = Dense(len(all_characters)+1, activation = 'softmax')(blstm_2)

# Image -> per-time-step character probabilities
model = Model(inputs, outputs)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 24, 72, 1)]       0         
_________________________________________________________________
conv2d (Conv2D)              (None, 24, 72, 64)        640       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 12, 36, 64)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 12, 36, 128)       73856     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 18, 128)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 6, 18, 256)        295168    
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 3, 9, 256)        

In [35]:
# Additional inputs that the CTC loss needs next to the image itself
labels = Input(shape=[max_label_len], dtype='float32', name='the_labels')
input_length = Input(shape=[1], dtype='int64', name='input_length')
label_length = Input(shape=[1], dtype='int64', name='label_length')


def ctc_lambda_func(args):
    """Compute the CTC batch cost from (y_pred, labels, input_length, label_length)."""
    predictions, true_labels, prediction_length, true_length = args
    return K.ctc_batch_cost(true_labels, predictions, prediction_length, true_length)


# Wrap the loss in a layer so it becomes the output of the training graph
ctc_layer = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')
loss_out = ctc_layer([outputs, labels, input_length, label_length])

model = Model(inputs=[inputs, labels, input_length, label_length], outputs=loss_out)
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 24, 72, 1)]  0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 24, 72, 64)   640         input_1[0][0]                    
__________________________________________________________________________________________________
max_pooling2d (MaxPooling2D)    (None, 12, 36, 64)   0           conv2d[0][0]                     
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 12, 36, 128)  73856       max_pooling2d[0][0]              
_______________________________________________________________________________________

In [36]:
# Training callbacks, all driven by the validation loss

# Keep the weights of the best epoch on disk
checkpoint_cb = ModelCheckpoint(
    filepath='checkpoint_weights.hdf5',
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop after 15 epochs without improvement and roll back to the best weights
early_stopping_cb = EarlyStopping(
    monitor='val_loss',
    min_delta=1e-8,
    patience=15,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.2 after 2 epochs without improvement
reduce_lr_cb = ReduceLROnPlateau(
    monitor='val_loss',
    min_delta=1e-8,
    factor=0.2,
    patience=2,
    verbose=1,
)

callbacks = [checkpoint_cb, early_stopping_cb, reduce_lr_cb]

In [37]:
# Compiling the model
# The layer named 'ctc' already outputs the CTC loss, so the Keras loss for
# that output simply passes y_pred through and ignores y_true.
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer = 'adam')
# Alias under which the callbacks are handed to model.fit
callbacks_list = callbacks

In [40]:
# choose batchsize and epochs
batch_size = 32
epochs = 100

# Inputs in the order expected by the training model:
# image, padded label, prediction length, label length.
train_inputs = [training_img, train_padded_txt, train_input_length, train_label_length]
valid_inputs = [valid_img, valid_padded_txt, valid_input_length, valid_label_length]

# The targets are all-zero placeholders: the compiled loss returns the model
# output (the CTC loss itself) and never looks at y_true.
# Keep the returned History object so the per-epoch loss / val_loss values
# stay available after training instead of only echoing its repr.
history = model.fit(x=train_inputs,
                    y=np.zeros(len(training_img)),
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(valid_inputs, [np.zeros(len(valid_img))]),
                    verbose=1,
                    callbacks=callbacks_list)

Epoch 1/100
Epoch 00001: val_loss did not improve from 2.50040
Epoch 2/100
Epoch 00002: val_loss improved from 2.50040 to 2.10511, saving model to checkpoint_weights.hdf5
Epoch 3/100
Epoch 00003: val_loss improved from 2.10511 to 1.98333, saving model to checkpoint_weights.hdf5
Epoch 4/100
Epoch 00004: val_loss improved from 1.98333 to 1.95140, saving model to checkpoint_weights.hdf5
Epoch 5/100
Epoch 00005: val_loss improved from 1.95140 to 1.92489, saving model to checkpoint_weights.hdf5
Epoch 6/100
Epoch 00006: val_loss improved from 1.92489 to 1.88515, saving model to checkpoint_weights.hdf5
Epoch 7/100
Epoch 00007: val_loss did not improve from 1.88515
Epoch 8/100
Epoch 00008: val_loss did not improve from 1.88515

Epoch 00008: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
Epoch 9/100
Epoch 00009: val_loss did not improve from 1.88515
Epoch 10/100
Epoch 00010: val_loss did not improve from 1.88515

Epoch 00010: ReduceLROnPlateau reducing learning rate to 8.00

<tensorflow.python.keras.callbacks.History at 0x7fe38a6cd7f0>

In [41]:
# Save the model
# NOTE(review): at this point `model` is the CTC training wrapper (its output
# is the 'ctc' loss layer), not the image -> character-probabilities network.
# Reloading this file presumably needs the Lambda functions and the
# pass-through loss supplied again (or compile=False) — TODO confirm.
model.save('mymodel.h5')