In [None]:
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 27 18:08:54 2022

@author: Debabrata Ghorai

Objective: Handwritten text recognition using TensorFlow, Keras, and the IAM dataset

Dataset Used: https://fki.tic.heia-fr.ch/databases/iam-handwriting-database

References:
1) U. Marti and H. Bunke. The IAM-database: An English Sentence Database for Off-line Handwriting Recognition. Int. Journal on Document Analysis and Recognition, Volume 5, pages 39 - 46, 2002.
2) https://github.com/sudoaditya/Handwritten-Text-Recognition
3) https://wandb.ai/authors/text-recognition-crnn-ctc/reports/Text-Recognition-With-CRNN-CTC-Network--VmlldzoxNTI5NDI
4) https://keras.io/examples/vision/handwriting_recognition/
"""

In [None]:
# import modules
import os
import warnings

import cv2
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

In [None]:
# Raise TensorFlow's logging threshold to ERROR so that INFO and WARNING
# messages are suppressed in the cell outputs.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [None]:
class CTCLayer(keras.layers.Layer):
    """Pass-through layer that attaches the CTC loss to the model.

    The layer receives the label batch and the per-time-step class
    probabilities, registers the CTC loss with ``add_loss`` and returns the
    probabilities unchanged.
    """

    def __init__(self, name=None, **kwargs):
        """Create the layer.

        Parameters
        ----------
        name : str, optional
            Layer name.
        **kwargs
            Remaining standard ``keras.layers.Layer`` arguments.
        """
        # Forward the standard Layer arguments (``trainable``, ``dtype``, ...).
        # Keras passes them back when it re-creates the layer from a saved
        # config; without ``**kwargs`` that reconstruction raises TypeError.
        super().__init__(name=name, **kwargs)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        """Register the CTC loss for the batch and return ``y_pred``.

        Parameters
        ----------
        y_true : tensor
            Label batch; axis 0 is the batch size and the size of axis 1 is
            used as the label length of every sample.
        y_pred : tensor
            Model output; the size of axis 1 is used as the input (time-step)
            length of every sample.

        Returns
        -------
        tensor
            ``y_pred``, unchanged.
        """
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        # ctc_batch_cost expects one length per sample, shaped (batch, 1);
        # every sample gets the same length here.
        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

        # Attach the loss to the layer so the model can be compiled without an
        # explicit loss function.
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # The predictions themselves pass through untouched.
        return y_pred

In [None]:
def get_data(root):
    """Collect image paths and transcriptions of the usable word samples.

    Reads ``<root>/words.txt``, keeps the non-comment lines whose second
    space-separated field is ``"ok"``, shuffles them with
    ``np.random.shuffle`` and resolves each entry to
    ``<root>/words/<a>/<a>-<b>/<name>.png``, where ``a`` and ``b`` are the
    first two dash-separated parts of the sample name.

    Parameters
    ----------
    root : str
        Dataset directory containing ``words.txt`` and the ``words`` folder.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(images, labels)``: paths of the image files that exist on disk
        and, at the same index, the last space-separated field of the
        matching metadata line.
    """
    imagepath = os.path.join(root, 'words')
    metadata = os.path.join(root, 'words.txt')

    wordslist = []
    # The context manager guarantees the metadata file is closed again.
    with open(metadata, "r") as words:
        for line in words:
            if line.startswith('#'):
                continue
            fields = line.split(" ")
            # Blank or truncated lines have no status field; skip them
            # instead of failing with IndexError.
            if len(fields) > 1 and fields[1] == "ok":
                wordslist.append(line)

    # Shuffle the retained lines in place.
    np.random.shuffle(wordslist)

    images = []
    labels = []
    for line in wordslist:
        fields = line.strip().split(" ")
        img_name = fields[0]
        parts = img_name.split("-")
        if len(parts) < 2:
            # The name does not follow the <a>-<b>-... layout, so no path can
            # be derived from it.
            continue
        img_path = os.path.join(imagepath, parts[0], parts[0] + "-" + parts[1], img_name + ".png")
        if os.path.exists(img_path):
            images.append(img_path)
            labels.append(fields[-1].strip())

    return images, labels

In [None]:
def image_process(image, width=None, height=None):
    """Load an image as grayscale, resize it and scale the pixels by 1/255.

    Parameters
    ----------
    image : str
        Path handed to ``cv2.imread``.
    width, height : int
        Target size in pixels. Both are required despite the ``None``
        defaults.

    Returns
    -------
    numpy.ndarray
        Resized image divided by 255.0, with a trailing channel axis of
        size 1 appended.

    Raises
    ------
    ValueError
        If ``width`` or ``height`` is missing.
    FileNotFoundError
        If ``cv2.imread`` could not read ``image``.
    """
    if width is None or height is None:
        raise ValueError("image_process requires both 'width' and 'height'")
    dim = (width, height)

    X = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals a missing or undecodable file by returning None
    # rather than raising; fail with a message that names the file.
    if X is None:
        raise FileNotFoundError("could not read image: {!r}".format(image))

    try:
        X = cv2.resize(X, dim, interpolation=cv2.INTER_AREA)
    except (cv2.error, TypeError):
        # Retry with 8-bit data in case the decoded dtype was rejected.
        X = cv2.resize(X.astype('uint8'), dim, interpolation=cv2.INTER_AREA)

    # scale
    X = X / 255.0
    return X.reshape(X.shape[0], X.shape[1], 1)

In [None]:
def get_vocabulary(label_data, islist=True):
    """Collect the distinct characters and the longest label length.

    Parameters
    ----------
    label_data : iterable of str
        Labels to scan.
    islist : bool
        When true (default) the characters are returned as a sorted list;
        otherwise they are joined into a single sorted string.

    Returns
    -------
    tuple
        ``(characters, max_word_len)``; ``max_word_len`` is 0 for empty input.
    """
    characters = set()
    max_word_len = 0
    for label in label_data:
        characters.update(label)
        max_word_len = max(max_word_len, len(label))

    # Set iteration order is not stable between interpreter runs, which would
    # silently change the character <-> index mapping and make previously
    # saved weights decode to the wrong characters. Sorting pins the order.
    ordered = sorted(characters)
    if not islist:
        return "".join(str(s) for s in ordered), max_word_len
    return ordered, max_word_len

In [None]:
def prepare_data(images, labels, char_to_num, max_word_len, width=None, height=None):
    """Turn image paths and text labels into model-ready tensors.

    Each image is loaded with ``image_process``, converted to a float32
    tensor and transposed with ``perm=[1, 0, 2]``. Each label is split into
    characters, mapped through ``char_to_num`` and right-padded with 999 up
    to ``max_word_len``.

    Parameters
    ----------
    images : iterable of str
        Image paths.
    labels : iterable of str
        Transcriptions, paired with ``images`` by position.
    char_to_num : callable
        Maps a tensor of characters to integer ids.
    max_word_len : int
        Length every encoded label is padded to.
    width, height : int
        Target image size, forwarded to ``image_process``.

    Returns
    -------
    tuple[list, list]
        ``(x_data, y_data)``. Samples that fail to load or encode are left
        out, so the lists can be shorter than the inputs.
    """
    x_data = []
    y_data = []
    skipped = 0
    first_failure = None
    for image, label in zip(images, labels):
        try:
            ix = image_process(image, width=width, height=height)
            ix = tf.convert_to_tensor(ix, dtype=tf.float32)
            ix = tf.transpose(ix, perm=[1, 0, 2])
            iy = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
            iy = tf.pad(iy, paddings=[[0, max_word_len - iy.shape[0]]], constant_values=999)
        except Exception as exc:
            # Keep the best-effort behaviour (skip the sample) but remember
            # what went wrong. Catching Exception rather than everything lets
            # KeyboardInterrupt stop a long-running loop.
            skipped += 1
            if first_failure is None:
                first_failure = "{!r}: {}".format(image, exc)
            continue
        x_data.append(ix)
        y_data.append(iy)

    if skipped:
        # One summary warning instead of silence or one message per sample.
        warnings.warn(
            "prepare_data skipped {} sample(s); first failure: {}".format(skipped, first_failure)
        )
    return x_data, y_data

In [None]:
def build_model(width, height, characters):
    """Assemble and compile the CNN + bidirectional-LSTM network with CTC loss.

    Parameters
    ----------
    width, height : int
        Size of the image input, declared as ``(width, height, 1)``.
    characters : sized collection
        Vocabulary; the output layer has ``len(characters) + 1`` units.

    Returns
    -------
    keras.models.Model
        Model named ``"ocr_model"`` with the inputs ``"image"`` and
        ``"label"``, compiled with Adam at a learning rate of 1e-4. The loss
        is supplied by the ``CTCLayer`` at the end of the graph.
    """
    # Settings shared by the three convolution layers.
    conv_opts = dict(activation="relu", kernel_initializer="he_normal", padding="same")

    # Inputs: the image and the encoded label (the label feeds the CTC loss).
    image_in = keras.layers.Input(shape=(width, height, 1), name="image", dtype="float32")
    label_in = keras.layers.Input(name="label", shape=(None,), dtype="float32")

    # Convolutional feature extractor with two 2x2 max-poolings.
    feats = keras.layers.Conv2D(32, (3, 3), name="Conv1", **conv_opts)(image_in)
    feats = keras.layers.Conv2D(64, (3, 3), name="Conv2", **conv_opts)(feats)
    feats = keras.layers.MaxPooling2D((2, 2), name="pool1")(feats)
    feats = keras.layers.Conv2D(128, (3, 3), name="Conv3", **conv_opts)(feats)
    feats = keras.layers.MaxPooling2D((2, 2), name="pool2")(feats)

    # Merge the last two axes so axis 1 becomes the sequence axis for the RNN.
    feat_shape = feats.shape
    seq = keras.layers.Reshape(
        target_shape=(feat_shape[1], feat_shape[2] * feat_shape[3]),
        name="reshape",
    )(feats)
    seq = keras.layers.Dense(64, activation="relu", name="dense1")(seq)
    seq = keras.layers.Dropout(0.2)(seq)

    # Two stacked bidirectional LSTMs: 128 units, then 64 units.
    for units in (128, 64):
        recurrent = keras.layers.LSTM(units, return_sequences=True, dropout=0.25)
        seq = keras.layers.Bidirectional(recurrent)(seq)

    # Per-step class probabilities: one unit per character plus one extra.
    probs = keras.layers.Dense(len(characters) + 1, activation="softmax", name="dense2")(seq)

    # The CTC layer registers the loss and passes the probabilities through.
    ctc_out = CTCLayer(name='ctc_loss')(label_in, probs)

    network = keras.models.Model(inputs=[image_in, label_in], outputs=ctc_out, name="ocr_model")
    network.compile(optimizer=Adam(learning_rate=0.0001))
    return network

In [None]:
def decode_batch_predictions(pred, max_len, num_to_char):
    """Greedy-decode a batch of network outputs into strings.

    Parameters
    ----------
    pred : array
        Batch of predictions; axis 0 is the batch and the size of axis 1 is
        used as the sequence length of every sample.
    max_len : int
        Decoded sequences are truncated to this many entries.
    num_to_char : callable
        Maps integer ids back to characters.

    Returns
    -------
    list[str]
        One decoded string per sample.
    """
    # Every sample uses the full number of time steps.
    seq_lengths = np.full(pred.shape[0], pred.shape[1], dtype=np.float64)

    # Greedy search; beam search is the alternative for harder tasks.
    decoded = keras.backend.ctc_decode(pred, input_length=seq_lengths, greedy=True)[0]
    sequences = decoded[0][:, :max_len]

    def _to_text(ids):
        # Entries equal to -1 are filtered out before mapping ids to characters.
        kept = tf.gather(ids, tf.where(tf.math.not_equal(ids, -1)))
        return tf.strings.reduce_join(num_to_char(kept)).numpy().decode("utf-8")

    return [_to_text(ids) for ids in sequences]

In [None]:
def model_training(images, labels, width, height, characters, char_to_num, max_word_len,
                   epochs=100, batch_size=64):
    """Prepare the data, build the model and train it with checkpointing.

    The prepared samples are split 80/20 into training and validation sets
    (``random_state=1``). The best model by ``val_loss`` is written to
    ``best_model.h5`` in the current working directory, and training stops
    early after 10 epochs without improvement.

    Parameters
    ----------
    images, labels : list of str
        Image paths and their transcriptions.
    width, height : int
        Image size used for preprocessing and for the model input.
    characters : sized collection
        Vocabulary, forwarded to ``build_model``.
    char_to_num : callable
        Character-to-id mapping, forwarded to ``prepare_data``.
    max_word_len : int
        Padded label length, forwarded to ``prepare_data``.
    epochs : int
        Maximum number of training epochs (default 100).
    batch_size : int
        Training batch size (default 64).

    Returns
    -------
    The history object returned by ``model.fit``.

    Raises
    ------
    ValueError
        If no sample could be prepared.
    """
    x_data, y_data = prepare_data(images, labels, char_to_num, max_word_len, width=width, height=height)
    if not x_data:
        raise ValueError("model_training: no sample could be prepared from the given images")

    # train/validation split of the model inputs
    x_train, x_valid, y_train, y_valid = train_test_split(x_data, y_data, test_size=0.2, random_state=1)

    # stack the per-sample tensors into batch tensors
    x_train = tf.stack(x_train)
    x_valid = tf.stack(x_valid)
    y_train = tf.stack(y_train)
    y_valid = tf.stack(y_valid)

    model = build_model(width, height, characters)
    model.summary()

    # set the callbacks
    checkpoint = keras.callbacks.ModelCheckpoint(filepath="best_model.h5", monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='min')
    earlystop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
    callbacks_list = [checkpoint, earlystop]

    # The model takes both image and label as *inputs* and has no separate
    # target. validation_data must therefore be the tuple (inputs, None): a
    # bare two-element list is read as (x, y), which would feed only the
    # images to a two-input model.
    history = model.fit(
        [x_train, y_train],
        validation_data=([x_valid, y_valid], None),
        epochs=epochs,
        batch_size=batch_size,
        callbacks=callbacks_list,
        verbose=1,
        )
    return history

In [None]:
def main(view_model_performance=True, view_pred_output=True, root=None):
    """Train the recogniser on half of the dataset and optionally inspect it.

    Parameters
    ----------
    view_model_performance : bool
        Plot the training and validation loss per epoch after training.
    view_pred_output : bool
        Decode the held-out half with the weights stored in ``best_model.h5``
        and show the first ten samples next to their labels.
    root : str, optional
        Dataset directory handed to ``get_data``. When omitted, the
        module-level ``root_dir`` is used, as before.
    """
    width = 128
    height = 32
    if root is None:
        # Backward-compatible fallback to the global set by the entry point.
        root = root_dir

    # get image paths and labels
    images_tot, labels_tot = get_data(root)
    # get total characters and maximum word length
    characters, max_word_len = get_vocabulary(labels_tot, islist=True)
    # Fix the character order so the id mapping is identical on every run;
    # otherwise stored weights would decode to different characters.
    vocabulary = sorted(characters)

    # convert character to number for model training
    char_to_num = keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=vocabulary,
        num_oov_indices=0,
        mask_token=None
        )

    # convert number to character for decode prediction.
    # num_oov_indices must match char_to_num: with the default single OOV slot
    # the inverse table starts at index 1 and every decoded character would be
    # shifted by one position.
    num_to_char = keras.layers.experimental.preprocessing.StringLookup(
        vocabulary=char_to_num.get_vocabulary(),
        num_oov_indices=0,
        mask_token=None,
        invert=True
        )

    # train-test split
    images_train, images_test, labels_train, labels_test = train_test_split(images_tot, labels_tot, test_size=0.5, random_state=1)

    # training model
    history = model_training(images_train, labels_train, width, height, vocabulary, char_to_num, max_word_len)

    if view_model_performance:
        # The history only tracks the CTC loss, so label the plot accordingly.
        loss = history.history['loss']
        val_loss = history.history['val_loss']
        epochs = range(1, len(loss) + 1)
        plt.plot(epochs, loss, 'b')
        plt.plot(epochs, val_loss, 'r')
        plt.title('Model Loss')
        plt.ylabel('Loss')
        plt.xlabel('Epoch')
        plt.legend(['Train', 'Val'], loc='upper left')
        plt.show()

    if view_pred_output:
        # prepare_data leaves out samples it cannot process. Preparing one
        # pair at a time keeps each retained input aligned with its label.
        test_inputs = []
        test_labels = []
        for path, label in zip(images_test, labels_test):
            xs, _ = prepare_data([path], [label], char_to_num, max_word_len, width=width, height=height)
            if xs:
                test_inputs.append(xs[0])
                test_labels.append(label)
        if not test_inputs:
            print("No test sample could be prepared; skipping the prediction preview.")
            return
        xdata = tf.stack(test_inputs)

        # Rebuild the network and keep only the image -> probabilities path.
        model = build_model(width, height, vocabulary)
        prediction_model = keras.models.Model(model.get_layer(name="image").input, model.get_layer(name="dense2").output)
        prediction_model.load_weights("best_model.h5")
        preds = prediction_model.predict(xdata)
        out_text = decode_batch_predictions(preds, max_word_len, num_to_char)

        # see the results
        for i, x in enumerate(out_text[0:10]):
            print("original_text =  ", test_labels[i])
            print("predicted text = ", x)
            # Inputs are stored as (width, height, 1): drop the channel axis
            # and transpose back to (height, width) for display.
            plt.imshow(test_inputs[i].numpy()[:, :, 0].T, cmap=plt.cm.gray)
            plt.show()
            print('\n')
    return

In [None]:
if __name__ == '__main__':
    # Dataset location. The literal below is a placeholder; set the
    # IAM_ROOT_DIR environment variable to point at the real dataset without
    # editing the notebook. The path is made absolute because the working
    # directory changes right after, which would break a relative path.
    root_dir = os.path.abspath(os.environ.get("IAM_ROOT_DIR", r"\...\Practical\IAM_Handwriting_Database"))
    if not os.path.isdir(root_dir):
        raise FileNotFoundError(
            "IAM dataset directory not found: {!r} (set IAM_ROOT_DIR)".format(root_dir)
        )
    os.chdir(root_dir) # save best model in this root_dir only
    main(view_model_performance=False, view_pred_output=False)