<a href="https://colab.research.google.com/github/ericburdett/hwr/blob/master/notebook-tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple HWR - TensorFlow
Implementation of Gated Convolutional Recurrent Neural Network for Handwriting Recognition as recorded in [Bluche](http://ieeexplore.ieee.org/document/8270042/).

In [2]:
try:
  %tensorflow_version 2.x
except Exception:
  pass

TensorFlow 2.x selected.


In [3]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras import Model
import pandas as pd
import numpy as np
import os
import cv2
import tqdm
from PIL import Image
from google.colab import drive
from IPython.core.ultratb import AutoFormattedTB

__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
!cp "drive/My Drive/datasets/iam.zip" "/content"
!unzip -q iam.zip
!rm iam.zip

  (attempting to process anyway)
error [iam.zip]:  reported length of central directory is
  -76 bytes too long (Atari STZip zipfile?  J.H.Holm ZIPSPLIT 1.1
  zipfile?).  Compensating...
error:  expected central file header signature not found (file #95170).
  (please check that you have transferred or created the zipfile in the
  appropriate BINARY mode and that you have compiled UnZip properly)


In [0]:
class Encoder():
  # input => (tuple of strings)
  def get_representation(words):
    charlists = []
    zeros = np.zeros(20)

    if type(words) == str:
      charlist = [ord(c) for c in words]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

      return np.array(charlists)

    for word in words:
      charlist = [ord(c) for c in word]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

    return np.array(charlists)

In [0]:
def resize_img(img, desired_size):
  img_size = np.array(img).shape

  img_ratio = img_size[0] / img_size[1]
  desired_ratio = desired_size[0] / desired_size[1]

  if img_ratio >= desired_ratio:
    # Solve by height
    new_height = desired_size[0]
    new_width = int(desired_size[0] // img_ratio)
  else:
    new_height = int(desired_size[1] * img_ratio)
    new_width = desired_size[1]
    # Solve by width

  img = np.array(img.resize((new_width, new_height)))

  border_top = desired_size[0] - new_height
  border_right = desired_size[1] - new_width

  border_img = cv2.copyMakeBorder(
      img,
      top=border_top,
      bottom=0,
      left=0,
      right=border_right,
      borderType=cv2.BORDER_CONSTANT,
      value=[255]
  )

  return border_img

def tensor_image(path, desired_size):
  img = Image.open(path + '.png')
  img = resize(img, desired_size)
  x = np.array(img)

  return x

def iam_generator(desired_size=(128, 32), path='/content/labels.csv'):
  if not os.path.exists(path):
    raise Exception('Iam dataset does not exist in ' + path)

  df = pd.read_csv(path, sep='\t', header=None, names=['word', 'seg', 'transcription'])
  df = df.drop(['seg'], axis=1)
  df = df.drop(df[df['transcription'] == '.'].index)
  df = df.drop(df[df['transcription'] == '!'].index)
  df = df.drop(df[df['transcription'] == ','].index)
  df = df.drop(df[df['transcription'] == ';'].index)
  df = df.drop(df[df['transcription'] == ':'].index)    
  df = df.drop(df[df['transcription'] == ')'].index)
  df = df.drop(df[df['transcription'] == '('].index)
  df = df.reset_index()

  for index, row in df.iterrows():
    path = 'images/' + row['word'] + '.png'
    img = Image.open(path)
    img = resize_img(img, desired_size)
    x = tf.expand_dims(tf.convert_to_tensor(np.array(img), dtype=tf.float32), 2)
    y = tf.convert_to_tensor(Encoder.get_representation(row['transcription']), dtype=tf.int32)

    yield(x, y)

In [0]:
class Recognizer(Model):
  def __init__(self):
    super(Recognizer, self).__init__()
    
    # Encoder
    self.conv1 = L.Conv2D(8, 3, strides=1, padding='same', activation='tanh')
    self.conv2 = L.Conv2D(16, 3, strides=1, padding='same', activation='tanh')
    self.conv3 = L.Conv2D(32, 3, strides=1, padding='same', activation='tanh')
    self.conv4 = L.Conv2D(64, 3, strides=1, padding='same', activation='tanh')
    self.conv5 = L.Conv2D(128, 3, strides=1, padding='same', activation='tanh')

    self.gate1 = L.Conv2D(16, 3, strides=1, padding='same', activation='sigmoid')
    self.gate2 = L.Conv2D(32, 3, strides=1, padding='same', activation='sigmoid')
    self.gate3 = L.Conv2D(64, 3, strides=1, padding='same', activation='sigmoid')

    # MaxPool
    self.mp = L.MaxPool2D((32, 1))

    # Decoder
    self.gru1 = L.Bidirectional(L.GRU(256, return_sequences=True))
    self.fc1 = L.Dense(128)
    self.gru2 = L.Bidirectional(L.GRU(256, return_sequences=True))
    self.fc2 = L.Dense(16)
    self.permute = L.Permute((2, 1))

  def call(self, x):
    # Encoder
    out = self.conv1(x)
    out = self.conv2(x)

    g1 = self.gate1(out)
    out = out * g1

    out = self.conv3(out)
    
    g2 = self.gate2(out)
    out = out * g2

    out = self.conv4(out)

    g3 = self.gate3(out)
    out = out * g3

    out = self.conv5(out)

    # Max Pooling across vertical dimension
    out = self.mp(out)

    # Decoder
    out = tf.reshape(out, [-1, 128, 128])

    out = self.gru1(out)
    out = self.fc1(out)
    out = self.gru2(out)
    out = self.fc2(out)
    out = self.permute(out)

    return out

In [0]:
@tf.function
def train_step(images, labels):
  with tf.GradientTape() as tape:
    batch_size = images.shape[0]
    input_lengths = tf.constant(np.full((batch_size,), 16))
    label_lengths = tf.squeeze(tf.math.count_nonzero(labels, axis=2))
    predictions = model(images)
    labels = tf.squeeze(labels, axis=1)

    loss = tf.nn.ctc_loss(labels, predictions, label_lengths, input_lengths, logits_time_major=False,)
    loss = tf.reduce_mean(loss)
  
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  train_loss(loss)

In [118]:
tf.nn.softmax(tf.constant([-5.1231, -2.32342]))

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.05734148, 0.94265854], dtype=float32)>

In [186]:
try:
  EPOCHS = 1
  BATCH_SIZE = 250

  dataset = tf.data.Dataset.from_generator(
      iam_generator,
      (tf.float32, tf.int32),
      (tf.TensorShape([None, None, 1]), tf.TensorShape([None, 16]))
  )
  train_dataset = dataset.batch(BATCH_SIZE)

  model = Recognizer()
  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
  train_loss = tf.keras.metrics.Mean(name='train_loss')

  for epoch in range(EPOCHS):
    train_loss.reset_states()

    for images, labels in train_dataset:
      train_step(images, labels)
      print('Epoch: {}, Loss: {}'.format(epoch, train_loss.result()))
      
except:
  __ITB__()

Epoch: 0, Loss: 66.80127716064453
Epoch: 0, Loss: 65.98078918457031
Epoch: 0, Loss: 65.01065826416016
Epoch: 0, Loss: 62.84225845336914
Epoch: 0, Loss: 59.46714401245117
Epoch: 0, Loss: 55.66038131713867
Epoch: 0, Loss: 51.98318099975586
Epoch: 0, Loss: 49.18561553955078
Epoch: 0, Loss: 46.48262023925781
Epoch: 0, Loss: 44.731666564941406
Epoch: 0, Loss: 42.81177520751953
Epoch: 0, Loss: 41.11843490600586
Epoch: 0, Loss: 39.54366683959961
Epoch: 0, Loss: 38.109642028808594
Epoch: 0, Loss: 37.044803619384766
Epoch: 0, Loss: 36.07862854003906
Epoch: 0, Loss: 35.237876892089844
Epoch: 0, Loss: 34.39948272705078
Epoch: 0, Loss: 33.644737243652344
Epoch: 0, Loss: 32.93911361694336
Epoch: 0, Loss: 32.421871185302734
Epoch: 0, Loss: 31.873327255249023
[0;31m---------------------------------------------------------------------------[0m
[0;31mKeyboardInterrupt[0m                         Traceback (most recent call last)
[0;32m/usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ea

### CTC-Loss Example

In [182]:
labels = tf.constant(np.random.randint(low=0, high=128, size=(1, 16)))
preds = tf.constant(np.random.randn(1, 16, 128), dtype=tf.float32)
llengths = tf.constant(np.random.randint(low=0, high=16, size=(1)))
ilengths = tf.constant(np.full((1,), 16))

print('labels: ', labels.shape)
print(labels)
print('preds: ', preds.shape)
print(preds)
print('input lengths: ', ilengths.shape)
print(ilengths)
print('label lengths: ', llengths.shape)
print(llengths)

tf.reduce_mean(tf.nn.ctc_loss(labels, preds, llengths, ilengths, logits_time_major=False, blank_index=0))

labels:  (1, 16)
tf.Tensor([[ 27  49  61  99  80  61  47  25  19  35  25  43 107  89 104  16]], shape=(1, 16), dtype=int64)
preds:  (1, 16, 128)
tf.Tensor(
[[[ 1.3211424   0.36390853 -0.5032348  ...  0.5346142   0.15908684
    0.6808094 ]
  [ 0.10130436 -1.150955   -0.03122388 ...  0.3131712   0.22086355
   -0.67402965]
  [ 0.47346625 -1.4858168  -1.1933379  ... -1.0516319  -1.0400614
    0.26930356]
  ...
  [-0.4674597   1.482517   -1.3243313  ...  0.05548872  0.812202
   -2.0924296 ]
  [ 0.30128506 -0.6138613  -1.7958736  ... -1.7538375   0.4280902
    0.02597057]
  [ 0.36920932 -0.29998782 -1.7132615  ...  0.6641271  -0.35159066
   -1.1128763 ]]], shape=(1, 16, 128), dtype=float32)
input lengths:  (1,)
tf.Tensor([16], shape=(1,), dtype=int64)
label lengths:  (1,)
tf.Tensor([15], shape=(1,), dtype=int64)


<tf.Tensor: shape=(), dtype=float32, numpy=84.38543>

In [157]:
-np.log(0.99)

0.01005033585350145