<a href="https://colab.research.google.com/github/ericburdett/hwr/blob/master/notebook-tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple HWR - TensorFlow
Implementation of Gated Convolutional Recurrent Neural Network for Handwriting Recognition as recorded in [Bluche](http://ieeexplore.ieee.org/document/8270042/).

In [1]:
try:
  %tensorflow_version 2.x
except Exception:
  pass

TensorFlow 2.x selected.


In [55]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras import Model
import pandas as pd
import numpy as np
import os
import cv2
import tqdm
from PIL import Image
from google.colab import drive
from IPython.core.ultratb import AutoFormattedTB

__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!cp "drive/My Drive/datasets/iam.zip" "/content"
!unzip -q iam.zip
!rm iam.zip

In [0]:
class Encoder():
  # input => (tuple of strings)
  def get_representation(words):
    charlists = []
    zeros = np.zeros(20)

    if type(words) == str:
      charlist = [ord(c) for c in words]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

      return np.array(charlists)

    for word in words:
      charlist = [ord(c) for c in word]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

    return np.array(charlists)

In [0]:
def resize_img(img, desired_size):
  img_size = np.array(img).shape

  img_ratio = img_size[0] / img_size[1]
  desired_ratio = desired_size[0] / desired_size[1]

  if img_ratio >= desired_ratio:
    # Solve by height
    new_height = desired_size[0]
    new_width = int(desired_size[0] // img_ratio)
  else:
    new_height = int(desired_size[1] * img_ratio)
    new_width = desired_size[1]
    # Solve by width

  img = np.array(img.resize((new_width, new_height)))

  border_top = desired_size[0] - new_height
  border_right = desired_size[1] - new_width

  border_img = cv2.copyMakeBorder(
      img,
      top=border_top,
      bottom=0,
      left=0,
      right=border_right,
      borderType=cv2.BORDER_CONSTANT,
      value=[255]
  )

  return border_img

def tensor_image(path, desired_size):
  img = Image.open(path + '.png')
  img = resize(img, desired_size)
  x = np.array(img)

  return x

def iam_generator(desired_size=(128, 32), path='/content/labels.csv'):
  if not os.path.exists(path):
    raise Exception('Iam dataset does not exist in ' + path)

  df = pd.read_csv(path, sep='\t', header=None, names=['word', 'seg', 'transcription'])
  df = df.drop(['seg'], axis=1)
  df = df.drop(df[df['transcription'] == '.'].index)
  df = df.drop(df[df['transcription'] == '!'].index)
  df = df.drop(df[df['transcription'] == ','].index)
  df = df.drop(df[df['transcription'] == ';'].index)
  df = df.drop(df[df['transcription'] == ':'].index)    
  df = df.drop(df[df['transcription'] == ')'].index)
  df = df.drop(df[df['transcription'] == '('].index)
  df = df.reset_index()

  for index, row in df.iterrows():
    path = 'images/' + row['word'] + '.png'
    img = Image.open(path)
    img = resize_img(img, desired_size)
    x = tf.convert_to_tensor(np.array(img))
    y = tf.convert_to_tensor(Encoder.get_representation(row['transcription']))

    yield(x, y)

In [0]:
class Recognizer(Model):
  def __init__(self):
    super(Recognizer, self).__init__()
    
    # Encoder
    self.conv1 = L.Conv2D(8, 3, strides=1, padding='same', activation='tanh')
    self.conv2 = L.Conv2D(16, 3, strides=1, padding='same', activation='tanh')
    self.conv3 = L.Conv2D(32, 3, strides=1, padding='same', activation='tanh')
    self.conv4 = L.Conv2D(64, 3, strides=1, padding='same', activation='tanh')
    self.conv5 = L.Conv2D(128, 3, strides=1, padding='same', activation='tanh')

    self.gate1 = L.Conv2D(16, 3, strides=1, padding='same', activation='sigmoid')
    self.gate2 = L.Conv2D(32, 3, strides=1, padding='same', activation='sigmoid')
    self.gate3 = L.Conv2D(64, 3, strides=1, padding='same', activation='sigmoid')

    # MaxPool
    self.mp = L.MaxPool2D((32, 1))

    # Decoder
    self.gru1 = L.Bidirectional(L.GRU(256))
    self.fc1 = L.Dense(128)
    self.gru2 = L.Bidirectional(L.GRU(256))
    self.fc2 = L.Dense(16)
    self.softmax = L.Softmax(axis=1)
    self.permute = L.Permute((3, 1, 2))

  def call(self, x):
    # Encoder
    out = self.conv1(x)
    out = self.conv2(x)

    g1 = self.gate1(out)
    out = out * g1

    out = self.conv3(out)
    
    g2 = self.gate2(out)
    out = out * g2

    out = self.conv4(out)

    g3 = self.gate3(out)
    out = out * g3

    out = self.conv5(out)

    # Max Pooling across vertical dimension
    out = self.mp(out)

    # Decoder
    out = tf.reshape(out, [-1, 128, 128])

    out = self.gru1(out)
    out = self.fc1(out)
    out = self.gru2(out)
    out = self.fc2(out)
    out = self.softmax(out)
    out = self.permute(out)

    return out

In [0]:
def seq_lengths(tensors):
  lengths = []

  tf.reshape(tensors, (-1, 16))

  for tensor in tensors:
    count = 0
    for val in tensor:
      if val != 0:
        count += 1
      else:
        break
    
    lengths.append(count)

  return lengths

In [0]:
@tf.function
def train_step(images, labels):
  with tf.GradientTape() as tape:
    batch_size = images.shape[0]
    input_lengths = tf.convert_to_tensor(np.full((batch_size,), 16))
    label_lengths = tf.convert_to_tensor(seq_lengths(labels))
    predictions = model(images)
    loss = tf.nn.ctc_loss(labels, predictions, label_lengths, input_lengths)
  
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))

  train_loss(loss)

In [60]:
EPOCHS = 1
BATCH_SIZE = 250

dataset = tf.data.Dataset.from_generator(
    iam_generator,
    (tf.int64, tf.int64),
    (tf.TensorShape([None, None]), tf.TensorShape([None, 16]))
)

train_dataset = dataset.batch(BATCH_SIZE)

model = Recognizer()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
objective = tf.nn.ctc_loss

train_loss = tf.keras.metrics.Mean(name='train_loss')

for epoch in range(EPOCHS):
  
  train_loss.reset_states()

  for step, (images, labels) in enumerate(train_dataset):
    train_step(images, labels)
  
  print('Epoch: {}, Loss: {}'.format(epoch, train_loss.result()))

InaccessibleTensorError: ignored

In [0]:
t = tf.convert_to_tensor(tf.constant([[1, 2, 3]]))
tf.reshape(t, -1)

In [0]:
BATCH_SIZE = 1

train_dataset = dataset.batch(BATCH_SIZE)
for step, (x, y) in enumerate(train_dataset):
  print(x.shape)
  print(y.shape)

  break

(1, 128, 32)
(1, 1, 16)
