<a href="https://colab.research.google.com/github/ericburdett/hwr/blob/master/notebook-tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple HWR - TensorFlow
Implementation of Gated Convolutional Recurrent Neural Network for Handwriting Recognition as recorded in [Bluche](http://ieeexplore.ieee.org/document/8270042/).

In [2]:
try:
  %tensorflow_version 2.x
except Exception:
  pass

TensorFlow 2.x selected.


In [212]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow.keras.layers as L
from tensorflow.keras import Model
import pandas as pd
import numpy as np
import os
import cv2
from tqdm import tqdm
from PIL import Image
from google.colab import drive
from IPython.core.ultratb import AutoFormattedTB

__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!cp "drive/My Drive/datasets/iam.zip" "/content"
!unzip -q iam.zip
!rm iam.zip

  (attempting to process anyway)
error [iam.zip]:  reported length of central directory is
  -76 bytes too long (Atari STZip zipfile?  J.H.Holm ZIPSPLIT 1.1
  zipfile?).  Compensating...
error:  expected central file header signature not found (file #95170).
  (please check that you have transferred or created the zipfile in the
  appropriate BINARY mode and that you have compiled UnZip properly)


In [0]:
class Encoder():
  # input => (tuple of strings)
  def get_representation(words):
    charlists = []
    zeros = np.zeros(20)

    if type(words) == str:
      charlist = [ord(c) for c in words]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

      return np.array(charlists)

    for word in words:
      charlist = [ord(c) for c in word]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

    return np.array(charlists)

  def get_strings(tensor):
    words = []

    for indices in tensor:
      word = ''

      for index in indices:
        if index.item() != 0:
          letter = chr(index.item())
          word += letter
      
      words.append(word)

In [0]:
def resize_img(img, desired_size):
  img_size = np.array(img).shape

  img_ratio = img_size[0] / img_size[1]
  desired_ratio = desired_size[0] / desired_size[1]

  if img_ratio >= desired_ratio:
    # Solve by height
    new_height = desired_size[0]
    new_width = int(desired_size[0] // img_ratio)
  else:
    new_height = int(desired_size[1] * img_ratio)
    new_width = desired_size[1]
    # Solve by width

  img = np.array(img.resize((new_width, new_height)))

  border_top = desired_size[0] - new_height
  border_right = desired_size[1] - new_width

  border_img = cv2.copyMakeBorder(
      img,
      top=border_top,
      bottom=0,
      left=0,
      right=border_right,
      borderType=cv2.BORDER_CONSTANT,
      value=[255]
  )

  return border_img

def tensor_image(path, desired_size):
  img = Image.open(path + '.png')
  img = resize(img, desired_size)
  x = np.array(img)

  return x

def get_dataset_df(path='/content/labels.csv'):
  if not os.path.exists(path):
    raise Exception('Iam dataset does not exist in ' + path)

  df = pd.read_csv(path, sep='\t', header=None, names=['word', 'seg', 'transcription'])
  df = df.drop(['seg'], axis=1)
  df = df.drop(df[df['transcription'] == '.'].index)
  df = df.drop(df[df['transcription'] == '!'].index)
  df = df.drop(df[df['transcription'] == ','].index)
  df = df.drop(df[df['transcription'] == ';'].index)
  df = df.drop(df[df['transcription'] == ':'].index)    
  df = df.drop(df[df['transcription'] == ')'].index)
  df = df.drop(df[df['transcription'] == '('].index)
  df = df.reset_index()

  return df

def iam_generator(desired_size=(128, 32)):
  df = get_dataset_df()

  for index, row in df.iterrows():
    path = 'images/' + row['word'] + '.png'
    img = Image.open(path)
    img = resize_img(img, desired_size)
    x = tf.expand_dims(tf.convert_to_tensor(np.array(img), dtype=tf.float32), 2)
    y = tf.convert_to_tensor(Encoder.get_representation(row['transcription']), dtype=tf.int32)

    yield(x, y)

In [200]:
len(get_dataset_df())

86018

In [0]:
class Recognizer(Model):
  def __init__(self):
    super(Recognizer, self).__init__()
    
    # Encoder
    self.conv1 = L.Conv2D(8, 3, strides=1, padding='same', activation='tanh')
    self.conv2 = L.Conv2D(16, 3, strides=1, padding='same', activation='tanh')
    self.conv3 = L.Conv2D(32, 3, strides=1, padding='same', activation='tanh')
    self.conv4 = L.Conv2D(64, 3, strides=1, padding='same', activation='tanh')
    self.conv5 = L.Conv2D(128, 3, strides=1, padding='same', activation='tanh')

    self.gate1 = L.Conv2D(16, 3, strides=1, padding='same', activation='sigmoid')
    self.gate2 = L.Conv2D(32, 3, strides=1, padding='same', activation='sigmoid')
    self.gate3 = L.Conv2D(64, 3, strides=1, padding='same', activation='sigmoid')

    # MaxPool
    self.mp = L.MaxPool2D((32, 1))

    # Decoder
    self.gru1 = L.Bidirectional(L.GRU(256, return_sequences=True))
    self.fc1 = L.Dense(128)
    self.gru2 = L.Bidirectional(L.GRU(256, return_sequences=True))
    self.fc2 = L.Dense(16)
    self.permute = L.Permute((2, 1))

  def call(self, x):
    # Encoder
    out = self.conv1(x)
    out = self.conv2(x)

    g1 = self.gate1(out)
    out = out * g1

    out = self.conv3(out)
    
    g2 = self.gate2(out)
    out = out * g2

    out = self.conv4(out)

    g3 = self.gate3(out)
    out = out * g3

    out = self.conv5(out)

    # Max Pooling across vertical dimension
    out = self.mp(out)

    # Decoder
    out = tf.reshape(out, [-1, 128, 128])

    out = self.gru1(out)
    out = self.fc1(out)
    out = self.gru2(out)
    out = self.fc2(out)
    out = self.permute(out)

    return out

In [218]:
r = Recognizer()
r(tf.constant(np.random.randn(250, 32, 128, 1))).shape



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



TensorShape([250, 16, 128])

In [0]:
def show_imgs(x, new_fig=True):
    grid = utils.make_grid(x.detach().cpu(), nrow=8, normalize=True, pad_value=0.3)
    grid = grid.transpose(0,2).transpose(0,1) # channels as last dimension
    if new_fig:
        plt.figure()
    plt.imshow(grid.numpy())

In [0]:
@tf.function
def train_step(iteration_num, images, labels):
  with tf.GradientTape() as tape:
    batch_size = images.shape[0]
    input_lengths = tf.constant(np.full((batch_size,), 16))
    label_lengths = tf.squeeze(tf.math.count_nonzero(labels, axis=2))
    predictions = model(images)
    labels = tf.squeeze(labels, axis=1)

    loss = tf.nn.ctc_loss(labels, predictions, label_lengths, input_lengths, logits_time_major=False)
    loss = tf.reduce_mean(loss)
  
  gradients = tape.gradient(loss, model.trainable_variables)
  optimizer.apply_gradients(zip(gradients, model.trainable_variables))
  train_loss(loss)

  if iteration_num % ITERATION_SAMPLE == 0:
    print('here')
    decoded, _ = tf.nn.ctc_greedy_decoder(tf.reshape(preds, (16, -1, 128)), ilengths)
    print('after decoder')
    preds_strings = convert_to_strings_with_decoder(decoded)
    target_strings = convert_to_strings(labels)

    print('Predicted: ', preds_strings[:3])
    print('Target: ', target_strings[:3])
    show_imgs(images[:3])

In [329]:
# try:
  EPOCHS = 1
  BATCH_SIZE = 250
  ITERATION_SAMPLE = 100
  
  num_iterations = len(get_dataset_df()) // BATCH_SIZE
  dataset = tf.data.Dataset.from_generator(
      iam_generator,
      (tf.float32, tf.int32),
      (tf.TensorShape([None, None, 1]), tf.TensorShape([None, 16]))
  )
  train_dataset = dataset.batch(BATCH_SIZE)

  model = Recognizer()
  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
  train_loss = tf.keras.metrics.Mean(name='train_loss')

  for epoch in range(EPOCHS):
    train_loss.reset_states()
    loop = tqdm(total=num_iterations, position=0, leave=False)

    for iteration_num, (images, labels) in enumerate(train_dataset):
      train_step(iteration_num, images, labels)
      loop.set_description('Epoch: {}, Loss: {}'.format(epoch, train_loss.result()))
      loop.update(1)
    
    loop.close()
      
# except:
#   __ITB__()

  0%|          | 0/344 [00:00<?, ?it/s]

here
after decoder


OperatorNotAllowedInGraphError: ignored

In [230]:
train_loss.

[]

### CTC-Loss Example

In [0]:
def decode(index_num):
  return chr(index_num)

In [307]:
vectors = tf.constant(np.array([[97, 98, 99, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [65, 66, 67, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]))
convert_to_strings(vectors)

['abcd', 'ABCD']

In [0]:
@tf.function
def convert_to_strings_with_decoder(decoded):
  values = decoded[0].values
  indices = decoded[0].indices

  words = []

  current_index = 0
  current_word = ''

  for index, value in zip(indices, values):
    if index[0] == current_index:
      if value != 0:
        current_word += decode(value)
    else:
      current_index = index[0]
      words.append(current_word)
      if value != 0:
        current_word = decode(value)
      else:
        current_word = ''
  
  words.append(current_word)
  
  return words

def convert_to_strings(vectors):
  vectors = vectors.numpy()
  words = []

  for vector in vectors:
    word = ''

    for char in vector:
      if char != 0:
        word += decode(char)
    
    words.append(word)
  
  return words

In [299]:
labels = tf.constant(np.random.randint(low=0, high=128, size=(2, 16)))
preds = tf.constant(np.random.randn(2, 16, 128), dtype=tf.float32)
llengths = tf.constant(np.random.randint(low=0, high=16, size=(2)), dtype=tf.int32)
ilengths = tf.constant(np.full((2,), 16), dtype=tf.int32)

# print('labels: ', labels.shape)
# print('preds: ', preds.shape)
# print(preds)
# print('input lengths: ', ilengths.shape)
# print('label lengths: ', llengths.shape)
decoded, neg_sum_logits = tf.nn.ctc_greedy_decoder(tf.reshape(preds, (16, -1, 128)), ilengths)
print('decode: ', decoded[0])
print('ctc-loss: ', tf.reduce_mean(tf.nn.ctc_loss(labels, preds, llengths, ilengths, logits_time_major=False, blank_index=0)))

decode:  SparseTensor(indices=tf.Tensor(
[[ 0  0]
 [ 0  1]
 [ 0  2]
 [ 0  3]
 [ 0  4]
 [ 0  5]
 [ 0  6]
 [ 0  7]
 [ 0  8]
 [ 0  9]
 [ 0 10]
 [ 0 11]
 [ 0 12]
 [ 0 13]
 [ 0 14]
 [ 1  0]
 [ 1  1]
 [ 1  2]
 [ 1  3]
 [ 1  4]
 [ 1  5]
 [ 1  6]
 [ 1  7]
 [ 1  8]
 [ 1  9]
 [ 1 10]
 [ 1 11]
 [ 1 12]
 [ 1 13]
 [ 1 14]
 [ 1 15]], shape=(31, 2), dtype=int64), values=tf.Tensor(
[101 111  50  70 105 100 111  92  39  84 122 105  18  30  38  96  32  39
  66  12 112  74  44  79  97  73  87  85   0  84   9], shape=(31,), dtype=int64), dense_shape=tf.Tensor([ 2 16], shape=(2,), dtype=int64))
ctc-loss:  tf.Tensor(78.46625, shape=(), dtype=float32)


In [317]:
convert_to_strings_with_decoder(decoded)

["eo2Fido\\'Tzi\x12\x1e&", "` 'B\x0cpJ,OaIWUT\t"]