<a href="https://colab.research.google.com/github/ericburdett/hwr/blob/master/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Simple HWR

Implementation of Gated Convolutional Recurrent Neural Network for Handwriting Recognition as recorded in [Bluche](http://ieeexplore.ieee.org/document/8270042/).

### Imports

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms, utils, datasets
from tqdm import tqdm
from torch.nn.parameter import Parameter
import pdb
import torchvision
import os
import gzip
import tarfile
import time
from PIL import Image, ImageOps
import gc
import pdb
import pandas as pd
import cv2
from PIL import Image
from matplotlib.pyplot import imshow
from google.colab import drive
drive.mount('/content/drive')
from IPython.core.ultratb import AutoFormattedTB
__ITB__ = AutoFormattedTB(mode = 'Verbose',color_scheme='LightBg', tb_offset = 1)

assert torch.cuda.is_available(), "Request a GPU from Runtime > Change Runtime"

In [0]:
!cp "drive/My Drive/datasets/iam.zip" "/content"
!unzip -q iam.zip
!rm iam.zip

### Dataset

In [0]:
class IamDataset(Dataset):
  def __init__(self, desired_size=(128, 512)):
    if not os.path.exists('/content/labels.csv'):
      raise Exception('Iam dataset does not exist in /content/labels.csv')

    self.desired_size = desired_size
    self.path = '/content/images/'
    self.df = pd.read_csv('/content/labels.csv', sep='\t', header=None, names=['word', 'seg', 'transcription'])
    self.df = self.df.drop(['seg'], axis=1)
    self.df = self.df.drop(self.df[self.df['transcription'] == '.'].index)
    self.df = self.df.drop(self.df[self.df['transcription'] == '!'].index)
    self.df = self.df.drop(self.df[self.df['transcription'] == ','].index)
    self.df = self.df.drop(self.df[self.df['transcription'] == ';'].index)
    self.df = self.df.drop(self.df[self.df['transcription'] == ')'].index)
    self.df = self.df.drop(self.df[self.df['transcription'] == '('].index)
    self.df = self.df.reset_index()
    
  def get_df(self):
    return self.df

  def tensor_image(self, path):
    img = Image.open(path + '.png')
    img = self.resize(img)
    x = transforms.functional.to_tensor(img)

    # Look into automatically resizing or adding padding to images
    # With a GAN, we will likely need all the input images to be the same size

    return x

  def resize(self, img):
    img_size = np.array(img).shape

    img_ratio = img_size[0] / img_size[1]
    desired_ratio = self.desired_size[0] / self.desired_size[1]

    if img_ratio >= desired_ratio:
      # Solve by height
      new_height = self.desired_size[0]
      new_width = int(self.desired_size[0] // img_ratio)
    else:
      new_height = int(self.desired_size[1] * img_ratio)
      new_width = self.desired_size[1]
      # Solve by width

    img = np.array(img.resize((new_width, new_height)))

    border_top = self.desired_size[0] - new_height
    border_right = self.desired_size[1] - new_width

    border_img = cv2.copyMakeBorder(
        img,
        top=border_top,
        bottom=0,
        left=0,
        right=border_right,
        borderType=cv2.BORDER_CONSTANT,
        value=[255]
    )

    return border_img

  def __getitem__(self, index):
    img = self.tensor_image('images/' + self.df['word'][index])

    return img, self.df['transcription'][index]

  def __len__(self):
    return len(self.df)

### Classes

In [0]:
class Encoder(nn.Module):
  def __init__(self, dim=128, layers=4):
    super(Encoder, self).__init__()

    self.layers = layers
    self.dim = dim

    # input_size, hidden_size, num_layers, bidirectional=True
    self.gru = nn.GRU(1, dim, layers, bidirectional=True, batch_first=True)
    self.fc = nn.Linear(256, 16) # used to match up shapes when given to generator

    self.hidden_init_state = torch.randn((8, 1, dim), requires_grad=True)
    self.embedding = nn.Embedding(128, 1)

  def getRepresentation(words):
    charlists = []
    zeros = np.zeros(20)

    if type(words) == str:
      charlist = [ord(c) for c in words]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

      return torch.tensor(charlists).long().cuda()

    for word in words:
      charlist = [ord(c) for c in word]
      charlist = np.concatenate((charlist, zeros))
      charlists.append(charlist[:16])

    return torch.tensor(charlists).long().unsqueeze(2).cuda()

  # input => (tuple of strings)
  def getEmbedding(words, embedding):
    rep = Encoder.getRepresentation(words)
    
    return embedding(rep)

  def init_hidden(self, batch):
    return torch.zeros(self.layers * 2, batch, self.dim).cuda()

  def weight_init(self, mean, std):
      for m in self._modules:
          normal_init(self._modules[m], mean, std)
    
  # input -> embedding
  def forward(self, embedding):
    hidden_state = self.init_hidden(embedding.shape[0])

    out, _ = self.gru(embedding.float(), hidden_state)
    out = self.fc(out)
    out = out.view(-1, 1, 1, 256)

    return out

In [0]:
class Recognizer(nn.Module):
  def __init__(self):
    super(Recognizer, self).__init__()

    self.hidden_size = 256
    self.num_layers = 1
    self.max_length = 20

    # Encoder
    self.conv1 = nn.Conv2d(1, 8, 3, 1, 1)
    self.tanh1 = nn.Tanh()
    self.conv2 = nn.Conv2d(8, 16, 3, 1, 1) # 4x2
    self.tanh2 = nn.Tanh()
    self.conv3 = nn.Conv2d(16, 32, 3, 1, 1)
    self.tanh3 = nn.Tanh()
    self.conv4 = nn.Conv2d(32, 64, 3, 1, 1) # 4x2
    self.tanh4 = nn.Tanh()
    self.conv5 = nn.Conv2d(64, 128, 3, 1, 1)
    self.tanh5 = nn.Tanh()

    self.gate1 = nn.Sequential(nn.Conv2d(16, 16, 3, 1, 1), nn.Sigmoid()) 
    self.gate2 = nn.Sequential(nn.Conv2d(32, 32, 3, 1, 1), nn.Sigmoid())
    self.gate3 = nn.Sequential(nn.Conv2d(64, 64, 3, 1, 1), nn.Sigmoid())

    # Maxpool
    self.mp = nn.MaxPool2d((32, 1))
    # reshape out.view(-1, 512, 1)

    # Decoder
    self.gru1 = nn.GRU(128, self.hidden_size, self.num_layers, bidirectional=True, batch_first=True)
    self.fc1 = nn.Linear(self.hidden_size * 2, 128)
    self.gru2 = nn.GRU(128, self.hidden_size, self.num_layers, bidirectional=True, batch_first=True)
    self.fc2 = nn.Linear(self.hidden_size * 2, 16)
    self.softmax = nn.LogSoftmax(dim=1)

  def init_hidden(self, batch_size):
    return torch.zeros(self.num_layers * 2, batch_size, self.hidden_size)

  def weight_init(self, mean, std):
    for m in self._modules:
        normal_init(self._modules[m], mean, std)

  # Input -> (Batch, 1, 128, 256)
  def forward(self, x):
    # Encoder
    out = self.conv1(x)
    out = self.tanh1(out)
    out = self.conv2(out)
    out = self.tanh2(out)

    gate1 = self.gate1(out)
    out = out * gate1

    out = self.conv3(out)
    out = self.tanh3(out)

    gate2 = self.gate2(out)
    out = out * gate2

    out = self.conv4(out)
    out = self.tanh4(out)

    gate3 = self.gate3(out)
    out = out * gate3

    out = self.conv5(out)
    out = self.tanh5(out)

    # Max Pooling across vertical dimension
    out = self.mp(out)

    # Decoder
    out = out.view(-1, 128, 128)

    out, _ = self.gru1(out)
    out = self.fc1(out)
    out, _ = self.gru2(out)
    out = self.fc2(out)
    out = self.softmax(out)

    # Change shape so that we can pass directly to CTC-Loss
    out = out.permute(2, 0, 1)

    return out

### Training

In [1]:
# In future, potentially apply gradient balancing

def train():
  try:
    EPOCHS = 200
    BATCH_SIZE = 250
    VALIDATION_EVERY

    dataset = IamDataset(desired_size=(32, 128))
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [int(.8 * len(dataset)) + 1, int(.2 * len(dataset))])

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

    recognizer = Recognizer().cuda()
    optimizer = optim.Adam(recognizer.parameters(), lr=4e-4, betas=(0, .999))
    objective = nn.CTCLoss(blank=0, reduction='none', zero_infinity=True)

    losses = []

    for epoch in range(EPOCHS):
      loop = tqdm(total=len(data_loader), position=0, leave=False)

      for batch_num, (real_imgs, words) in enumerate(data_loader):
        batch_size = real_imgs.shape[0]

        real_imgs, labels = real_imgs.cuda(), Encoder.getRepresentation(words).cuda()

        rec_optimizer.zero_grad()

        real_word_labels = labels.squeeze().long()
        input_lengths = torch.full((batch_size,), 16, dtype=torch.long) # (BATCH_SIZE) -> Sequence_Length
        target_lengths = word_lengths_tensor(words) # (BATCH_SIZE) -> Word lengths

        real_preds = recognizer(real_imgs)

        loss = rec_objective(real_preds, real_word_labels, input_lengths, target_lengths)
        loss = torch.mean(rec_loss)

        loss.backward()
        optimizer.step()

        losses.append(rec_loss.item())

        if batch_num % VALIDATION_EVERY == 0:
          # Same code but for validation set

        loop.set_description('Epoch: {}, Generator Loss: {:.4f}, Discriminator Loss: {:.4f}, Recognizer Loss: {:.4f}, AvgGenerator Loss: {:.4f}, AvgDiscriminator Loss: {:.4f}, AvgRecognizer Loss: {:.4f}'.format(
            epoch, gen_loss.item(), disc_loss.item(), rec_loss.item(), np.mean(g_losses), np.mean(d_losses), np.mean(r_losses)))
        loop.update(1)

      loop.close()
    
  except:
    __ITB__()
    gc.collect()
  finally:
    return generator, recognizer, discriminator, g_losses, d_losses, r_losses

IndentationError: ignored

### Results