<a href="https://colab.research.google.com/github/divy-arun-mav/handwriting-Recognition/blob/main/handwritingRecognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# collecting data
!wget -q https://git.io/J0fjL -O IAM_Words.zip
!unzip -qq IAM_Words.zip
!
!mkdir data
!mkdir data/words
!tar -xf IAM_Words/words.tgz -C data/words
!mv IAM_Words/words.txt data

In [3]:
!head -20 data/words.txt

#--- words.txt ---------------------------------------------------------------#
#
# iam database word information
#
# format: a01-000u-00-00 ok 154 1 408 768 27 51 AT A
#
#     a01-000u-00-00  -> word id for line 00 in form a01-000u
#     ok              -> result of word segmentation
#                            ok: word was correctly
#                            er: segmentation of word can be bad
#
#     154             -> graylevel to binarize the line containing this word
#     1               -> number of components for this word
#     408 768 27 51   -> bounding box around this word in x,y,w,h format
#     AT              -> the grammatical tag for this word, see the
#                        file tagset.txt for an explanation
#     A               -> the transcription for this word
#
a01-000u-00-00 ok 154 408 768 27 51 AT A
a01-000u-00-01 ok 154 507 766 213 48 NN MOVE


In [5]:
from tensorflow.keras.layers import StringLookup
from tensorflow import keras
import tensorflow as tf
import os
import matplotlib.pyplot as plt
import numpy as np

# Seed NumPy's and TensorFlow's global RNGs with the same value so that
# random operations are repeatable between runs.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [7]:
base_path = "data"
words_list = []

# Read the annotation file through a context manager so the handle is closed
# as soon as the lines are in memory (a bare open(...).readlines() leaves the
# file open until garbage collection).
with open(f"{base_path}/words.txt", "r") as words_file:
    words = words_file.readlines()

for line in words:
    # Header / documentation lines of words.txt start with "#".
    if line.startswith("#"):
        continue
    fields = line.split(" ")
    # Blank or truncated lines have no status field; skip them rather than
    # failing with IndexError.
    if len(fields) < 2:
        continue
    # The second field is the segmentation status; drop entries marked "err".
    if fields[1] != "err":
        # Keep the raw line (trailing newline included) for later parsing.
        words_list.append(line)

# Shuffle in place using NumPy's global RNG.
np.random.shuffle(words_list)

In [8]:
# Show the first 20 shuffled annotation lines.
print(words_list[:20])

['e04-030-04-08 ok 170 1489 1499 120 39 JJ sure\n', 'k02-102-05-03 ok 182 836 1623 69 52 PP3A he\n', 'a01-082u-01-04 ok 172 1582 1043 234 88 IN during\n', 'm01-000-07-00 ok 196 339 1998 75 107 INO of\n', 'g01-031-07-06 ok 152 1912 2038 167 59 NN booty\n', 'f07-081b-01-06 ok 168 1366 924 350 88 NN gastronomy\n', 'n03-082-04-03 ok 165 992 1414 118 135 NN boy\n', 'g06-018c-04-05 ok 182 1298 1438 96 58 ATI The\n', 'g06-011j-06-06 ok 182 1222 1785 146 95 CC and\n', 'f04-024-01-06 ok 183 1104 981 60 70 IN in\n', 'g06-050k-00-07 ok 156 1842 717 85 76 PP$ his\n', 'm01-100-01-06 ok 185 1459 880 177 131 JJ flying\n', 'n02-033-04-01 ok 149 940 1486 353 85 VBD presented\n', 'p03-047-00-01 ok 191 393 739 127 80 UH No\n', 'g06-011e-08-00 ok 154 386 2190 70 44 CS as\n', 'p02-000-00-02 ok 182 752 727 97 85 AP few\n', 'c01-009-08-03 ok 182 1387 2363 170 81 NN work\n', 'c04-156-01-04 ok 175 989 912 65 142 INO of\n', 'h02-004-09-01 ok 191 776 2424 191 76 NNS items\n', 'g06-037o-03-02 ok 188 850 1274 214 

In [9]:
# First 90% of the shuffled lines are for training; the remaining lines are
# cut at their midpoint into validation and test.
split_idx = int(0.9 * len(words_list))
train_samples, holdout_samples = words_list[:split_idx], words_list[split_idx:]

val_split_idx = int(0.5 * len(holdout_samples))
validation_samples = holdout_samples[:val_split_idx]
test_samples = holdout_samples[val_split_idx:]

# Sanity check: the three splits together account for every line.
assert len(words_list) == sum(
    map(len, (train_samples, validation_samples, test_samples))
)

print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

Total training samples: 86810
Total validation samples: 4823
Total test samples: 4823


In [11]:
# Root directory of the word images: the "words" folder under base_path.
base_image_path = os.path.join(base_path, "words")

def get_image_paths_and_labels(samples, image_root=None):
    """Map annotation lines to image paths, keeping only usable images.

    Args:
        samples: Iterable of raw annotation lines. The first space-separated
            field of each line is the word id (e.g. ``a01-000u-00-00``), from
            which the image location is derived as
            ``<root>/<a01>/<a01-000u>/<a01-000u-00-00>.png``.
        image_root: Directory that contains the image tree. When ``None``
            (the default) the module-level ``base_image_path`` is used.

    Returns:
        A ``(paths, corrected_samples)`` tuple of parallel lists: the image
        path and the annotation line (text before its first newline) for every
        sample whose image file exists and is not empty.
    """
    root = base_image_path if image_root is None else image_root
    paths = []
    corrected_samples = []

    for file_line in samples:
        image_name = file_line.strip().split(" ")[0]
        id_parts = image_name.split("-")
        if len(id_parts) < 2:
            # Blank or malformed line: there is no form id to build a path from.
            continue
        partI, partII = id_parts[0], id_parts[1]

        img_path = os.path.join(
            root, partI, partI + "-" + partII, image_name + ".png"
        )

        try:
            img_size = os.path.getsize(img_path)
        except OSError:
            # Missing or inaccessible image: skip the sample instead of
            # aborting the whole scan.
            continue

        # Zero-byte files are treated as unusable and dropped.
        if img_size:
            paths.append(img_path)
            corrected_samples.append(file_line.split("\n")[0])

    return paths, corrected_samples

# Resolve image paths and annotation lines for each split, in the order
# train -> validation -> test.
(
    (train_img_paths, train_labels),
    (validation_img_paths, validation_labels),
    (test_img_paths, test_labels),
) = (
    get_image_paths_and_labels(split_samples)
    for split_samples in (train_samples, validation_samples, test_samples)
)

In [12]:
# First 20 resolved training image paths.
train_img_paths[:20]

['data/words/e04/e04-030/e04-030-04-08.png',
 'data/words/k02/k02-102/k02-102-05-03.png',
 'data/words/a01/a01-082u/a01-082u-01-04.png',
 'data/words/m01/m01-000/m01-000-07-00.png',
 'data/words/g01/g01-031/g01-031-07-06.png',
 'data/words/f07/f07-081b/f07-081b-01-06.png',
 'data/words/n03/n03-082/n03-082-04-03.png',
 'data/words/g06/g06-018c/g06-018c-04-05.png',
 'data/words/g06/g06-011j/g06-011j-06-06.png',
 'data/words/f04/f04-024/f04-024-01-06.png',
 'data/words/g06/g06-050k/g06-050k-00-07.png',
 'data/words/m01/m01-100/m01-100-01-06.png',
 'data/words/n02/n02-033/n02-033-04-01.png',
 'data/words/p03/p03-047/p03-047-00-01.png',
 'data/words/g06/g06-011e/g06-011e-08-00.png',
 'data/words/p02/p02-000/p02-000-00-02.png',
 'data/words/c01/c01-009/c01-009-08-03.png',
 'data/words/c04/c04-156/c04-156-01-04.png',
 'data/words/h02/h02-004/h02-004-09-01.png',
 'data/words/g06/g06-037o/g06-037o-03-02.png']

In [13]:
# First 20 training annotation lines (still the full, unparsed lines).
train_labels[:20]

['e04-030-04-08 ok 170 1489 1499 120 39 JJ sure',
 'k02-102-05-03 ok 182 836 1623 69 52 PP3A he',
 'a01-082u-01-04 ok 172 1582 1043 234 88 IN during',
 'm01-000-07-00 ok 196 339 1998 75 107 INO of',
 'g01-031-07-06 ok 152 1912 2038 167 59 NN booty',
 'f07-081b-01-06 ok 168 1366 924 350 88 NN gastronomy',
 'n03-082-04-03 ok 165 992 1414 118 135 NN boy',
 'g06-018c-04-05 ok 182 1298 1438 96 58 ATI The',
 'g06-011j-06-06 ok 182 1222 1785 146 95 CC and',
 'f04-024-01-06 ok 183 1104 981 60 70 IN in',
 'g06-050k-00-07 ok 156 1842 717 85 76 PP$ his',
 'm01-100-01-06 ok 185 1459 880 177 131 JJ flying',
 'n02-033-04-01 ok 149 940 1486 353 85 VBD presented',
 'p03-047-00-01 ok 191 393 739 127 80 UH No',
 'g06-011e-08-00 ok 154 386 2190 70 44 CS as',
 'p02-000-00-02 ok 182 752 727 97 85 AP few',
 'c01-009-08-03 ok 182 1387 2363 170 81 NN work',
 'c04-156-01-04 ok 175 989 912 65 142 INO of',
 'h02-004-09-01 ok 191 776 2424 191 76 NNS items',
 'g06-037o-03-02 ok 188 850 1274 214 75 VBD took']

In [16]:
# The transcription is the last space-separated field of each annotation line.
train_labels_cleaned = [entry.split(" ")[-1].strip() for entry in train_labels]

# Every distinct character seen in the training transcriptions.
characters = {symbol for word in train_labels_cleaned for symbol in word}

# Length of the longest training transcription (0 when there are none).
max_len = max(map(len, train_labels_cleaned), default=0)

print("Maximum Length: ",max_len)
print("Vocab Size: ",len(characters))
train_labels_cleaned[:20]

Maximum Length:  21
Vocab Size:  78


['sure',
 'he',
 'during',
 'of',
 'booty',
 'gastronomy',
 'boy',
 'The',
 'and',
 'in',
 'his',
 'flying',
 'presented',
 'No',
 'as',
 'few',
 'work',
 'of',
 'items',
 'took']

In [17]:
def clean_labels(labels):
    """Return the transcription of each annotation line.

    The transcription is the text after the last single space in the line,
    with surrounding whitespace stripped.

    Args:
        labels: Iterable of annotation lines.

    Returns:
        List of transcriptions, in the same order as ``labels``.
    """
    return [entry.split(" ")[-1].strip() for entry in labels]

# Extract the transcriptions for the validation and test splits.
validation_labels_cleaned, test_labels_cleaned = map(
    clean_labels, (validation_labels, test_labels)
)

In [18]:
AUTOTUNE = tf.data.AUTOTUNE

# Build the vocabulary from the *sorted* character set. Iterating a set of
# strings yields a different order from one interpreter session to the next
# (string hash randomization), so list(characters) would give each character a
# different integer index after every kernel restart. Sorting makes the
# mapping reproducible.
# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=sorted(characters), mask_token=None)
# Mapping integers back to characters, using the forward layer's vocabulary
# so both directions agree.
num_to_char = StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True)