In [1]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import scipy.io
import xml.etree.ElementTree as ET
import re
import random
from unidecode import unidecode
from PIL import Image
from tqdm.auto import tqdm
import uuid

# Seed every random number generator the notebook relies on so runs are reproducible.
np.random.seed(42)
tf.random.set_seed(42)
# The EMNIST word generator draws glyphs with the stdlib `random.choice`,
# which is not affected by the NumPy / TensorFlow seeds above.
random.seed(42)

#### Load data
Load IAM-Handwritten-Database words, sentences and lines

In [2]:
# Folder holding the IAM description files (<type>.txt) and the image sub-folders
iam_root_path = "./data/IAM-Handwritten-Database/"
# Accumulator of records shaped like {"image_path": ..., "label": ...}
iam_dataset = list()

def get_iam_handwritten_db_data(data_type, root_path=None):
    """Read one IAM description file and return the usable samples it lists.

    The file ``<root_path>/<data_type>.txt`` is read line by line. Comment
    lines (starting with ``#``), blank or truncated lines, and lines whose
    segmentation status is ``err`` are skipped.

    Image ids look like ``part1-part2-part3`` and the matching image is
    expected on disk at
    ``<root_path>/<data_type>/part1/part1-part2/part1-part2-part3.png``.
    Only images that exist and are larger than 0 bytes are kept.

    The label is the last space-separated field of the line. Vertical bars
    are turned into spaces, surrounding whitespace is stripped, and the space
    in front of a punctuation mark is removed when the mark is followed by
    whitespace or ends the label.

    Args:
        data_type: Name of the description file without extension, which is
            also the name of the image sub-folder (the notebook uses
            'words', 'lines' and 'sentences').
        root_path: Folder containing the description file and image folders.
            Defaults to the module-level ``iam_root_path``.

    Returns:
        A list of dicts shaped like ``{"image_path": str, "label": str}``.
    """
    if root_path is None:
        root_path = iam_root_path

    # The segmentation status is the 2nd field for word and line files and the
    # 3rd field otherwise. The original test compared against 'line' while the
    # notebook passes 'lines', so the wrong column was checked for line files;
    # 'line' is still accepted for backward compatibility.
    segmentation_result_idx = 1 if data_type in ('words', 'lines', 'line') else 2

    # '-' sits last in the class so it is a literal hyphen. Written as `'-;`
    # it formed a character range that also matched digits and parentheses,
    # gluing e.g. "in 5 days" into "in5 days". ':' is listed explicitly because
    # that accidental range used to cover it.
    space_before_punctuation = re.compile(r'\s([?.!",\';:/-](?:\s|$))')

    dataset = []
    with open(os.path.join(root_path, data_type + ".txt"), 'r') as iam_data_file:
        for line in iam_data_file:
            if line.startswith('#'):
                continue  # comment line
            splitted_line = line.split(' ')
            if len(splitted_line) <= segmentation_result_idx:
                continue  # blank or truncated line: no status field to inspect
            if splitted_line[segmentation_result_idx] == 'err':
                continue  # segmentation flagged as erroneous
            splitted_image_name = splitted_line[0].split('-')
            if len(splitted_image_name) < 2:
                continue  # id does not follow the part1-part2-... layout
            img_path = os.path.join(
                root_path,
                data_type,
                splitted_image_name[0],
                splitted_image_name[0] + '-' + splitted_image_name[1],
                splitted_line[0] + ".png"
            )
            # Only keep files that exist and are > 0 bytes
            if os.path.exists(img_path) and os.path.getsize(img_path) > 0:
                raw_label = splitted_line[-1].split('\n')[0].replace('|', ' ').strip()
                dataset.append({
                    "image_path": img_path,
                    "label": space_before_punctuation.sub(r'\1', raw_label)
                })
    return dataset

# Collect the sentence, line and word samples (in that order) into one list
for iam_data_type in ('sentences', 'lines', 'words'):
    iam_dataset = iam_dataset + get_iam_handwritten_db_data(iam_data_type)


Load IIIT5K-Word V3.0 dataset

In [3]:
# Folder holding the IIIT5K .mat description files; image paths are resolved against it
iit_5k_words_root = "./data/IIIT5K-Word_V3.0/IIIT5K/"
# Accumulator of records shaped like {"image_path": ..., "label": ...}
iit_5k_words_dataset = list()

def get_iit_5k_word_data(structure):
    """Turn the entries of an IIIT5K .mat structure into dataset records.

    For each entry, ``entry[0][0]`` is joined to ``iit_5k_words_root`` to get
    the image path and ``entry[1][0]`` is used as the label. Entries whose
    image is missing or is 0 bytes long are left out.

    Args:
        structure: Iterable of entries indexable as described above.

    Returns:
        A list of dicts shaped like ``{"image_path": ..., "label": ...}``.
    """
    records = []
    for entry in structure:
        candidate_path = os.path.join(iit_5k_words_root, entry[0][0])
        if not os.path.exists(candidate_path):
            continue  # image missing on disk
        if not os.path.getsize(candidate_path) > 0:
            continue  # empty file
        records.append({"image_path": candidate_path, "label": entry[1][0]})
    return records

# Load the train split, then the test split, and append both to iit_5k_words_dataset
for iit_mat_file, iit_mat_key in (('traindata.mat', 'traindata'), ('testdata.mat', 'testdata')):
    iit_structure = scipy.io.loadmat(iit_5k_words_root + iit_mat_file)[iit_mat_key][0]
    iit_5k_words_dataset = iit_5k_words_dataset + get_iit_5k_word_data(iit_structure)

Load ICDAR 2003 Robust Reading Competitions - Robust Word Recognition dataset

In [4]:
# Folder of the ICDAR 2003 word recognition sets. NOTE: the variable names say
# "2023" although the data is ICDAR 2003; they are kept because other cells use them.
icdar_2023_words_root = './data/ICDAR 2003 Robust Reading Competitions/Robust Word Recognition/'
# Accumulator of records shaped like {"image_path": ..., "label": ...}
icdar_2023_words_dataset = list()

def get_icdar_2003_words_data(xml_filepath):
    """Build dataset records from an ICDAR 2003 ``word.xml`` file.

    Every child element of the XML root contributes its ``file`` attribute
    (resolved against the folder containing the XML file) as image path and
    its ``tag`` attribute as label. Images that are missing or 0 bytes long
    are left out.

    Args:
        xml_filepath: Path to the XML description file.

    Returns:
        A list of dicts shaped like ``{"image_path": ..., "label": ...}``.
    """
    base_dir = os.path.dirname(xml_filepath)
    xml_root = ET.parse(xml_filepath).getroot()

    # Lazy pairs so each node is resolved and checked one at a time, in order
    located_nodes = (
        (os.path.join(base_dir, node.attrib["file"]), node)
        for node in xml_root
    )
    return [
        {"image_path": located_path, "label": node.attrib['tag']}
        for located_path, node in located_nodes
        if os.path.exists(located_path) and os.path.getsize(located_path) > 0
    ]
    

# Append the samples of each ICDAR 2003 word set, in this order
for icdar_set_name in ("Sample Set", "TrialTest Set", "TrialTrain Set"):
    icdar_word_xml = os.path.join(icdar_2023_words_root, icdar_set_name, "word.xml")
    icdar_2023_words_dataset = icdar_2023_words_dataset + get_icdar_2003_words_data(icdar_word_xml)

Generate EMNIST dataset

In [4]:
from emnist import extract_samples

# Character for each EMNIST label index: digits first, then uppercase, then lowercase letters
EMNIST_LABEL_TO_CHAR = list(
    '0123456789'
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    'abcdefghijklmnopqrstuvwxyz'
)

def get_emnist_sorted_by_label():
    """Group every EMNIST 'byclass' image (train + test) by its character.

    The train and test splits returned by ``extract_samples`` are
    concatenated, then the images are bucketed per label. Label indices are
    translated to characters through ``EMNIST_LABEL_TO_CHAR``.

    Returns:
        A dict shaped like ``{'a': [all 'a' images], 'b': [all 'b' images], ...}``
        whose keys appear in increasing label-index order and whose values
        are lists of image arrays.
    """
    emnist_train_images, emnist_train_labels = extract_samples('byclass', 'train')
    emnist_test_images, emnist_test_labels = extract_samples('byclass', 'test')
    images = np.concatenate((emnist_train_images, emnist_test_images))
    labels = np.concatenate((emnist_train_labels, emnist_test_labels))

    dataset = {}
    # np.unique returns the distinct labels already sorted. A boolean mask then
    # selects each class in one vectorized pass, instead of a Python-level scan
    # of every label for every class.
    for unique_label_idx in np.unique(labels):
        dataset[EMNIST_LABEL_TO_CHAR[unique_label_idx]] = list(images[labels == unique_label_idx])
    return dataset

def create_word_matrix(word, characters_bank):
    """Lay out one randomly chosen image per character, side by side.

    Each character takes a 28x28 tile. A space becomes an all-zero tile; any
    other character gets a tile drawn with ``random.choice`` from
    ``characters_bank[character]``.

    Args:
        word: Text to render.
        characters_bank: Mapping from a character to a sequence of 28x28 images.

    Returns:
        A NumPy array of shape ``(28, 28 * len(word))``.
    """
    tile = 28
    canvas = np.zeros((tile, len(word) * tile))
    for position, character in enumerate(word):
        if character == ' ':
            glyph = np.zeros((tile, tile))
        else:
            glyph = random.choice(characters_bank[character])
        left_edge = position * tile
        canvas[0:tile, left_edge:left_edge + tile] = glyph
    return canvas

def save_handwritten_words_on_disk(dictionnary, characters_bank, directory="./data/EMNIST-Handwritten-French-Words/", iterations=1):
    """Render every word as an image and save it, recording names in ``data.txt``.

    For each pass and each word, an image is built with ``create_word_matrix``,
    converted to mode 'L' and saved as ``<uuid1>.png`` inside ``directory``.
    A line ``"<file name> <word>"`` is appended to ``<directory>/data.txt``.

    Args:
        dictionnary: Iterable of words to render.
        characters_bank: Mapping from a character to its candidate images,
            passed through to ``create_word_matrix``.
        directory: Output folder; created when it does not exist.
        iterations: Number of passes over ``dictionnary``.
    """
    # exist_ok avoids the separate existence check
    os.makedirs(directory, exist_ok=True)

    # The context manager closes data.txt even when the (long) loop is
    # interrupted or an image fails to save, so written lines are not lost.
    with open(os.path.join(directory, "data.txt"), "a") as data_file:
        for _ in range(iterations):
            for word in tqdm(dictionnary):
                im = Image.fromarray(create_word_matrix(word, characters_bank)).convert('L')
                file_name = str(uuid.uuid1()) + ".png"
                im.save(os.path.join(directory, file_name))
                data_file.write(f"{file_name} {word}\n")

# Build the bank of EMNIST images grouped by character
emnist_sorted_by_label = get_emnist_sorted_by_label()
# Read the French dictionary file; the context manager closes it once read
with open('./data/dela-fr-public.txt') as french_dictionnary_file:
    french_dictionnary = french_dictionnary_file.readlines()
# Keep the text before the first comma of each line, trim surrounding
# whitespace (a line without a comma would otherwise keep its newline),
# and replace accented characters with unidecode
french_dictionnary = [unidecode(line.split(',')[0].strip()) for line in french_dictionnary]
# Keep only non-empty words made of ASCII letters and digits. fullmatch with
# '+' rejects the empty string and a trailing newline, both of which the
# previous '^...*$' pattern accepted.
french_dictionnary = [word for word in french_dictionnary if re.fullmatch('[a-zA-Z0-9]+', word)]
# Render and save one handwritten image per remaining word
save_handwritten_words_on_disk(french_dictionnary, emnist_sorted_by_label)

#1. Get all characters in a set
#2. Get a dict with this format : {'a': [all 'a' images], 'b': all 'b' images, ...}
#3. Find a French dictionary and, for each word:
#4. Create this word x times by concatenating character matrices into new ones
#5. Do the same but with sentences
#6. Save on hard drive since tensorflow will need to load on hard drive



  0%|          | 0/683824 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [103]:
# Merge the three loaded datasets. `iam_dataset` is the list built by the IAM
# loading cell; the name `iam_words_dataset` used here before is not defined in
# the visible notebook and fails on a fresh kernel.
dataset = iam_dataset + iit_5k_words_dataset + icdar_2023_words_dataset
print(dataset[0])
# Shuffle in place (reproducible thanks to the NumPy seed set in the first cell)
np.random.shuffle(dataset)
# print(dataset[:1000])
# len(iam_words_dataset) = 96454
# len(iit_5k_words_dataset) = 5000
# len(icdar_2023_words_dataset) = 1666
#

# subject_labels
# eoc_labels
# alphabet
# strokes
# eow_labels
# char_labels
# word_labels
# max
# min
# soc_labels
# mean
# texts
# std
# preprocessing
# sow_labels


#### !!!! CHECK WHETHER EMNIST IMAGES RESIZED WITH TF ARE STILL LEGIBLE

## split in 98:1:1
## shuffle


96454
5000
1666


NameError: name 'train_labels' is not defined