In [1]:
import tensorflow as tf
# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

In [2]:
import collections
import itertools
import os

from typing import List, Tuple, Union, NewType, OrderedDict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import sklearn
import sklearn.model_selection as model_selection
import tqdm

import ocr_input

%matplotlib inline
# Default size of every figure drawn below (matplotlib reads it as width, height in inches).
plt.rcParams["figure.figsize"] = (12, 9)


In [3]:
# Type vocabulary used by the annotations in this notebook.
#
# `NewType` is only valid over a concrete, subclassable class, so it is kept for
# the two scalar types (`Label`, `Token`). The composite types are plain aliases:
# static type checkers reject `NewType` over `Union[...]` or a generic alias.
InputText = Union[str, List[str]]
Label = NewType('Label', int)
DocumentRecord = Tuple[InputText, Label]  # one document: (text, label)
Dataset = List[DocumentRecord]

Token = NewType('Token', str)
Vocabulary = OrderedDict[Token, int]  # token -> index, in insertion order

# Document classes; `num_classes` is derived from the list so the two cannot drift apart.
class_names = ['email', 'form', 'handwritten', 'invoice', 'advertisement']
num_classes = len(class_names)

# Seed used as the default `random_seed` of `split_dataset`.
random_seed = 42

# Load the dataset

In [4]:
def get_dataset(dataset_path: str = r"F:\tobacco_dataset2\\ocr",
                labels_path: str = r"F:\tobacco_dataset2\labels.txt") -> Dataset:
    """Load every OCR document and pair it with its label.

    Args:
        dataset_path: directory holding one OCR file per document. Each file is
            read with ``ocr_input.parse_xml`` and keyed by its name without
            extension.
        labels_path: text file with one ``name,label`` pair per line. Blank
            lines are ignored.

    Returns:
        One ``(parsed OCR content, label)`` pair per name listed in the labels
        file, in the order the names first appear there.
        NOTE(review): labels are returned as the strings read from the file,
        although ``Label`` is declared over ``int`` -- confirm what callers expect.

    Raises:
        KeyError: a name in the labels file has no matching file in ``dataset_path``.
        ValueError: a non-blank line does not hold exactly two comma-separated fields.
    """
    all_files = os.listdir(dataset_path)

    # Parse each OCR file once, indexed by its extension-less name.
    doc_ocr_d = {}
    for file_name in tqdm.tqdm(all_files):
        doc_id = os.path.splitext(file_name)[0]
        doc_ocr_d[doc_id] = ocr_input.parse_xml(os.path.join(dataset_path, file_name))

    label_d = {}
    with open(labels_path, "r") as fp:
        for line in fp:
            if not line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline at end of file
            # Strip each field so the label does not carry the line's "\n".
            file, label = (field.strip() for field in line.split(','))
            label_d[file] = label

    return [(doc_ocr_d[file], label) for file, label in label_d.items()]

def split_dataset(dataset: Dataset, test_size: float = 0.2, random_seed: int = random_seed) -> Tuple[List[InputText], List[InputText], List[Label], List[Label]]:
    """Split the records into a train part and a test part.

    Args:
        dataset: records to split; each record is a sequence of fields
            (here ``(text, label)``).
        test_size: forwarded to ``train_test_split`` as ``test_size``.
        random_seed: forwarded to ``train_test_split`` as ``random_state``.

    Returns:
        Whatever ``model_selection.train_test_split`` returns for the
        per-field sequences; for two-field records that is
        ``x_train, x_test, y_train, y_test``.
    """
    # Transpose [(text, label), ...] into one sequence per field.
    columns = zip(*dataset)
    return model_selection.train_test_split(
        *columns,
        test_size=test_size,
        random_state=random_seed,
    )
    

# Load the whole corpus once, then split it with the defaults of `split_dataset`
# (test_size=0.2, random_seed=random_seed).
dataset = get_dataset()
x_train, x_test, y_train, y_test = split_dataset(dataset)

In [5]:
# Sanity check: total number of records, then the sizes of the train and test parts.
print(len(dataset))
print(len(x_train), len(x_test))


# Study the vocabulary

In [6]:
# Naive vocabulary counting: every document is split on whitespace and each
# resulting word is counted as-is (case-sensitive, punctuation kept).
vocabulary = collections.Counter(
    itertools.chain.from_iterable(map(str.split, x_train))
)

In [7]:
def plot_vocabulary(vocabulary, n=1000):
    """Plot the occurrence count of the most frequent tokens, by frequency rank.

    Args:
        vocabulary: a ``collections.Counter`` mapping token -> occurrence count.
        n: maximum number of tokens to plot; ``None`` plots every token.
    """
    # most_common() returns at most len(vocabulary) pairs, so the x axis is sized
    # from what actually came back: asking for more tokens than exist would
    # otherwise hand matplotlib x and y sequences of different lengths.
    counts = [count for _, count in vocabulary.most_common(n)]
    shown = len(counts)
    plt.plot(list(range(shown)), counts)

    plt.title(f"Évolution du nombre d'occurrence des {shown} tokens les plus fréquents")
    plt.show()

def plot_accumulated_vocabulary(vocabulary, n=1000):
    """Plot the cumulative share of the corpus covered by the most frequent tokens.

    The y value at rank ``k`` is the percentage of all token occurrences
    accounted for by the ``k + 1`` most frequent tokens.

    Args:
        vocabulary: a ``collections.Counter`` mapping token -> occurrence count.
        n: maximum number of tokens to plot; ``None`` plots every token.
    """
    # One percent of all occurrences, so that count / percent_unit is a percentage.
    percent_unit = sum(vocabulary.values()) / 100
    coverage = list(itertools.accumulate(
        count / percent_unit for _, count in vocabulary.most_common(n)
    ))
    # Size the x axis from the data: most_common() may return fewer than n pairs,
    # and matplotlib requires x and y to have the same length.
    shown = len(coverage)
    plt.plot(list(range(shown)), coverage)

    plt.title(f"Évolution du nombre d'occurrences cumulé des {shown} tokens les plus fréquents rapporté au nombre total de tokens")
    plt.show()

In [8]:
# Raw (case-sensitive) vocabulary: frequency by rank, cumulative coverage,
# then the number of distinct tokens.
plot_vocabulary(vocabulary, n=10000)
plot_accumulated_vocabulary(vocabulary, n=500000)
print(len(vocabulary))

## We must reduce the vocabulary size

In [9]:
# Same whitespace split as for `vocabulary`, but on lower-cased documents, so
# that case variants of a word are counted as a single token.
vocabulary_reduced = collections.Counter(
    itertools.chain.from_iterable(
        map(str.split, map(str.lower, x_train))
    )
)

# Same three views as for the raw vocabulary, for comparison.
plot_vocabulary(vocabulary_reduced, n=10000)
plot_accumulated_vocabulary(vocabulary_reduced, n=500000)
print(len(vocabulary_reduced))

In [10]:
# Conventions:
# - index 0 is reserved for unknown tokens, which are mapped to `__UNK__`;
# - other special tokens come right after it (e.g. `__NUM__` for numbers);
# - regular tokens are then inserted in order, so that a reverse dictionary
#   (index -> token) can be derived from the vocabulary.
__UNK__ = '__UNK__'

def unknown_wrapped(f):
    """Decorate a word -> token function with an optional vocabulary check.

    The returned callable has the signature ``(word, vocabulary=None)``. It
    computes ``f(word)`` and returns that token when no vocabulary is given or
    when the token is contained in ``vocabulary``; otherwise it returns
    ``__UNK__``.
    """
    def wrapped(word, vocabulary=None):
        token = f(word)
        # Without a vocabulary there is nothing to check the token against.
        known = vocabulary is None or token in vocabulary
        return token if known else __UNK__

    return wrapped

@unknown_wrapped
def word_to_token(word: str) -> Token:
    """Identity tokenizer: the token is the word itself, unchanged.

    Because of ``@unknown_wrapped``, the name ``word_to_token`` is bound to a
    callable ``(word, vocabulary=None)`` that returns ``__UNK__`` when a
    vocabulary is given and does not contain the token.
    """
    return word

@unknown_wrapped
def word_to_token_reduced(word: str) -> Token:
    """Case-insensitive tokenizer: the token is the lower-cased word.

    Because of ``@unknown_wrapped``, the name ``word_to_token_reduced`` is bound
    to a callable ``(word, vocabulary=None)`` that returns ``__UNK__`` when a
    vocabulary is given and does not contain the lower-cased token.
    """
    return word.lower()


def compute_vocabulary(input_text: List[InputText], *, max_size=1000, tokenize_f=word_to_token) -> Vocabulary:
    """Build a token -> index vocabulary from the most frequent tokens of ``input_text``.

    Every document is split on whitespace, each word is mapped through
    ``tokenize_f``, and the resulting tokens are ranked by decreasing frequency.

    Args:
        input_text: documents to count tokens in; each one must be a ``str``.
        max_size: total number of entries in the result, the reserved
            ``__UNK__`` entry included. ``None`` keeps every token.
        tokenize_f: callable mapping a word to its token. Falsy tokens
            (e.g. empty strings) are dropped.

    Returns:
        An ordered dict whose first entry is ``__UNK__`` -> 0, followed by the
        kept tokens in decreasing frequency with indices 1, 2, ...
    """
    # Count the documents that were passed in, not the notebook-level `x_train`.
    words = itertools.chain.from_iterable(map(str.split, input_text))
    word_occurences_d = collections.Counter(
        token for token in map(tokenize_f, words)
        # Drop empty tokens, and a literal `__UNK__` word: it already owns index 0
        # and ranking it would leave a hole in the index sequence.
        if token and token != __UNK__
    )

    # Number of special tokens placed before the regular ones (only `__UNK__` for now).
    no_special = 1
    limit = None if max_size is None else max_size - no_special

    vocabulary = collections.OrderedDict([(__UNK__, 0)])
    for index, (token, _) in enumerate(word_occurences_d.most_common(limit), no_special):
        vocabulary[token] = index

    return vocabulary


In [12]:
# Build the reduced (lower-cased) vocabulary in this cell, so that it no longer
# relies on a `voc` left over in the kernel from an earlier run.
voc = compute_vocabulary(x_train, max_size=10**4, tokenize_f=word_to_token_reduced)

# A token present in `voc` is printed as-is; any other is printed as `__UNK__`.
print(word_to_token_reduced('a', voc))
print(word_to_token_reduced('aaaaaaa', voc))

# print(*zip(vocabulary_reduced.most_common(10000), list(voc.items())[1:10001]), sep='\n')