<a href="https://colab.research.google.com/github/clemsage/NeuralDocumentClassification/blob/master/skeleton_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a classifier on OCR text input


## Imports & Cloning repository



### Import Tensorflow v2


In [None]:
!pip install tensorflow_text

import tensorflow as tf
print(tf.__version__)

In [None]:
# Make sure a GPU is attached to this session; if this cell fails,
# switch the execution context (runtime type) to GPU and run it again.

gpu_name = tf.test.gpu_device_name()
if gpu_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(gpu_name))
else:
  raise SystemError('GPU device not found')

### Useful imports and cloning the repository


In [None]:
import os
import sys

# Show the Python version of the running kernel
print(sys.version)

In [None]:
# Clone the git repository

if not os.path.exists('NeuralDocumentClassification'):
  !git clone https://github.com/clemsage/NeuralDocumentClassification.git
else:
  !git -C NeuralDocumentClassification pull
sys.path.append('NeuralDocumentClassification')

In [None]:
# Lots of useful imports

# All of them are already installed on the colab session


# STD imports
import collections  # contains idiomatic data structures
import copy
import itertools    # provides efficient tools on iterators
import random
import re           # regexes

from functools import partial  # little helper for partially applying a function
from typing import List, Dict, Tuple, Union, NewType, TypeVar, Counter, Iterator, Callable  # statically typing for python

import matplotlib.pyplot as plt  # plotting tool
import nltk                      # natural language processing toolkit
import numpy as np               # main scientific linear algebra library in python (matrices)
import pandas as pd              # dataframes
import sklearn                   # machine learning & data mining library
import tensorflow as tf          # machine learning library
import tensorflow_text as tf_text# bonus text utilities for tensorflow
import tqdm                      # progression bar


from tensorflow import keras     # high level tensorflow API
from sklearn import metrics      # metrics for model performances

# Fetch the NLTK 'stopwords' resource
nltk.download('stopwords')

# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Default (width, height) used for every figure
plt.rcParams["figure.figsize"] = (12, 9)


### Defining some constants and types

In [None]:
# Type names used to describe this dataset

# Text of a document: one string, or a list of strings
InputText = NewType('InputText', Union[str, List[str]])
# Class of a document, as an integer
Label = NewType('Label', int)
# One document: its text together with its label
DocumentRecord = NewType('DocumentRecord', Tuple[InputText, Label])
# Lists of records, keyed by name
Dataset = NewType('Dataset', Dict[str, List[DocumentRecord]])

Token = NewType('Token', str)
# Mapping from a token to an integer
Vocabulary = NewType('Vocabulary', Dict[Token, int])


# Constants

# Raw label id (as a string) -> human readable class name.
# The insertion order fixes the position of each class in the two lists below.
_RAW_LABEL_TO_NAME = {
    '1': 'form',
    '2': 'email',
    '3': 'handwritten',
    '4': 'advertisement',
    '11': 'invoice',
}

CLASS_NAMES = list(_RAW_LABEL_TO_NAME.values())
CLASS_INDICES = list(_RAW_LABEL_TO_NAME.keys())
NUM_CLASSES = len(CLASS_NAMES)


## Load the dataset



In [None]:
# Some local scripts imports
import download_dataset  # dowloading from google drive
import ocr_input         # deals with reading dataset and xml parsing


# Download and extract every part of the dataset used in this notebook
for archive_name in ('label', 'ocr', 'dataset_assignment'):
  download_dataset.download_and_extract(archive_name)

# Directory the dataset files are read from in the rest of the notebook
dataset_path = 'dataset'

In [None]:
def get_dataset(clean_text_f: Callable = (lambda x: x)):
  """Load the OCR text and the label of every document, grouped by dataset assignment.

  Reads `label.txt` (one `file_id,label` pair per line) and `dataset_assignment.txt`
  (one `file_id,assignment` pair per line) from `dataset_path`, parses the OCR XML file
  of each assigned document with `ocr_input.parse_xml` and cleans it with `clean_text_f`.
  Blank lines in both files are ignored.

  Args:
    clean_text_f: function applied to the parsed text of each document.
      Defaults to the identity function (no cleaning).

  Returns:
    A `collections.defaultdict(list)` holding, for every assignment value found:
      - `"<assignment>_ocr"`: the list of cleaned document texts
      - `"<assignment>_lbl"`: the list of label positions in `CLASS_INDICES`,
        in the same order as the texts

  Raises:
    KeyError: if an assigned document has no entry in `label.txt`.
    ValueError: if a non-blank line does not split into exactly two comma-separated
      fields, or if a label is not listed in `CLASS_INDICES`.
  """
  labels = {}
  with open(os.path.join(dataset_path, "label.txt"), "r") as f:
    for line in f:
      line = line.strip()
      if not line:  # tolerate blank lines, e.g. at the end of the file
        continue
      file_id, lbl = line.split(",")
      labels[file_id] = lbl

  dataset = collections.defaultdict(list)
  with open(os.path.join(dataset_path, "dataset_assignment.txt"), "r") as f:
    for line in f:
      line = line.strip()
      if not line:
        continue
      file_id, assignment = line.split(",")
      ocr_path = os.path.join(dataset_path, "ocr", f"{file_id}.xml")

      text = clean_text_f(ocr_input.parse_xml(ocr_path))

      dataset[f"{assignment}_ocr"].append(text)
      # Labels are stored as their position in CLASS_INDICES (0 .. NUM_CLASSES - 1)
      dataset[f"{assignment}_lbl"].append(CLASS_INDICES.index(labels[file_id]))
  return dataset

# Load the dataset without any text cleaning (`get_dataset` defaults to the identity function)
dataset = get_dataset()

# Texts are stored under the "<assignment>_ocr" keys
print(f"Number of training documents: {len(dataset['training_ocr'])}")
print(f"Number of test documents: {len(dataset['test_ocr'])}")

## Study the vocabulary

In this part we will look at the data.

When dealing with text and words, the first thing to do is to look at those words.

In [None]:
# To access a specific element or range in a list, you can use bracket notation: 
# `my_list[0]` is the first element
# `my_list[10: 20]` is an array containing elements from index 10 (included) to 20 (excluded)

## Print some texts from the dataset and look at what the OCR system has read. ##


## Any remarks ? ##

In [None]:
#@title

# Use the function `print` to look at texts in the dataset (either `dataset["training_ocr"]` or `dataset["test_ocr"]`)
# To access a specific element or range in a list, you can use bracket notation: 
# `my_list[0]` is the first element
# `my_list[10: 20]` is an array containing elements from index 10 (included) to 20 (excluded)


# print some texts from the dataset and look at what the OCR system has read.
# Draw 5 training documents at random (with replacement) and print each of them
sampled_texts = random.choices(dataset["training_ocr"], k=5)
for x in sampled_texts:
  print(x)


# Any remarks ?

"""
Mostly not words, bunch of symbols. Very hard to understand.
"""

### Vocabulary

Let's find out what's in the texts and clean them up a bit!

In [None]:
# Some plotting functions to display the vocabulary

def plot_token_count(token_count, n=1000):
    """
    Plots the occurrences of the n most common tokens, on a log-scaled y axis.

    token_count: counter of tokens exposing `most_common` (e.g. `collections.Counter`)
    n: maximum number of tokens to plot; fewer points are drawn when the counter
       holds fewer than n distinct tokens
    """
    counts = [count for _, count in token_count.most_common(n)]
    # x is sized from the counts actually returned: `most_common(n)` yields fewer than
    # n items on a small vocabulary and `plt.plot` rejects x and y of different lengths.
    plt.plot(list(range(len(counts))), counts)

    plt.yscale("log")
    plt.xlabel("Token rank")
    plt.ylabel("Occurrences")
    plt.title(f"Evolution of occurences of the {n} most frequent tokens")
    plt.show()

def plot_accumulated_token_count(token_count, n=1000):
    """
    Plots the cumulated share (in percent of all token occurrences) covered by the
    n most common tokens.

    token_count: counter of tokens exposing `most_common` and `values` (e.g. `collections.Counter`)
    n: maximum number of tokens to plot; fewer points are drawn when the counter
       holds fewer than n distinct tokens
    """
    # One percent of the total number of occurrences: dividing a count by it gives a percentage
    one_percent = sum(token_count.values()) / 100
    shares = list(itertools.accumulate(count / one_percent for _, count in token_count.most_common(n)))
    # x is sized from the points actually computed: `most_common(n)` yields fewer than
    # n items on a small vocabulary and `plt.plot` rejects x and y of different lengths.
    plt.plot(list(range(len(shares))), shares)

    plt.xlabel("Token rank")
    plt.ylabel("Cumulated share of occurrences (%)")
    plt.title(f"Evolution of cumulated occurences of the {n} most frequent tokens divided by total number of tokens")
    plt.show()

In [None]:
## Use the `collections.Counter` class to count each word occurence

## What are the most common tokens ? ##

## Plot token occurences and cumulated token occurences. ##

## What percentage of the tokens is covered by a vocabulary of 1_000 tokens ? 10_000 ? 100_000 ? ##


In [None]:
#@title
# Count each word occurrence
# `collections.Counter` is the concrete counter class; the `Counter` name imported
# from `typing` is only an alias meant for type annotations.
word_count = collections.Counter()
for text in dataset["training_ocr"]:
  word_count.update(text.split())


# What are the most common tokens
print(word_count.most_common(100))

# Plot token occurrences and cumulated token occurrences.
plot_token_count(word_count, n=10000)
plot_accumulated_token_count(word_count, n=10000)

# What percentage of the tokens is covered by a vocabulary of 1_000 tokens ? 10_000 ? 100_000 ?
# The total does not depend on the vocabulary size, so it is computed once.
total_tokens = sum(word_count.values())
for size in [10**3, 10**4, 10**5]:
  covered_tokens = sum(count for _, count in word_count.most_common(size))
  print(f"With a vocabulary of size {size}, you cover {covered_tokens / total_tokens * 100:0.2f}% of the encountered tokens")

In [None]:
# If you want, you can clean up the texts according to your observations by removing stop words or special characters
# Define a function that takes one document text and cleans it and provide it to `get_dataset` function

STOP_WORD_S = set(nltk.corpus.stopwords.words('english'))


def clean_text(text):
    """
    Example of a cleaning function: removes the stop words.

    The text is lower-cased and split on whitespace; words found in `STOP_WORD_S`
    are dropped and the remaining ones are joined back with single spaces.
    """
    kept_words = (word for word in text.lower().split() if word not in STOP_WORD_S)
    return " ".join(kept_words)


# Reload the dataset, this time cleaning every document with `clean_text`
dataset = get_dataset(clean_text)

## Now we can build the dataset


In [None]:
# Number of hash buckets, i.e. number of distinct token indices
VOCABULARY_SIZE = 10**4
# Maximum number of tokens kept per document; longer documents are truncated
MAX_TOKENS = 400

tokenizer = tf_text.WhitespaceTokenizer()

def process_text(text, lbl):
    """
    Takes text and label, returns tokenized text and label.

    text: the text of one document
    lbl: the label of the document, returned unchanged

    The text is split with the whitespace tokenizer, every token is mapped to an
    index in [0, VOCABULARY_SIZE) and at most MAX_TOKENS indices are kept.
    """
    tokens = tokenizer.tokenize(text)

    # Assign the same index to each occurrence of a word by hashing it
    tokens = tf.strings.to_hash_bucket_fast(tokens, VOCABULARY_SIZE)
    return tokens[:MAX_TOKENS], lbl


In [None]:
# Training pipeline: (text, label) pairs are shuffled, then converted by `process_text`
train_txt_ds = (
    tf.data.Dataset
    .from_tensor_slices((dataset["training_ocr"], dataset["training_lbl"]))
    .shuffle(100000)
    .map(process_text)
)

# Test pipeline: same conversion, without any shuffling
test_txt_ds = (
    tf.data.Dataset
    .from_tensor_slices((dataset["test_ocr"], dataset["test_lbl"]))
    .map(process_text)
)

# Peek at one processed training example
first_train_example = next(iter(train_txt_ds))
print(first_train_example)

## Basic Model: Bag of Words



### Vectorizer

To implement a Bag of Words model, we first need to represent each document as a bag of words.
Each document must be represented by a vector of length `VOCABULARY_SIZE` counting the occurrences of each word.

In [None]:
# use `tf.math.bincount` to count word occurences.
# Can you find how to implement binary bag of words ?

In [None]:
#@title

def convert_to_bow(tokens, lbl):
    """
    Converts the token indices of one document into a vector of occurrence counts.

    tokens: token indices of the document
    lbl: the label of the document, returned unchanged

    The returned vector has at least VOCABULARY_SIZE entries.
    """
    token_ids = tf.cast(tokens, tf.int32)
    # Use param `binary_output=True` for binary counting
    return tf.math.bincount(token_ids, minlength=VOCABULARY_SIZE), lbl

# Turn each (tokens, label) pair into a (bag-of-words vector, label) pair
train_txt_bow_ds = train_txt_ds.map(convert_to_bow)
test_txt_bow_ds = test_txt_ds.map(convert_to_bow)

# Peek at one converted training example
print(next(iter(train_txt_bow_ds)))


### Model
We will now start building our model.

You can use any optimizer (`SGD`, `RMSProp`, …) but `Adam` is currently one of the best. It converges faster and to a better minimum than other optimizers most of the time.

We are doing a classification problem, use `sparse_categorical_crossentropy` as your loss and `sparse_categorical_accuracy` as your metric.

Feel free to try multiple numbers of hidden units, layers, activation functions, add new types of layers (see keras.layers for this: https://keras.io/layers/core/) …


In [None]:
## Create a Sequential model that takes a sentence vector in input (size=VOCABULARY_SIZE) and returns a vector of size NUM_CLASSES. ##
# Find help here: https://keras.io/models/sequential/
# and here: https://www.tensorflow.org/tutorials/keras/classification

# Create your model here and compile it.
# Exercise skeleton: add your layers to the list below.
# NOTE(review): as written, the layer list is empty, `optimizer` is None and no
# compile call is made — complete the three steps before running this cell.
model = keras.models.Sequential([

])

# Select an optimizer
optimizer = None

# Compile model with loss, optimizer and metrics

model.summary()

In [None]:
#@title
## Create a Sequential model that takes a sentence vector in input (size=VOCABULARY_SIZE) and returns a vector of size NUM_CLASSES. ##
# Find help here: https://keras.io/models/sequential/
# and here: https://www.tensorflow.org/tutorials/keras/classification

# Two hidden layers of 32 units followed by a softmax over the classes.
# The input shape is declared with an explicit `keras.Input` (one count per vocabulary
# entry) rather than the legacy `input_dim` argument of the first `Dense` layer.
model = keras.models.Sequential([
    keras.Input(shape=(VOCABULARY_SIZE,)),
    keras.layers.Dense(units=32, activation="relu"),
    keras.layers.Dense(units=32, activation="relu"),
    keras.layers.Dense(units=NUM_CLASSES, activation="softmax"),
])

optimizer = keras.optimizers.Adam(learning_rate=0.0001)

# Labels are integer class positions, hence the "sparse" loss and metric
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["sparse_categorical_accuracy"])

model.summary()

### Training
We are now ready to train our model !

In [None]:
batch_size = 128

# The Bag of Words run logs to its own sub-directory so that TensorBoard shows it as a
# separate run instead of mixing its curves with those of the other model trained in
# this notebook.
model.fit(train_txt_bow_ds.batch(batch_size),
          epochs=15,
          validation_data=test_txt_bow_ds.batch(8),
          callbacks=[tf.keras.callbacks.TensorBoard("logs/nlp/bow")],
          verbose=1)

### Evaluation
We can also evaluate our model on the test set.

In [None]:
# Loss and accuracy on the test set
loss, metric = model.evaluate(test_txt_bow_ds.batch(8), verbose=2)

# Predicted class = position of the highest output probability
bow_y_pred = model.predict(test_txt_bow_ds.batch(8)).argmax(-1)
# The test pipeline is not shuffled, so the labels come out in the same order as the predictions
bow_y_test = [lbl for _, lbl in test_txt_bow_ds.as_numpy_iterator()]

# `labels` forces a NUM_CLASSES x NUM_CLASSES matrix: without it, a class absent from
# both the labels and the predictions is dropped and the matrix no longer matches CLASS_NAMES.
bow_confusion = metrics.confusion_matrix(bow_y_test, bow_y_pred, labels=list(range(NUM_CLASSES)))
print(pd.DataFrame(bow_confusion, columns=CLASS_NAMES, index=CLASS_NAMES))

## A bit more complex: Recurrent Neural Networks and Long Short-Term Memory


### Model

In [None]:
## Like the Bag of Words model, implement a Sequential LSTM model  and compile it

# Size of the embedding vector of each token. It is defined in this cell so that
# the skeleton below does not raise a NameError when the notebook is run top to bottom.
EMBEDDING_SIZE = 128

# Exercise skeleton: add the recurrent and output layers after the embedding
model = keras.models.Sequential([
    tf.keras.Input(shape=(None,), dtype=tf.int64, ragged=True),
    keras.layers.Embedding(input_dim=VOCABULARY_SIZE, output_dim=EMBEDDING_SIZE)
])

# Select an optimizer, then compile the model with loss, optimizer and metrics
optimizer = None

model.summary()

In [None]:
#@title
## Like the Bag of Words model, implement a Sequential LSTM model  and compile it##

# Size of the embedding vector of each token
EMBEDDING_SIZE = 128

lstm_layers = [
    # Variable-length sequences of int64 token indices
    tf.keras.Input(shape=(None,), dtype=tf.int64, ragged=True),
    # Embeddings of tokens
    keras.layers.Embedding(input_dim=VOCABULARY_SIZE, output_dim=EMBEDDING_SIZE),
    # The sequence is read in both directions by an LSTM of 32 units
    keras.layers.Bidirectional(keras.layers.LSTM(32)),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(32, activation="relu"),
    # One output probability per class
    keras.layers.Dense(units=NUM_CLASSES, activation="softmax"),
]
model = keras.models.Sequential(lstm_layers)

optimizer = keras.optimizers.Adam(learning_rate=0.0001)

model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["sparse_categorical_accuracy"],
)

model.summary()

In [None]:
## Train your model ##
# Use `padded_batch` instead of `batch` because documents are not all the same length

In [None]:
#@title
## Train your model ##
batch_size = 128

# The LSTM run logs to its own sub-directory so that TensorBoard shows it as a separate
# run instead of mixing its curves with those of the other model trained in this notebook.
# `padded_batch` is used because documents do not all have the same number of tokens.
model.fit(train_txt_ds.padded_batch(batch_size),
          epochs=15,
          validation_data=test_txt_ds.padded_batch(8),
          callbacks=[tf.keras.callbacks.TensorBoard("logs/nlp/lstm")],
          verbose=1)

In [None]:
## Evaluate your new model. Is it better than Bag of Words ? CNN ? ##


In [None]:
#@title
## Evaluate your new model. Is it better than Bag of Words ? CNN ? ##

# Loss and accuracy on the test set
loss, metric = model.evaluate(test_txt_ds.padded_batch(8), verbose=2)

# Predicted class = position of the highest output probability
rnn_y_pred = model.predict(test_txt_ds.padded_batch(8)).argmax(-1)
# The test pipeline is not shuffled, so the labels come out in the same order as the predictions
rnn_y_test = [lbl for _, lbl in test_txt_ds.as_numpy_iterator()]

# `labels` forces a NUM_CLASSES x NUM_CLASSES matrix: without it, a class absent from
# both the labels and the predictions is dropped and the matrix no longer matches CLASS_NAMES.
rnn_confusion = metrics.confusion_matrix(rnn_y_test, rnn_y_pred, labels=list(range(NUM_CLASSES)))
print(pd.DataFrame(rnn_confusion, columns=CLASS_NAMES, index=CLASS_NAMES))

# If you have finished
Take a look at the HuggingFace transformers library and try using a BERT model: https://huggingface.co/docs/transformers/model_doc/bert
You should use the TensorFlow Text BERT tokenizer instead of the whitespace tokenizer: https://www.tensorflow.org/text/api_docs/python/text/BertTokenizer