<a href="https://colab.research.google.com/github/clemsage/NeuralDocumentClassification/blob/master/skeleton_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training a classifier on OCR text input


## Imports & Cloning repository



### Import Tensorflow v2


In [1]:
!pip install tensorflow-gpu==2.0
import tensorflow as tf
print(tf.__version__)

Collecting tensorflow-gpu==2.0
[?25l  Downloading https://files.pythonhosted.org/packages/25/44/47f0722aea081697143fbcf5d2aa60d1aee4aaacb5869aee2b568974777b/tensorflow_gpu-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (380.8MB)
[K     |████████████████████████████████| 380.8MB 43kB/s 
Collecting tensorboard<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/d3/9e/a48cd34dd7b672ffc227b566f7d16d63c62c58b542d54efa45848c395dd4/tensorboard-2.0.1-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 59.4MB/s 
Collecting tensorflow-estimator<2.1.0,>=2.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/fc/08/8b927337b7019c374719145d1dceba21a8bb909b93b1ad6f8fb7d22c1ca1/tensorflow_estimator-2.0.1-py2.py3-none-any.whl (449kB)
[K     |████████████████████████████████| 450kB 52.2MB/s 
Collecting google-auth<2,>=1.6.3
[?25l  Downloading https://files.pythonhosted.org/packages/7b/cb/786dc53d93494784935a62947643b48250b84a882474e714f9af5e1a1928/go

2.0.0


In [3]:
# Check your devices, if it fails change your execution context to GPU

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


### Usefull imports and clone repo


In [4]:
import os
import sys

print(sys.version)

3.6.8 (default, Oct  7 2019, 12:59:55) 
[GCC 8.3.0]


In [5]:
# Clone the git repository

if not os.path.exists('NeuralDocumentClassification'):
  !git clone https://github.com/clemsage/NeuralDocumentClassification.git
else:
  !git -C NeuralDocumentClassification pull
sys.path.append('NeuralDocumentClassification')

Cloning into 'NeuralDocumentClassification'...
remote: Enumerating objects: 140, done.[K
remote: Counting objects: 100% (140/140), done.[K
remote: Compressing objects: 100% (110/110), done.[K
remote: Total 140 (delta 70), reused 78 (delta 28), pack-reused 0[K
Receiving objects: 100% (140/140), 307.29 KiB | 592.00 KiB/s, done.
Resolving deltas: 100% (70/70), done.


In [58]:
# Lot of usefull imports

# All of them are already installed on the colab session


# STD imports
import collections  # contains idiomatic data structures
import itertools    # provides efficient tools on iterators
import re           # regexes

from functools import partial  # little helper for partially applying a function
from typing import List, Dict, Tuple, Union, NewType, TypeVar, Counter, Iterator  # statically typing for python

import matplotlib.pyplot as plt  # plotting tool
import nltk                      # natural language processing toolkit
import numpy as np               # main scientific linear algebra library in python (matrices)
import pandas as pd              # dataframes
import sklearn                   # machine learning & data mining library
import tqdm                      # progression bar

from tensorflow import keras     # high level tensorflow API

nltk.download('stopwords')

%matplotlib inline
plt.rcParams["figure.figsize"] = (12, 9)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Defining some constants and types

In [0]:
# Some usefull types for this dataset

InputText = NewType('InputText', Union[str, List[str]])
Label = NewType('Label', int)
DocumentRecord = NewType('DocumentRecord', Tuple[InputText, Label])
Dataset = NewType('Dataset', Dict[str, List[DocumentRecord]])

Token = NewType('Token', str)
Vocabulary = NewType('Vocabulary', Dict[Token, int])


# Constants

CLASS_NAMES = ['form', 'email', 'handwritten', 'advertisement', 'invoice']
CLASS_INDICES = ['1', '2', '3', '4', '11']
NUM_CLASSES = len(CLASS_NAMES)

STOP_WORD_S = set(nltk.corpus.stopwords.words('english'))

## Load the dataset

In [8]:
# Some local scripts imports
import download_dataset  # dowloading from google drive
import ocr_input         # deals with reading dataset and xml parsing


for elt in ['label', 'ocr', 'dataset_assignment']:
  download_dataset.download_and_extract(elt)
dataset_path = 'dataset'

Downloading ./dataset/label.txt


194kB [00:00, 38.8MB/s]


Downloading ./tmp/ocr.zip


25.5MB [00:00, 35.0MB/s]


Unzipping ./tmp/ocr.zip to dataset/ocr…
Downloading ./dataset/dataset_assignment.txt


289kB [00:00, 45.8MB/s]


In [11]:
def get_dataset() -> Dataset:
  """
  Parse all data xml files and make pairs withe their label.
  Splits the records into training and test datasets.
  """

  # Parsing xml files into doc_ocr_d
  all_files = os.listdir(os.path.join(dataset_path, "ocr"))
  doc_ocr_d = {file: content for file, content in tqdm.tqdm(zip(map(lambda f: os.path.splitext(f)[0], all_files), 
                                                                map(ocr_input.parse_xml, map(lambda p: os.path.join(dataset_path, "ocr", p), all_files))),
                                                            total=len(all_files))}
  
  # Fetching labels into label_d
  with open(os.path.join(dataset_path, "label.txt"), "r") as fp:
      label_d = {file: CLASS_INDICES.index(label.strip()) for file, label in map(lambda line: line.split(','), fp.readlines())}

  # Fetching assignments into dataset_splits
  dataset_split = {"training": [], "test": []}
  with open(os.path.join(dataset_path, 'dataset_assignment.txt'), 'r') as f:
    for line in f.readlines():
      line = line.strip()
      file_id, assignment = line.split(',')
      dataset_split[assignment].append(file_id)

  return {split_name: [(doc_ocr_d[file], label_d[file]) for file in file_split_l] for split_name, file_split_l in dataset_split.items()}

dataset = get_dataset()

print(f"Number of training documents: {len(dataset['training'])}")
print(f"Number of test documents: {len(dataset['test'])}")

x_train, y_train = zip(*dataset["training"])
x_test, y_test = zip(*dataset["test"])

100%|██████████| 16275/16275 [00:02<00:00, 6456.47it/s]

Number of training documents: 12952
Number of test documents: 3323
LLZS TSLLS cz ~>i®o-z~   2





## Study the vocabulary

In this part we will look at the data.

When dealing with text and words, the first thing to do is looking at those words.

In [0]:
# To access a specific element or range in a list, you can use bracket notation: 
# `my_list[0]` is the first element
# `my_list[10: 20]` is an array containing elements from index 10 (included) to 20 (excluded)

## Print some texts from the dataset and look at what the OCR system has read. ##


## Any remarks ? ##

In [27]:
#@title

# Use the function `print` to look at texts in the datasets (either x_train or x_test)
# To access a scpecific element or range in a list, you can use bracket notation: 
# `my_list[0]` is the first element
# `my_list[10: 20]` is an array containing elements from index 10 (included) to 20 (excluded)


# print some texts from the dataset and look at what the OCR system has read.
for i in range(10):
  print(i, x_train[i])

# Any remarks ?

"""
Mostly not words, bunch of symbols. Very hard to understand.
"""

0 -4~&4 9~j~ 6~ 6/ 90 ~a 33 39 /o 3i-3y 3b yl vo - yo 30 3~6 t 6y 59 60 70 A~, al-34 95 a~ 35 XAuE CoLLEGE -t- 91 18 4e 39 6/ aW 51440 9402 67 3.-A J	56	-7S 36	yy	30 61	S7 70 qa 0 6/ cia ii I6 7 /a /o 1a~~ 9, . aa as iy aS a~ r-L, ~57 S9 63 -5 6 5 / Sa CR)Lp IN 014 50 .50 iib 5~ous~ poi 6MaKE2 a9 30 30 s4 .57 .5a ~9 .59 Sy S 1 .5 / y 7 ,~0 3a y9 T /8 2 94A.~& 64-XL !a a6 76 9v~ &-~ N~ ~`/~ 3'V NH ~~ut~ + 5 a 73 -7'7 73 Ea 3 3 48 ag as 16 o-: L, a 701- /a 7 ad a5 a8 a9 ay a9 ay -a3 aa a6 a3 7t 77 7 s 7y 77 7 ~5 U ~J ?~., [iiLi ~ a 7 3.~ iy y`3 a 3 y3 01 -Yo 571 39 a6 3y 311 ay 37 yo 3a LLJ  ~ yy ~ 4/ 5o S3 .5y 6a ta59 ~ ~&I ~~ {_ F 4,;a '7D.t- 51440 9404 7ot yo - ,&~ -~7 o* 5y* ~9-1` Sy~` ~~~ &Ail ~ /3 /y 13 13 13 /a l ~ .zj~ )t q /3 r 3~~ ~ ~ / y 13 7 /y 9 / a 13 / 0-2 /y 13 7 9 6 8 3 7 6 9 P ~:.~ ~ ~ ~~I _ ~. wo~i~~/ ~ ao /7 a~ /8 ;~dj- aI -3~ /7 /71 0 32- 41 a94` 30~` iC/ aG 31 a- /6 a3 ay - G /9 /7 /s i ~ 13 /a /~) 4 -4~- ~a-~ 4riX-1'4 A-c~ ' ' L T -Afii~.' ,..~. y3 S/ 57 y 9 y~2 S

In [0]:
# Naive vocabulary counting: splitting on space character

# Conventions:
# index 0 is reserved for unknown tokens that will be mapped to `__UNK__`.
# other special token come just after (eg. `__NUM__` for numbers).
# other classic token are inserted in order for reverse dictionnary purpose.
__UNK__ = '__UNK__'

# always put __UNK__ first when redefining special char.
DEFAULT_SPECIALS = [__UNK__]

def unknown_wrapped(f):
    """
    A wrapper around a tokenizer that provides a vocabulary parameter.
    If vocabulary is not None, generated token are checked against the vocabulary.
    If it does not contains this specific token, __UNK__ is yielded instead.
    """
    def wrapped(text, vocabulary=None):
        gen = f(text)
        if vocabulary is None:
            yield from gen
        else:
            for token in gen:
                if token not in vocabulary:
                    yield __UNK__
                else:
                    yield token

    return wrapped

# the most basic tokenizer: split on space charater
@unknown_wrapped
def basic_tokenizer(text: str) -> Iterator[Token]:
    yield from text.split(" ")


# the most basic preprocess: no preprocess done
def no_preprocess(text: str) -> str:
    return text


def compute_vocabulary(input_text: List[InputText],
                       max_size=1000,
                       tokenize_f=basic_tokenizer,
                       specials=DEFAULT_SPECIALS,
                       preprocess_f=no_preprocess) -> Tuple[Vocabulary, Counter[Token]]:
    """
    Given a preprocessing function, a tokenizer and a collection of special tokens,
    compute the vocabulary mapping and a corresponding tokenizer and number of occurences of tokens.

    main steps:
      - First preprocessing is applied to each text.
      - Then each preprocessed text is splitted into tokens.
      - All tokens from all text are chained together and empty tokens are filtered out.
      - Tokens are sorted by reversed number of occurences in the vocabulary.
      - A special treatment is reverved for special tokens.

    return:
      - vocabulary: A mapping from tokens to their corresponding index. Indices start at 0 and end at max_size-1
      - word_tokenizer_f: A tokenizer function that only produce tokens included in the vocabulary. (__UNK__ is returned if the token is not in the vocabulary)
      - token_occurences_d: A mapping from tokens to their corresponding number of occurences in the texts.
    """
    token_occurences_d = collections.Counter(i for i in itertools.chain(*map(tokenize_f,        # Split into token
                                                                            map(preprocess_f,  # Preprocess text before tokenization
                                                                                x_train)))
                                            if i)                                              # Filter out empty strings


    # compute number of missing special tokens in the word occurences
    no_missing_special = sum(1 for sp in specials if not sp in token_occurences_d)
    vocabulary = collections.OrderedDict([(word, i) for i, (word, _) in enumerate(token_occurences_d.most_common(max_size - no_missing_special), no_missing_special)])

    # Put special tokens at the beginning of the vocabulary
    i = 1
    for sp in reversed(specials):
        if sp not in vocabulary:
            vocabulary[sp] = no_missing_special - i
            vocabulary.move_to_end(sp, last=False)
            i += 1

    # Specialize the given tokenizer for the computed vocabulary
    word_tokenizer_f = partial(tokenize_f, vocabulary=vocabulary)

    return vocabulary, word_tokenizer_f, token_occurences_d

In [0]:
# Some plotting functions to display the vocabulary

def plot_token_count(token_count, n=1000):
    """
    Plots occurences for n most common tokens
    """
    plt.plot(list(range(n)), [i for _, i in token_count.most_common(n)])

    plt.title(f"Evolution of occurences of the {n} most frequent tokens")
    plt.show()

def plot_accumulated_token_count(token_count, n=1000):
    """
    Plots accumulated occurences divided by token number of tokens for n most common tokens 
    """
    total_tokens = sum(token_count.values()) / 100
    plt.plot(list(range(n)), list(itertools.accumulate(i / total_tokens for _, i in token_count.most_common(n))))

    plt.title(f"Evolution of cumulated occurences of the {n} most frequent tokens divided by total number of tokens")
    plt.show()

In [0]:
## Use the function `compute_vocabulary` to get vocabulary and token_count object. ##

## What are the most common tokens ? ##

## Plot token occurences and cumulated token occurences. ##

## How many percentages of the token are included in the vocabulary if we use 1_000 tokens ? 10_000 ? 100_000 ? ##


In [0]:
#@title
# Use the function `compute_vocabulary` to get vocabulary and token_count object.
*_, word_count = compute_vocabulary(x_train, max_size=10**4)


# What are the most common tokens
print(list(word_count.most_common(100)))

# Plot token occurences and cumulated token occurences.
plot_token_count(word_count, n=10000)
plot_accumulated_token_count(word_count, n=10000)

# How many percentages of the token are included in the vocabulary if we use 1_000 tokens ? 10_000 ? 100_000 ?
for size in [10**3, 10**4, 10**5]:
  print(f"With a vocabulary of size {size}, you cover {sum(t for _, t in word_count.most_common(size)) / sum(word_count.values()) * 100:0.2f}% of the encountered tokens")

## We must reduce vocabulary size

To clean the texts from all the noise produced by the OCR, we can use advanced preprocessing and tokenizer.

The job of the **preprocessing** is to prepare the text to be splitted on space characters. An example of simple preprocessing would be:
* Use lowercase only.
* Remove useless characters that are unlikely to really be in the document and likely to be noise produced by OCR.
* Introduce additional spaces between words and punctuation so "This is a cat." is transformed into "This is a cat ." (note the space at the end).

The job of the **tokenizer** is to split sentences into separate tokens. Our vocabulary is polluted by multiple punctuation and numbers. A simple workaround is to create special tokens that represent a group of symbols. For example we could introduce a `__NUM__` token wich represent all numbers. Any number in the text would be mapped to `__NUM__`.

The resulting vocabulary should include much less noise and a lot a words !







In [0]:
## Implement the preprocessor described above. Feel free to add other steps in the preprocessing. ##

def my_preprocess(text: str) -> str:
  # Implement here
  return text

## Implement the tokenizer described above. Examples of groups of tokens are: numbers, punctuation, mix of those... ##

# Some categories of character
ALPHA = {i for i in "azertyuiopqsdfghjklmwxcvbn"}
DIGIT = {i for i in "1234567890"}
PUNCT = {i for i in r".?,!:$£@/-\\"}


__NUM__ = "__NUM__"  # Numbers
__PUN__ = "__PUN__"  # Punctuation
__MIX__ = "__MIX__"  # Mix of numbers and puntuation
MY_SPECIALS = [__UNK__, __NUM__, __PUN__, __MIX__]

@unknown_wrapped
def my_tokenizer(text: str) -> Iterator[Token]:
  for word in text.split(" "):
    # Implement here, use keyword `yield` instead of return to produce an iterator over your tokens
    pass

In [0]:
#@title
# Implement the preprocessor described above. Feel free to add other steps in the preprocessing.

def regex_preprocess(text: str) -> str:
    text = re.sub(r"(?<=[a-z])([.?,!:])", r" \1", text.lower())  # Add an extra space around punctuation (usefull in english)
    text = re.sub(r"([.?,!:])(?=[a-z])", r"\1 ", text)           # Add an extra space around punctuation (usefull in english)
    return re.sub(r"[^ a-z0-9.?,!:$£@/\-\\]", " ", text)         # Remove any non basic character


# Implement the tokenizer described above. Examples of groups of tokens are: numbers, punctuation, mix of those...
ALPHA = {i for i in "azertyuiopqsdfghjklmwxcvbn"}
DIGIT = {i for i in "1234567890"}
PUNCT = {i for i in r".?,!:$£@/-\\"}


__NUM__ = "__NUM__"
__PUN__ = "__PUN__"
__MIX__ = "__MIX__"
MY_SPECIALS = [__UNK__, __NUM__, __PUN__, __MIX__]


@unknown_wrapped
def special_tokenizer(text: str) -> Iterator[Token]:
    for word in text.split(" "):
        if not word in STOP_WORD_S:
            if all(c in ALPHA for c in word):
                yield word
            elif all(c in DIGIT for c in word):
                yield __NUM__
            elif all(c in PUNCT for c in word):
                yield __PUN__
            else:
                yield __MIX__


In [0]:
# Same Questions as before but with your new preprocessing and tokenizer

# What are the most common tokens

# Plot token occurences and cumulated token occurences.

# How many percentages of the token are included in the vocabulary if we use 1_000 tokens ? 10_000 ? 100_000 ?

In [0]:
#@title
# Same Questions as before but with your new preprocessing and tokenizer
*_, word_count = compute_vocabulary(x_train, max_size=10**4, tokenize_f=special_tokenizer, specials=MY_SPECIALS, preprocess_f=regex_preprocess)


# What are the most common tokens
print(list(word_count.most_common(100)))

# Plot token occurences and cumulated token occurences.
plot_token_count(word_count, n=10000)
plot_accumulated_token_count(word_count, n=10000)

# How many percentages of the token are included in the vocabulary if we use 1_000 tokens ? 10_000 ? 100_000 ?
for size in [10**3, 10**4, 10**5]:
  print(f"With a vocabulary of size {size}, you cover {sum(t for _, t in word_count.most_common(size)) / sum(word_count.values()) * 100:0.2f}% of the encountered tokens")

## Basic Model: Bag of Words
To implement a Bag of Word model, we first need to convert sentences to vector using a CountVectorizer.

It basically counts how many times each token appears in a text and put each value at each token's index.


In [38]:
VOCABULARY_SIZE = 10**5
## Use skleanr's CountVectorizer to implement a vectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html ##
# Remember to specify the vocabulary, the tokenizer and the preprocessor with your own to erase sklearn's defaults


vocabulary, tokenizer_f, _ = compute_vocabulary(x_train, max_size=VOCABULARY_SIZE, tokenize_f=my__tokenizer, preprocess_f=my_preprocess, specials=MY_SPECIALS)
# Create the CountVectorizer here

With a vocabulary of size 10000, you cover 89.11%


In [44]:
#@title
VOCABULARY_SIZE = 10**5
## Use skleanr's CountVectorizer to implement a vectorizer: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html ##

# Remember to specify the vocabulary, the tokenizer and the preprocessor with your own to erase sklearn's defaults

vocabulary, tokenizer_f, _ = compute_vocabulary(x_train, max_size=VOCABULARY_SIZE, tokenize_f=special_tokenizer, preprocess_f=regex_preprocess, specials=MY_SPECIALS)
vectorizer = sklearn.feature_extraction.text.CountVectorizer(vocabulary=vocabulary, tokenizer=tokenizer_f, preprocessor=regex_preprocess, binary=True)

With a vocabulary of size 100000, you cover 95.66%


The count vectorizer should have its features identicall to our vocabulary.

Try to preprocess a text and give it to the count vectorizer.

In [0]:
## Check if your vectorizer is correct with a small sentence ##

In [0]:
#@title
## Check if your vectorizer is correct ##

print(vectorizer.get_feature_names() == list(vocabulary.keys()))

print(regex_preprocess("12 / tobacco."))
print(vectorizer.fit_transform(["12 / tobacco."]))

print(vocabulary["tobacco"])


We will now start building our model.

You can use any optimizer (`SGD`, `RMSProp`, …) but `Adam` is one of the best currently. It converges faster and to a better minimum than other optimizers most of the times

We are doing a classification problem, use `sparse_categorical_crossentropy` as your loss and `sparse_categorical_accuracy` as your metric.


In [0]:
## Create a Sequential model that takes a sentence vector in input (size=VOCABULARY_SIZE) and returns a vector of size NUM_CLASSES. ##
# Find help here: https://keras.io/models/sequential/
# and here: https://www.tensorflow.org/tutorials/keras/classification

# Create your model here and compile it.

model.summary()

In [0]:
#@title
## Create a Sequential model that takes a sentence vector in input (size=VOCABULARY_SIZE) and returns a vector of size NUM_CLASSES. ##
# Find help here: https://keras.io/models/sequential/
# and here: https://www.tensorflow.org/tutorials/keras/classification

model = keras.models.Sequential([
    keras.layers.Dense(input_dim=VOCABULARY_SIZE, units=128, activation="relu"),
    keras.layers.Dense(units=32, activation="relu"),
    keras.layers.BatchNormalization(),
    keras.layers.Dense(units=32, activation="relu"),
    keras.layers.Dense(units=NUM_CLASSES, activation="softmax"),
])

optimizer = keras.optimizers.Adam(learning_rate=0.0001)

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["sparse_categorical_accuracy"])

model.summary()

We are now ready to train our model !

In [0]:
model.fit(vectorizer.fit_transform(x_train), np.array(y_train), epochs=10, batch_size=256, validation_split=0.1, shuffle=True, verbose=1)

We can also evaluate our model on the test set.

In [60]:
model.evaluate(vectorizer.fit_transform(x_test), y_test, verbose=2)

y_pred = model.predict_classes(vectorizer.fit_transform(x_test))

print(pd.DataFrame(sklearn.metrics.confusion_matrix(y_test, y_pred), columns=CLASS_NAMES))

3323/3323 - 3s - loss: 0.3964 - sparse_categorical_accuracy: 0.8703
   form  email  handwritten  advertisement  invoice
0   538     19           31             53       43
1    17    634            3              8        3
2    13      5          514            121       14
3     5      2           34            622        7
4    10      2           19             22      584


# A bit more complex: Recurrent Neural Networks and Long-Short Term Memory

In [36]:
VOCABULARY_SIZE = 10**4
EMBEDDING_SIZE = 64
MAX_SEQ_LEN = 1 * 10**2

__PAD__ = "__PAD__"
PAD_SPECIALS = MY_SPECIALS + [__PAD__]

vocabulary, tokenizer_f, _ = compute_vocabulary(x_train, max_size=VOCABULARY_SIZE, tokenize_f=special_tokenizer, preprocess_f=regex_preprocess, specials=PAD_SPECIALS)

def build_vectorizer(vocabulary, tokenizer_f, preprocess_f):
    def rnn_vectorizer(x: List[InputText]):
        return [[vocabulary[token] for token in tokenizer_f(preprocess_f(text))] for text in x]
    return rnn_vectorizer

vectorizer = build_vectorizer(vocabulary, tokenizer_f, regex_preprocess)

print(vectorizer(x_test[0:2]), sep="\n")

With a vocabulary of size 10000, you cover 91.00%
[[0, 0, 2, 0, 2, 0, 0, 2, 0, 2, 5, 0, 0, 0, 0, 0, 0, 4, 2, 152, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 0, 8, 0, 0, 4, 0, 0, 2, 0, 0, 0, 0, 0, 0, 203, 0, 0, 0, 2, 3, 11, 0, 0, 2, 0, 2, 0], [13, 4, 0, 0, 0, 3, 2, 3, 2, 0, 20, 2, 3, 3, 2, 3, 2, 0, 0, 0, 4, 2, 0, 2, 13, 3, 3, 3, 0]]


In [0]:
pad_f = partial(keras.preprocessing.sequence.pad_sequences, maxlen=MAX_SEQ_LEN, padding="post", truncating="post", value=vocabulary[__PAD__])

rnn_x_train, rnn_x_test = map(pad_f, map(vectorizer, [x_train, x_test]))
rnn_y_train, rnn_y_test = map(np.array, [y_train, y_test])

In [38]:
model = keras.models.Sequential([
    keras.layers.Embedding(VOCABULARY_SIZE, EMBEDDING_SIZE, input_length=MAX_SEQ_LEN),
    keras.layers.Bidirectional(keras.layers.GRU(128, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.GRU(128, dropout=0.5)),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(units=NUM_CLASSES, activation="softmax")
])

optimizer = keras.optimizers.Adam(learning_rate=0.005)

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["sparse_categorical_accuracy"])

model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 64)           640000    
_________________________________________________________________
bidirectional_9 (Bidirection (None, 100, 256)          148992    
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 256)               296448    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 1285      
Total params: 1,086,725
Trainable params: 1,086,725
Non-trainable params: 0
_________________________________________________________________


In [39]:
model.fit(rnn_x_train, rnn_y_train, epochs=10, batch_size=128, validation_split=0.1, verbose=1, shuffle=True)

Train on 11656 samples, validate on 1296 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fda586b45c0>

In [40]:
model.evaluate(rnn_x_test, rnn_y_test, verbose=2)

rnn_pred = model.predict_classes(rnn_x_test)
print(class_names)
print(sklearn.metrics.confusion_matrix(y_test, rnn_pred))

3323/1 - 4s - loss: 2.3444 - sparse_categorical_accuracy: 0.7951
['form', 'email', 'handwritten', 'advertisement', 'invoice']
[[459  30  47  55  93]
 [ 15 628  11   7   4]
 [ 15  20 457 128  47]
 [  7  24  45 572  22]
 [ 30   8  36  37 526]]


In [24]:
model = keras.models.Sequential([
    keras.layers.Embedding(VOCABULARY_SIZE, EMBEDDING_SIZE, input_length=MAX_SEQ_LEN),
    keras.layers.Bidirectional(keras.layers.LSTM(128, return_sequences=True)),
    keras_self_attention.SeqSelfAttention(attention_activation='sigmoid'),
    keras.layers.Bidirectional(keras.layers.LSTM(128)),
    keras.layers.Dense(units=NUM_CLASSES, activation="softmax")
])

optimizer = keras.optimizers.Adam(learning_rate=0.005)

model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["sparse_categorical_accuracy"])

model.summary()

NameError: ignored

In [0]:
model.fit(rnn_x_train, rnn_y_train, epochs=10, batch_size=128, validation_split=0.1, verbose=1, shuffle=True)

In [0]:
model.evaluate(rnn_x_test, rnn_y_test, verbose=2)

rnn_pred = model.predict_classes(rnn_x_test)
print(class_names)
print(sklearn.metrics.confusion_matrix(y_test, rnn_pred))