# Run the model

### Import necessary modules

In [None]:
#!/usr/bin/env python
import argparse
import os
import csv
import numpy as np
#try:
#    import better_exceptions
#except ImportError:
#    pass
from tqdm import trange
import tensorflow as tf
from src.model import crnn_fn
from src.data_handler import data_loader
from src.data_handler import preprocess_image_for_prediction

from src.config import Params, Alphabet, import_params_from_json

### Set up model parameters

In [None]:
# parser = argparse.ArgumentParser()
# parser.add_argument('-ft', '--csv_files_train', required=True, type=str, help='CSV filename for training',
#                     nargs='*', default=None)
# parser.add_argument('-fe', '--csv_files_eval', type=str, help='CSV filename for evaluation',
#                     nargs='*', default=None)
# parser.add_argument('-o', '--output_model_dir', required=True, type=str,
#                     help='Directory for output', default='./estimator')
# parser.add_argument('-n', '--nb-epochs', type=int, default=30, help='Number of epochs')
# parser.add_argument('-g', '--gpu', type=str, help="GPU 0,1 or '' ", default='')
# parser.add_argument('-p', '--params-file', type=str, help='Parameters filename', default=None)
# args = vars(parser.parse_args())

# Root of the tf-crnn checkout. Every path below is derived from it, so the
# location only has to be edited in one place.
project_root = "/home/danny/Repos/text_recognition/tf-crnn-master"

# CSV files listing the training and evaluation samples. They are joined from
# components so that no accidental doubled separator ends up in the path.
csv_files_train = os.path.join(project_root, "data", "train.csv")
csv_files_eval = os.path.join(project_root, "data", "valid.csv")
# Directory used as the estimator's model_dir and as parent of the export folder.
output_model_dir = os.path.join(project_root, "estimator")
n_epochs = 30  # total number of training epochs
gpu = "0"  # value later assigned to CUDA_VISIBLE_DEVICES

The cell below sets the training hyperparameters, input shape, alphabet and file paths used by the network model

In [None]:
# Hyperparameters and file locations for this experiment, collected in one
# mapping and unpacked into the project's Params container.
experiment_settings = dict(
    train_batch_size=64,
    eval_batch_size=64,
    learning_rate=1e-3,  # 1e-3 recommended
    learning_decay_rate=0.95,
    learning_decay_steps=5000,
    evaluate_every_epoch=5,
    save_interval=5e3,
    input_shape=(117, 1669),
    optimizer='adam',
    digits_only=False,
    alphabet=Alphabet.MY_ALPHABET,
    alphabet_decoding='same',
    csv_delimiter='\t',
    csv_files_eval=csv_files_eval,
    csv_files_train=csv_files_train,
    output_model_dir=output_model_dir,
    n_epochs=n_epochs,
    gpu=gpu,
)
parameters = Params(**experiment_settings)

# Dict handed to the estimator as `params`; the model function reads the
# Params object back from the 'Params' key.
model_params = {'Params': parameters}

# presumably persists the settings for this run -- see src.config to confirm
parameters.export_experiment_params()

In [None]:
# Expose only the GPU(s) selected in the parameters to TensorFlow.
os.environ['CUDA_VISIBLE_DEVICES'] = parameters.gpu
# Session options: a 0.8 per-process GPU memory fraction with allow_growth enabled.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8,
                            allow_growth=True)
config_sess = tf.ConfigProto(gpu_options=gpu_options)

### Create the estimator (including the model, below)

In [None]:
# Config estimator
# RunConfig.replace() does not modify the object it is called on; it returns a
# new RunConfig carrying the overrides. The result therefore has to be kept,
# otherwise the estimator silently runs with the default configuration.
est_config = tf.estimator.RunConfig().replace(
    keep_checkpoint_max=10,
    # save_interval may be given as a float (e.g. 5e3); a step count is an integer.
    save_checkpoints_steps=int(parameters.save_interval),
    session_config=config_sess,
    save_checkpoints_secs=None,  # checkpoint by step count only
    save_summary_steps=1000,
    model_dir=parameters.output_model_dir)

# crnn_fn builds the graph; model_params carries the Params object to it.
estimator = tf.estimator.Estimator(model_fn=crnn_fn,
                                   params=model_params,
                                   model_dir=parameters.output_model_dir,
                                   config=est_config
                                   )

### Run the estimator

In [None]:
# Count number of image filenames in csv (one row per sample) without
# materialising the whole file as a list.
with open(parameters.csv_files_eval, 'r', encoding='utf8') as csvfile:
    reader = csv.reader(csvfile, delimiter=parameters.csv_delimiter)
    n_samples = sum(1 for _ in reader)

# Number of whole batches in the evaluation set, as a plain int.
# Estimator.evaluate() rejects a step count of 0, so when the set is smaller
# than one batch fall back to None, which evaluates until the single-epoch
# input is exhausted.
eval_steps = (n_samples // parameters.eval_batch_size) or None

export_dir = os.path.join(parameters.output_model_dir, 'export')

try:
    # Alternate between `evaluate_every_epoch` epochs of training and one
    # evaluation pass until `n_epochs` epochs have been run.
    for _ in trange(0, parameters.n_epochs, parameters.evaluate_every_epoch):
        estimator.train(input_fn=data_loader(csv_filename=parameters.csv_files_train,
                                             params=parameters,
                                             batch_size=parameters.train_batch_size,
                                             num_epochs=parameters.evaluate_every_epoch,
                                             data_augmentation=True,
                                             image_summaries=True))
        estimator.evaluate(input_fn=data_loader(csv_filename=parameters.csv_files_eval,
                                                params=parameters,
                                                batch_size=parameters.eval_batch_size,
                                                num_epochs=1),
                           steps=eval_steps
                           )

except KeyboardInterrupt:
    # Stop training early; the model is still exported exactly once below.
    print('Interrupted')

estimator.export_savedmodel(export_dir,
                            preprocess_image_for_prediction(min_width=10))
print('Exported model to {}'.format(export_dir))

# Create the model architecture

### The start of the crnn, split into parts

In [None]:
# def crnn_fn(features, labels, mode, params):
"""
Start of the model function that is handed to the estimator.

:param features: dict with the keys 'images', 'images_widths' and 'filenames'
:param labels: labels. flattened (1D) array with encoded label (one code per character)
:param mode: one of the tf.estimator.ModeKeys values
:param params: dict holding the experiment's Params object under the key 'Params'
:return:
"""

parameters = params.get('Params')
assert isinstance(parameters, Params)

# Dropout keeps 70% of the activations while training and everything otherwise.
is_training = (mode == tf.estimator.ModeKeys.TRAIN)
parameters.keep_prob_dropout = 0.7 if is_training else 1.0

conv = deep_cnn(features['images'], is_training, summaries=False)

**The deep_cnn code (line immediately above) is in the cells below**

In [None]:
# needed for quickly making convolutional layers
def weightVar(shape, mean=0.0, stddev=0.02, name='weights'):
    """Create a variable initialised from a truncated normal distribution.

    :param shape: shape of the variable
    :param mean: mean of the truncated normal
    :param stddev: standard deviation of the truncated normal
    :param name: name given to the created tf.Variable
    :return: the new tf.Variable
    """
    init_w = tf.truncated_normal(shape=shape, mean=mean, stddev=stddev)
    return tf.Variable(init_w, name=name)


def biasVar(shape, value=0.0, name='bias'):
    """Create a variable filled with a constant value.

    :param shape: shape of the variable
    :param value: constant every entry is initialised to
    :param name: name given to the created tf.Variable
    :return: the new tf.Variable
    """
    init_b = tf.constant(value=value, shape=shape)
    return tf.Variable(init_b, name=name)


def conv2d(input, filter, strides=None, padding='SAME', name=None):
    """Thin wrapper around tf.nn.conv2d that defaults to unit strides.

    :param input: tensor to convolve
    :param filter: convolution kernel tensor
    :param strides: strides forwarded to tf.nn.conv2d; [1, 1, 1, 1] when omitted
    :param padding: padding scheme forwarded to tf.nn.conv2d
    :param name: optional name for the op
    :return: result of tf.nn.conv2d
    """
    if strides is None:
        # Built per call rather than sharing one mutable list as the default argument.
        strides = [1, 1, 1, 1]
    return tf.nn.conv2d(input, filter, strides=strides, padding=padding, name=name)

In [None]:
# def deep_cnn(input_imgs: tf.Tensor, is_training: bool, summaries: bool=True) -> tf.Tensor:
# Only inputs whose last dimension is 1 or 3 are supported; that size becomes
# the number of input channels of the first convolution.
input_tensor = input_imgs
last_dim = input_tensor.shape[-1]
for supported_channels in (1, 3):
    if last_dim == supported_channels:
        input_channels = supported_channels
        break
else:
    raise NotImplementedError

# Following source code, not paper

##### Convolution layer 1
input: image tensor  
filter: [3,3,input_channels,64]  
bias: 64  
activation: relu  
pooling: [1,2,2,1], [1,2,2,1]

In [None]:
# with tf.variable_scope('deep_cnn'):
# - conv1 - maxPool2x2
with tf.variable_scope('layer1'):
    # 3x3 convolution (input_channels -> 64 filters), bias, ReLU, then max
    # pooling with a 2x2 window and stride 2 in both spatial dimensions.
    W = weightVar([3, 3, input_channels, 64])
    b = biasVar([64])
    conv = conv2d(input_tensor, W, name='conv')
    out = tf.nn.bias_add(conv, b)
    conv1 = tf.nn.relu(out)
    pool1 = tf.nn.max_pool(conv1, [1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool')

    if summaries:
        # Summarise the variables created above directly. Searching
        # tf.global_variables() for a hard-coded 'deep_cnn/layer1/...' name
        # raises IndexError whenever the enclosing scope is not exactly 'deep_cnn'.
        tf.summary.histogram('weights', W)
        tf.summary.histogram('bias', b)

##### Convolution layer 2
input: pool1 from convolution 1  
filter: [3,3,64,128]  
bias: 128  
activation: relu  
pooling: [1,2,2,1], [1,2,2,1]

In [None]:
# - conv2 - maxPool 2x2
with tf.variable_scope('layer2'):
    # 3x3 convolution (64 -> 128 filters), bias, ReLU, then max pooling with a
    # 2x2 window and stride 2 in both spatial dimensions.
    W = weightVar([3, 3, 64, 128])
    b = biasVar([128])
    conv = conv2d(pool1, W)
    out = tf.nn.bias_add(conv, b)
    conv2 = tf.nn.relu(out)
    pool2 = tf.nn.max_pool(conv2, [1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME', name='pool1')

    if summaries:
        # Summarise the variables created above directly. Searching
        # tf.global_variables() for a hard-coded 'deep_cnn/layer2/...' name
        # raises IndexError whenever the enclosing scope is not exactly 'deep_cnn'.
        tf.summary.histogram('weights', W)
        tf.summary.histogram('bias', b)

##### Convolution layer 3
input: pool2 from convolution 2  
filter: [3,3,128,256]  
bias: 256  
normalization: batch normalization  
activation: relu

In [None]:
# - conv3 - w/batch-norm (as source code, not paper)
with tf.variable_scope('layer3'):
    # 3x3 convolution (128 -> 256 filters), bias, batch normalisation over the
    # last axis (mode selected by is_training), then ReLU. No pooling.
    W = weightVar([3, 3, 128, 256])
    b = biasVar([256])
    conv = conv2d(pool2, W)
    out = tf.nn.bias_add(conv, b)
    b_norm = tf.layers.batch_normalization(out, axis=-1,
                                           training=is_training, name='batch-norm')
    conv3 = tf.nn.relu(b_norm, name='ReLU')

    if summaries:
        # Summarise the variables created above directly. Searching
        # tf.global_variables() for a hard-coded 'deep_cnn/layer3/...' name
        # raises IndexError whenever the enclosing scope is not exactly 'deep_cnn'.
        tf.summary.histogram('weights', W)
        tf.summary.histogram('bias', b)

##### Convolution layer 4
input: conv3 from convolution 3  
filter: [3,3,256,256]  
bias: 256  
activation: relu  
pooling: [1,2,2,1], [1,2,1,1]

In [None]:
# - conv4 - maxPool 2x1
with tf.variable_scope('layer4'):
    # 3x3 convolution (256 -> 256 filters), bias, ReLU, then max pooling with a
    # 2x2 window but strides [1, 2, 1, 1]: stride 2 on the first spatial axis
    # and stride 1 on the second.
    W = weightVar([3, 3, 256, 256])
    b = biasVar([256])
    conv = conv2d(conv3, W)
    out = tf.nn.bias_add(conv, b)
    conv4 = tf.nn.relu(out)
    pool4 = tf.nn.max_pool(conv4, [1, 2, 2, 1], strides=[1, 2, 1, 1],
                           padding='SAME', name='pool4')

    if summaries:
        # Summarise the variables created above directly. Searching
        # tf.global_variables() for a hard-coded 'deep_cnn/layer4/...' name
        # raises IndexError whenever the enclosing scope is not exactly 'deep_cnn'.
        tf.summary.histogram('weights', W)
        tf.summary.histogram('bias', b)

##### Convolution layer 5
input: pool4 from convolution 4  
filter: [3,3,256,512]  
bias: 512  
normalization: batch normalization  
activation: relu

In [None]:
# - conv5 - w/batch-norm
with tf.variable_scope('layer5'):
    # 3x3 convolution (256 -> 512 filters), bias, batch normalisation over the
    # last axis (mode selected by is_training), then ReLU. No pooling.
    W = weightVar([3, 3, 256, 512])
    b = biasVar([512])
    conv = conv2d(pool4, W)
    out = tf.nn.bias_add(conv, b)
    b_norm = tf.layers.batch_normalization(out, axis=-1,
                                           training=is_training, name='batch-norm')
    conv5 = tf.nn.relu(b_norm)

    if summaries:
        # Summarise the variables created above directly. Searching
        # tf.global_variables() for a hard-coded 'deep_cnn/layer5/...' name
        # raises IndexError whenever the enclosing scope is not exactly 'deep_cnn'.
        tf.summary.histogram('weights', W)
        tf.summary.histogram('bias', b)

##### Convolution layer 6
input: conv5 from convolution 5  
filter: [3,3,512,512]  
bias: 512  
activation: relu  
pooling: [1,2,2,1], [1,2,1,1]

In [None]:
# - conv6 - maxPool 2x1 (as source code, not paper)
with tf.variable_scope('layer6'):
    # 3x3 convolution (512 -> 512 filters), bias, ReLU, then max pooling with a
    # 2x2 window but strides [1, 2, 1, 1]: stride 2 on the first spatial axis
    # and stride 1 on the second.
    W = weightVar([3, 3, 512, 512])
    b = biasVar([512])
    conv = conv2d(conv5, W)
    out = tf.nn.bias_add(conv, b)
    conv6 = tf.nn.relu(out)
    pool6 = tf.nn.max_pool(conv6, [1, 2, 2, 1], strides=[1, 2, 1, 1],
                           padding='SAME', name='pool6')

    if summaries:
        # Summarise the variables created above directly. Searching
        # tf.global_variables() for a hard-coded 'deep_cnn/layer6/...' name
        # raises IndexError whenever the enclosing scope is not exactly 'deep_cnn'.
        tf.summary.histogram('weights', W)
        tf.summary.histogram('bias', b)

##### Convolution layer 7
input: pool6 from convolution 6  
filter: [2,2,512,512] (padding: VALID)  
bias: 512  
normalization: batch normalization  
activation: relu

In [None]:
# - conv 7 - w/batch-norm (as source code, not paper)
with tf.variable_scope('layer7'):
    # 2x2 convolution (512 -> 512 filters) with VALID padding, bias, batch
    # normalisation over the last axis (mode selected by is_training), then ReLU.
    W = weightVar([2, 2, 512, 512])
    b = biasVar([512])
    conv = conv2d(pool6, W, padding='VALID')
    out = tf.nn.bias_add(conv, b)
    b_norm = tf.layers.batch_normalization(out, axis=-1,
                                           training=is_training, name='batch-norm')
    conv7 = tf.nn.relu(b_norm)

    if summaries:
        # Summarise the variables created above directly. Searching
        # tf.global_variables() for a hard-coded 'deep_cnn/layer7/...' name
        # raises IndexError whenever the enclosing scope is not exactly 'deep_cnn'.
        tf.summary.histogram('weights', W)
        tf.summary.histogram('bias', b)

##### Final network output
input: conv7 from convolution 7  
reshape from [batch, height, width, features] to [batch, width, height x features]

In [None]:
cnn_net = conv7

with tf.variable_scope('Reshaping_cnn'):
    shape = cnn_net.get_shape().as_list()  # [batch, height, width, features]
    transposed = tf.transpose(cnn_net, perm=[0, 2, 1, 3],
                              name='transposed')  # [batch, width, height, features]
    conv_reshaped = tf.reshape(transposed, [shape[0], -1, shape[1] * shape[3]],
                               name='reshaped')  # [batch, width, height x features]

return conv_reshaped

**Back to crnn function**

In [None]:
# Feed the CNN feature sequence to the recurrent part of the network. It returns
# the per-time-step class scores in time-major layout (`logprob`) and the
# per-time-step argmax class indices (`raw_pred`).
logprob, raw_pred = deep_bidirectional_lstm(conv, params=parameters, summaries=False)

**Now to deep_bidirectional_lstm**

Create a stacked bidirectional RNN with 2 layers, each built from a forward and a backward basic LSTM cell with 256 hidden units  
input: output from deep convolution

In [None]:
# def deep_bidirectional_lstm(inputs: tf.Tensor, params: Params, summaries: bool=True) -> tf.Tensor:
# Prepare data shape to match `bidirectional_rnn` function requirements
# Current data input shape: (batch_size, n_steps, n_input) "(batch, time, height)"

# Hidden size of each stacked layer (two layers of 256 units)
list_n_hidden = [256, 256]

with tf.name_scope('deep_bidirectional_lstm'):
    def _make_cells():
        # One BasicLSTMCell per entry of list_n_hidden
        return [BasicLSTMCell(n_units, forget_bias=1.0) for n_units in list_n_hidden]

    # Forward cells are created first, then the backward ones
    forward_cells = _make_cells()
    backward_cells = _make_cells()

    # Only the stacked outputs are kept; the final states are discarded
    lstm_net, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
        forward_cells,
        backward_cells,
        inputs,
        dtype=tf.float32
    )

Dropout layer based on dropout probability in parameters

In [None]:
    # Dropout layer
    # keep_prob is read from params.keep_prob_dropout, which the model function
    # sets to 0.7 in TRAIN mode and 1.0 (no dropout) otherwise.
    lstm_net = tf.nn.dropout(lstm_net, keep_prob=params.keep_prob_dropout)

Reshape the rnn from [batch, width, 2\*n_hidden] to [batch x width, 2\*n_hidden]

In [None]:
    # Flatten batch and time into one axis so the fully connected layer can be
    # applied to every time step with a single matmul.
    with tf.variable_scope('Reshaping_rnn'):
        # `shape` is kept: its first entry is reused later to restore the batch axis.
        shape = lstm_net.get_shape().as_list()  # [batch, width, 2*n_hidden]
        rnn_reshaped = tf.reshape(lstm_net, [-1, shape[-1]])  # [batch x width, 2*n_hidden]

Create a fully connected (linear) layer that maps the 2\*n_hidden RNN features of every time step to n_classes scores

In [None]:
    with tf.variable_scope('fully_connected'):
        # Linear projection from the concatenated forward/backward features
        # (2 * last hidden size) to one score per class.
        W = weightVar([list_n_hidden[-1]*2, params.n_classes])
        b = biasVar([params.n_classes])
        fc_out = tf.nn.bias_add(tf.matmul(rnn_reshaped, W), b)

        if summaries:
            # Summarise the variables created above directly. Searching
            # tf.global_variables() for a hard-coded
            # 'deep_bidirectional_lstm/fully_connected/...' name raises IndexError
            # as soon as the actual variable name differs.
            tf.summary.histogram('weights', W)
            tf.summary.histogram('bias', b)

Reshape fully connected output and run through softmax to get predictions

In [None]:
    # Restore the batch axis that was flattened before the fully connected layer;
    # shape[0] is the static batch size taken from the RNN output shape.
    lstm_out = tf.reshape(fc_out, [shape[0], -1, params.n_classes], name='reshape_out')  # [batch, width, n_classes]

    # Most likely class index at every time step (argmax over the class axis).
    raw_pred = tf.argmax(tf.nn.softmax(lstm_out), axis=2, name='raw_prediction')

The other output is the LSTM output transposed to time-major layout, i.e. dim [width(time), batch, n_classes]

In [None]:
    # Swap batch and time axis
    lstm_out = tf.transpose(lstm_out, [1, 0, 2], name='transpose_time_major')  # [width(time), batch, n_classes]

    # Returns the time-major class scores and the batch-major argmax predictions.
    return lstm_out, raw_pred

**Back to crnn function**

Set up for loss and training

In [None]:
# Compute seq_len from image width
# Width reduction applied by the CNN: 2x2 pooling in dimension W on layer 1 and 2
width_reduction = CONST.DIMENSION_REDUCTION_W_POOLING
reduced_widths = tf.divide(features['images_widths'], width_reduction, name='seq_len_input_op')
seq_len_inputs = reduced_widths - 1

predictions_dict = dict(prob=logprob, raw_predictions=raw_pred)
# 'filenames' is optional in the features; it is forwarded only when present.
try:
    filenames = features['filenames']
except KeyError:
    pass
else:
    predictions_dict['filenames'] = filenames

Get the keys (the letters of the alphabet) and the values (the integer codes that stand in for those letters)

In [None]:
if mode != tf.estimator.ModeKeys.PREDICT:
    # Alphabet and codes: lookup-table inputs for the label encoding
    keys = list(parameters.alphabet)  # the letters themselves
    values = parameters.alphabet_codes  # integer representations

Create non-string labels from the keys and values above

In [None]:
    # Convert string label to code label
    with tf.name_scope('str2code_conversion'):
        # Characters missing from the table are mapped to -1
        str2int_init = tf.contrib.lookup.KeyValueTensorInitializer(keys, values)
        table_str2int = tf.contrib.lookup.HashTable(str2int_init, -1)
        # TODO change string split to utf8 split in next tf version
        label_chars = tf.string_split(labels, delimiter='')
        label_codes = table_str2int.lookup(label_chars.values)
        sparse_code_target = tf.SparseTensor(label_chars.indices, label_codes, label_chars.dense_shape)

    # Characters per label: count the occurrences of every row index of the
    # sparse target. minlength is axis 1 of the time-major scores, i.e. the batch axis.
    label_rows = tf.cast(sparse_code_target.indices[:, 0], tf.int32)
    n_sequences = tf.shape(predictions_dict['prob'])[1]
    seq_lengths_labels = tf.bincount(label_rows, minlength=n_sequences)

Use ctc loss on probabilities from lstm output

In [None]:
    # Loss
    # ----
    # >>> Cannot have longer labels than predictions -> error
    # NOTE(review): tf.less_equal only computes a boolean tensor; as a control
    # dependency it does not appear to raise when the condition is false --
    # confirm whether an assertion op (e.g. tf.assert_less_equal) was intended.
    with tf.control_dependencies([tf.less_equal(sparse_code_target.dense_shape[1], tf.reduce_max(tf.cast(seq_len_inputs, tf.int64)))]):
        # CTC loss on the time-major class scores, one value per sequence
        loss_ctc = tf.nn.ctc_loss(labels=sparse_code_target,
                                  inputs=predictions_dict['prob'],
                                  sequence_length=tf.cast(seq_len_inputs, tf.int32),
                                  preprocess_collapse_repeated=False,
                                  ctc_merge_repeated=True,
                                  ignore_longer_outputs_than_inputs=True,  # returns zero gradient in case it happens -> ema loss = NaN
                                  time_major=True)
        # Average over the batch to obtain the scalar training loss
        loss_ctc = tf.reduce_mean(loss_ctc)
        # Print the loss value whenever it is evaluated
        loss_ctc = tf.Print(loss_ctc, [loss_ctc], message='* Loss : ')

Create the learning rate as well as a moving average

In [None]:
    global_step = tf.train.get_or_create_global_step()
    # # Create an ExponentialMovingAverage object
    ema = tf.train.ExponentialMovingAverage(decay=0.99, num_updates=global_step, zero_debias=True)
    # Create the shadow variables, and add op to maintain moving averages
    maintain_averages_op = ema.apply([loss_ctc])
    # Smoothed loss; it is not referenced again in the code shown in this notebook
    loss_ema = ema.average(loss_ctc)

    # Train op
    # --------
    # Staircase decay: the rate is multiplied by learning_decay_rate once every
    # learning_decay_steps global steps.
    learning_rate = tf.train.exponential_decay(parameters.learning_rate, global_step,
                                               parameters.learning_decay_steps, parameters.learning_decay_rate,
                                               staircase=True)

Set up optimizer

In [None]:
    # Select the optimizer named in the parameters.
    if parameters.optimizer == 'ada':
        optimizer = tf.train.AdadeltaOptimizer(learning_rate)
    elif parameters.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.5)
    elif parameters.optimizer == 'rms':
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
    else:
        # Fail with a clear message instead of a NameError on `optimizer` below.
        raise ValueError("Unsupported optimizer '{}': expected 'ada', 'adam' or 'rms'"
                         .format(parameters.optimizer))

    # UPDATE_OPS holds e.g. the batch-normalisation moving-average updates.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    opt_op = optimizer.minimize(loss_ctc, global_step=global_step)
    # The train op runs the update ops, the optimisation step and the
    # moving-average maintenance of the loss.
    with tf.control_dependencies(update_ops + [opt_op]):
        train_op = tf.group(maintain_averages_op)

Get predictions for words (not totally necessary for training)

In [None]:
    # Summaries
    # ---------
    tf.summary.scalar('learning_rate', learning_rate)
    tf.summary.scalar('losses/ctc_loss', loss_ctc)
else:
    # PREDICT mode: no labels are available, so there is no loss and no train op
    loss_ctc, train_op = None, None

# The list below names EVAL, PREDICT and TRAIN, so decoding is built in each of them.
if mode in [tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT, tf.estimator.ModeKeys.TRAIN]:
    with tf.name_scope('code2str_conversion'):
        # Lookup table from integer codes back to characters; unknown codes become '?'
        keys = tf.cast(parameters.alphabet_decoding_codes, tf.int64)
        values = [c for c in parameters.alphabet_decoding]
        table_int2str = tf.contrib.lookup.HashTable(tf.contrib.lookup.KeyValueTensorInitializer(keys, values), '?')

        # Beam-search decoding of the class scores; the two best paths are kept
        sparse_code_pred, log_probability = tf.nn.ctc_beam_search_decoder(predictions_dict['prob'],
                                                                          sequence_length=tf.cast(seq_len_inputs, tf.int32),
                                                                          merge_repeated=False,
                                                                          beam_width=100,
                                                                          top_paths=2)
        # Score
        # Margin between the log-probabilities of the best and the second-best path
        predictions_dict['score'] = tf.subtract(log_probability[:, 0], log_probability[:, 1])
        # around 10.0 -> seems pretty sure, less than 5.0 bit unsure, some errors/challenging images
        # Only the best path is decoded into text
        sparse_code_pred = sparse_code_pred[0]

        # Decoded characters per sequence: occurrences of each row index of the sparse prediction
        sequence_lengths_pred = tf.bincount(tf.cast(sparse_code_pred.indices[:, 0], tf.int32),
                                            minlength=tf.shape(predictions_dict['prob'])[1])

        # Map codes to characters, then assemble characters into words
        # (get_words_from_chars is defined outside this notebook)
        pred_chars = table_int2str.lookup(sparse_code_pred)
        predictions_dict['words'] = get_words_from_chars(pred_chars.values, sequence_lengths=sequence_lengths_pred)

        # Text summary of the first 10 predicted words
        tf.summary.text('predicted_words', predictions_dict['words'][:10])

In eval mode, compute the evaluation metrics: character error rate (CER) and word accuracy

In [None]:
# Evaluation ops
# --------------
if mode == tf.estimator.ModeKeys.EVAL:
    with tf.name_scope('evaluation'):
        # Character error rate: streaming mean of tf.edit_distance between the
        # predicted and the target code sequences.
        CER = tf.metrics.mean(tf.edit_distance(sparse_code_pred, tf.cast(sparse_code_target, dtype=tf.int64)), name='CER')

        # Convert label codes to decoding alphabet to compare predicted and ground-truth words
        target_chars = table_int2str.lookup(tf.cast(sparse_code_target, tf.int64))
        target_words = get_words_from_chars(target_chars.values, seq_lengths_labels)
        accuracy = tf.metrics.accuracy(target_words, predictions_dict['words'], name='accuracy')

        # Both entries are the (value, update_op) pairs returned by tf.metrics.
        eval_metric_ops = {
                           'eval/accuracy': accuracy,
                           'eval/CER': CER,
                           }
        # No tf.Print ops are attached to the metrics: a Print op only fires when
        # its output tensor is fetched, and nothing after this point consumes one.

else:
    eval_metric_ops = None

Export the model for the estimator above

In [None]:
export_outputs = {'predictions': tf.estimator.export.PredictOutput(predictions_dict)}

return tf.estimator.EstimatorSpec(
    mode=mode,
    predictions=predictions_dict,
    loss=loss_ctc,
    train_op=train_op,
    eval_metric_ops=eval_metric_ops,
    export_outputs=export_outputs,
    scaffold=tf.train.Scaffold()
    # scaffold=tf.train.Scaffold(init_fn=None)  # Specify init_fn to restore from previous model
)