In [1]:
import os                                                                          
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"                                       
os.environ["CUDA_VISIBLE_DEVICES"]="0"  
from tensorflow.contrib.slim.nets import resnet_v2, resnet_utils
import tensorflow as tf
from tensorflow.contrib import layers as layers_lib
from tensorflow.python.ops import variable_scope
from tensorflow.contrib.layers.python.layers import utils
from tensorflow.contrib import slim
from tensorflow.nn import ctc_loss, conv2d
import numpy as np
resnet_v2_block = resnet_v2.resnet_v2_block
resnet_v2 = resnet_v2.resnet_v2
from lev_dist import distance

import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


In [2]:
# Number of synthetic samples requested per training batch (see local_data_generator).
BATCH_SIZE = 32
# Stride factor of the ResNet body; OCRModel.build divides the image width by
# 2*RESNET_STRIDE to obtain the CTC sequence length.
RESNET_STRIDE = 1
# Image height declared in the dataset output shapes / padded shapes.
IMAGE_HEIGHT = 32
# Number of validation samples per padded batch.
VAL_BATCH_SIZE = 64
# Number of labelled samples held out for validation (passed as test_size to the split).
VAL_SIZE = 2048

In [3]:
def resnet_v2_26_base(inputs,
                      num_classes=None,
                      is_training=True,  # True so that batch-norm layers are updated
                      global_pool=False,
                      output_stride=1,  # requested effective stride
                      reuse=None,
                      include_root_block=False,  # root conv + max pool are left out; the caller adds its own root conv
                      scope='resnet_v2_26'):
    """
    Build a small ResNet-v2 body out of four bottleneck blocks of two units each.

    The TensorFlow resnet_v2 implementation only provides bottleneck units
    (three layers each); the original author counts the resulting model as
    26 layers. Every block is declared with stride=2, which the original note
    motivates by the wish for a larger receptive field.

    All arguments are forwarded unchanged to `resnet_v2`; the return value is
    whatever `resnet_v2` returns.
    """
    # (scope name, base depth) of each block; every block has 2 units and stride 2.
    block_specs = (
        ('block1', 64),
        ('block2', 128),
        ('block3', 256),
        ('block4', 512),
    )
    blocks = [
        resnet_v2_block(block_scope, base_depth=depth, num_units=2, stride=2)
        for block_scope, depth in block_specs
    ]
    return resnet_v2(
        inputs,
        blocks,
        num_classes=num_classes,
        is_training=is_training,
        global_pool=global_pool,
        output_stride=output_stride,
        include_root_block=include_root_block,
        reuse=reuse,
        scope=scope)

def make_ocr_net(inputs, num_classes, is_training=True):
    '''
    Build the OCR network graph: a ResNet feature extractor followed by a
    per-column classifier.

    The root convolution uses stride 2; per the original author's note the
    image width is halved and this defines the number of time steps (feature
    sequence length). No activation (no softmax) is applied to the output,
    because ctc_loss() and the beam search are expected to apply it themselves.
    After the ResNet body the features are reshaped to
    [batch, 1, width, channels*height] and a 1x1 convolution is applied, which
    acts like a dense layer at every time step.

    input: batch of images
    output: tensor of size [batch, time_steps, num_classes]
    '''
    with tf.variable_scope('resnet_base', values=[inputs]):
        # NOTE(review): this arg_scope also encloses the resnet_v2_26_base call,
        # so slim.conv2d calls made inside the ResNet body presumably inherit
        # activation_fn=None / normalizer_fn=None unless overridden there -
        # confirm this is intended.
        with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None):
            # Root convolution of the ResNet (the body is built without its own root block).
            features = resnet_utils.conv2d_same(inputs, 64, 7, stride=2, scope='conv1')
            # The max pool that normally follows the root conv is intentionally left out:
            # features = slim.max_pool2d(features, [3, 3], stride=2, scope='pool1')
            # resnet_v2 returns a tuple (last tensor, all tensors); only the first is used.
            features = resnet_v2_26_base(features, output_stride=1, is_training=is_training)[0]

    with tf.variable_scope('class_head', values=[features]):
        # Fold the height axis into the channel axis: go to [batch, c, h, w] first.
        channels_first = tf.transpose(features, [0, 3, 1, 2])
        # Channel and height must be static, since they define the conv input depth.
        _, depth, height, _ = channels_first.get_shape()
        dynamic_shape = tf.shape(channels_first)
        columns = tf.reshape(channels_first,
                             [dynamic_shape[0], depth * height, 1, dynamic_shape[3]])
        # Back to channels-last: [batch, 1, w, features*h].
        columns = tf.transpose(columns, [0, 2, 3, 1])
        # 1x1 conv producing raw logits: [batch, 1, w, num_classes].
        logits = layers_lib.conv2d(columns, num_classes, [1, 1], activation_fn=None)
        # Drop the singleton height axis: [batch, w, num_classes].
        return tf.squeeze(logits, 1)

def ctc_loss_layer(sequence_labels, logits, sequence_length):
    """
    Build the CTC loss used for training and return its mean over the batch.

    sequence_length is a list of sequence lengths, one per batch element
    (len(sequence_length) == batch_size). Per the original note, all sequences
    in a batch have the same length because they come from one image batch
    whose images share a size (e.g. by padding).
    """
    per_example_loss = tf.nn.ctc_loss(
        sequence_labels,
        logits,
        sequence_length,
        ignore_longer_outputs_than_inputs=True,
        time_major=False)  # logits are [batch_size, max_time, num_classes]
    return tf.reduce_mean(per_example_loss)

def get_training(sequence_labels, net_logits, sequence_length, 
                   learning_rate=1e-4, decay_steps=2**16, decay_rate=0.9, decay_staircase=False, 
                   momentum=0.9):
    """
    Set up the training ops: CTC loss, Adam optimizer and the train op.
    Adapted from
    https://github.com/weinman/cnn_lstm_ctc_ocr/blob/master/src/model_fn.py

    sequence_labels, net_logits, sequence_length: forwarded to ctc_loss_layer.
    learning_rate: used as-is (constant) for the Adam optimizer.
    decay_steps, decay_rate, decay_staircase: currently unused, because the
        exponential-decay schedule below is commented out.
    momentum: passed to Adam as beta1.

    Returns (train_op, loss, learning_rate_tensor); learning_rate_tensor is
    simply the learning_rate argument while the decay schedule is disabled.
    """
    with tf.name_scope( "train" ):
        # Snapshot of every trainable variable that exists in the graph at this point.
        net_logits_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)        
        loss = ctc_loss_layer(sequence_labels, net_logits, sequence_length) 
        # Update batch norm stats [http://stackoverflow.com/questions/43234667]
        extra_update_ops = tf.get_collection( tf.GraphKeys.UPDATE_OPS )
        with tf.control_dependencies( extra_update_ops ):
            # Calculate the learning rate given the parameters
            # (schedule disabled: the constant learning_rate is used instead)
#             learning_rate_tensor = tf.train.exponential_decay(
#                 learning_rate,
#                 tf.train.get_global_step(),
#                 decay_steps,
#                 decay_rate,
#                 staircase=decay_staircase,
#                 name='learning_rate' )
            learning_rate_tensor = learning_rate
            optimizer = tf.train.AdamOptimizer(
                learning_rate=learning_rate_tensor,
                beta1=momentum )
            # NOTE(review): an already-constructed optimizer is passed together with
            # learning_rate; confirm against the optimize_loss docs which of the two
            # actually determines the step size.
            train_op = tf.contrib.layers.optimize_loss(
                loss=loss,
                global_step=tf.train.get_global_step(),
                learning_rate=learning_rate_tensor, 
                optimizer=optimizer,
                variables=net_logits_vars)
    return train_op, loss, learning_rate_tensor

def get_prediction(output_net, seq_len, merge_repeated=False):
    '''
    Decode the network output with CTC beam search.

    input: output_net - logits of the net (no softmax applied), batch-major
           seq_len - length of the sequence to decode for each batch element
           merge_repeated - forwarded to the beam search decoder
    output: (decoded, prob) exactly as returned by tf.nn.ctc_beam_search_decoder
    '''
    # The decoder expects time-major input: [time, batch, logits].
    time_major_logits = tf.transpose(output_net, [1, 0, 2])
    decoded_paths, path_scores = tf.nn.ctc_beam_search_decoder(
        time_major_logits, seq_len, merge_repeated=merge_repeated)
    return decoded_paths, path_scores

def get_min_max_dist_image_label_pred(input_image_batch, sequence_labels, distances, prediction, table):
    """
    Create image/text summaries for the batch elements with the largest and the
    smallest value in `distances`, and return them merged into one summary op.

    input_image_batch: batch of input images
    sequence_labels: sparse ground-truth labels
    distances: one distance value per batch element
    prediction: decoder output; prediction[0][0] is used as the sparse top path
    table: index-to-string lookup table; it is queried with (label - 1)
    """
    def _densify(sparse):
        # Dense view of a sparse tensor (missing entries get the default fill value).
        return tf.sparse_to_dense(sparse.indices, sparse.dense_shape, sparse.values)

    def _row_to_string(dense_rows, row_index):
        # Pick one row, shift labels back by one for the table, join into one string.
        row = tf.gather(dense_rows, row_index)
        return tf.reduce_join(table.lookup(tf.cast(row, tf.int64) - 1))

    labels_dense = _densify(sequence_labels)

    worst_index = tf.argmax(distances)
    best_index = tf.argmin(distances)

    # Summaries expect a batch axis, so re-add it to the single selected image.
    worst_image = tf.expand_dims(tf.gather(input_image_batch, worst_index), 0)
    best_image = tf.expand_dims(tf.gather(input_image_batch, best_index), 0)

    worst_label_text = _row_to_string(labels_dense, worst_index)
    best_label_text = _row_to_string(labels_dense, best_index)

    predictions_dense = _densify(prediction[0][0])
    worst_prediction_text = _row_to_string(predictions_dense, worst_index)
    best_prediction_text = _row_to_string(predictions_dense, best_index)

    with tf.name_scope('prediction_image'):
        tf.summary.image('Max distance image', worst_image)
        tf.summary.image('Min distance image', best_image)
        text_summaries = (
            ('Max distance gt label', worst_label_text),
            ('Min distance gt label', best_label_text),
            ('Max distance pred label', worst_prediction_text),
            ('Min distance pred label', best_prediction_text),
        )
        for tag, text in text_summaries:
            tf.summary.text(tag, text)
    return tf.summary.merge_all(scope='prediction_image')

In [4]:
class OCRModel(object):
    """
    Bundles the OCR graph: network, CTC training ops, beam-search prediction,
    edit-distance metric and the summary ops.
    """

    def __init__(self, charset, input_image_batch, sequence_labels, is_training=True, learning_rate=1e-4,
                 decay_steps=2**16, decay_rate=0.9, decay_staircase=False, momentum=0.9):
        """
        Store the hyper-parameters, create the index-to-character lookup table
        and build the graph on top of `input_image_batch` / `sequence_labels`.
        """
        self.charset = charset
        # Labels are 1-based and one extra class is reserved for the empty char,
        # hence +2; output 0 is a known leftover that is never used as a label.
        self.num_classes = len(charset) + 2
        self.is_training = is_training
        self.learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.decay_staircase = decay_staircase
        self.momentum = momentum
        charset_tensor = tf.constant(list(charset))
        # Indices outside the charset are rendered as the literal string "blank".
        self.table = tf.contrib.lookup.index_to_string_table_from_tensor(
            charset_tensor, default_value="blank")
        self.build(input_image_batch, sequence_labels)

    def build(self, input_image_batch, sequence_labels):
        """
        Create the network, training ops, predictions, metrics and summaries,
        and expose them as attributes of the model.
        """
        self.input_image_batch = input_image_batch
        self.sequence_labels = sequence_labels

        # Every batch element gets the same sequence length: image width divided
        # by the known effective stride (2 * RESNET_STRIDE).
        batch_size = tf.shape(self.input_image_batch)[0]
        time_steps = tf.shape(self.input_image_batch)[2] // (2 * RESNET_STRIDE)
        self.feature_seq_length = tf.fill([batch_size], time_steps)

        logits = make_ocr_net(self.input_image_batch, self.num_classes, is_training=self.is_training)
        self.net = logits

        training_ops = get_training(
            self.sequence_labels, logits, self.feature_seq_length,
            self.learning_rate, self.decay_steps,
            self.decay_rate, self.decay_staircase, self.momentum)
        self.train_op, self.loss, self.learning_rate_tensor = training_ops

        # tuple(decoded, prob); decoded is a list of top paths, only the first is used.
        self.prediction = get_prediction(logits, self.feature_seq_length, merge_repeated=False)
        top_path = self.prediction[0][0]

        per_sample_distance = tf.edit_distance(tf.cast(top_path, tf.int32), self.sequence_labels)
        self.lev_dist = tf.reduce_mean(per_sample_distance)

        dense_prediction = tf.sparse_to_dense(top_path.indices, top_path.dense_shape, top_path.values)
        # Shift labels back by one before the lookup, then join characters per row.
        self.prediction_string = tf.reduce_join(
            self.table.lookup(tf.cast(dense_prediction, tf.int64) - 1), axis=1)

        with tf.name_scope('prediction_metrics'):
            scalar_summaries = (
                ('CTC loss', self.loss),
                ('Levenshtein distance', self.lev_dist),
                ('Learning rate', self.learning_rate_tensor),
            )
            for tag, value in scalar_summaries:
                tf.summary.scalar(tag, value)
        self.merged_summary_metrics = tf.summary.merge_all(scope='prediction_metrics')
        self.merged_summary_image = get_min_max_dist_image_label_pred(
            input_image_batch, sequence_labels, per_sample_distance, self.prediction, self.table)
 

# Data generators 

## train (synthetic) data generator

In [5]:
import sys
sys.path.append('../../data_generator/')
from data_generator import data_generator
# Synthetic text-image generator; helper() below pulls training samples from it
# via dg.get_image_and_label().
# NOTE(review): font_size_bound is given as (50, 25) - confirm which order
# (min, max) or (max, min) data_generator expects.
# NOTE(review): the keyword is spelled `max_string_lenght` here; presumably that
# matches the data_generator signature - verify before "fixing" the spelling.
dg = data_generator(backgrounds_path='../../data_generator/backgrounds/', 
                                   fonts_path='../../data_generator/valid_fonts/',
                                   valid_charset_path='../../data_generator/valid_charset.txt', 
                                    background_type = ['const','real'],
                   font_size_bound=(50, 25), max_string_lenght=25)

2018-11-28 10:34:26,473:::data_generator:::1085 background files to produce data
2018-11-28 10:34:26,475:::data_generator:::Caching background images...
2018-11-28 10:35:29,271:::data_generator:::344 fonts to produce data


In [6]:
def helper(*arg):
    '''
    Produce one normalized (image, label) training sample from the global
    generator `dg`.

    Per the original note, the generator object cannot be pickled, so the pool
    workers are handed this module-level function instead; the positional
    arguments supplied by pool.map are ignored.

    NOTE(review): if `dg` relies on a global random state, forked workers may
    start from identical states - confirm samples differ across workers.
    '''
    raw_image, text = dg.get_image_and_label()
    # Scale by 1/255, then shift by -0.5 to center the values.
    centered_image = raw_image / 255 - 0.5
    return centered_image, text

In [7]:
import multiprocessing
# Worker pool that produces synthetic samples in parallel.
pool = multiprocessing.Pool(processes=32)
def local_data_generator():
    """
    Endless generator of training batches.

    Each iteration asks the pool for BATCH_SIZE samples from helper() and
    yields them as a pair (tuple of images, tuple of labels).
    """
    while True:
        samples = pool.map(helper, range(BATCH_SIZE))
        # Transpose the list of (image, label) pairs into two parallel tuples.
        images, strings = zip(*samples)
        yield images, strings



In [8]:
# Read the charset file; every character of its content becomes one class.
with open('../../data_generator/valid_charset.txt', 'r', encoding = 'utf-8') as f:
    valid_charset = f.read()
all_chars = list(valid_charset)
# 0-based lookup tables between characters and their position in the charset.
char_to_indx = {char: indx for indx, char in enumerate(all_chars)}
indx_to_char = dict(enumerate(all_chars))

num_classes = len(all_chars)


In [9]:
import json
from skimage import io
from skimage.transform import rescale
from skimage.color import rgb2gray
import warnings
warnings.filterwarnings('ignore')
def resize_height(image, max_dim):
    """
    Rescale a 2-D image so that its height becomes `max_dim`, using the same
    scale factor for both axes (bicubic interpolation, order=3).
    """
    # Unpacking two values also rejects anything that is not 2-D.
    height, _ = image.shape
    return rescale(image.copy(), scale = max_dim/height, order = 3)

def string_to_label(string):
    """Encode a string as an array of 1-based charset indices (via char_to_indx)."""
    label = []
    for char in string:
        # Indices start from 1.
        label.append(char_to_indx[char] + 1)
    return np.array(label)

def label_to_string(label):
    """Decode a sequence of 1-based charset indices back into a string (via indx_to_char)."""
    # Indices start from 1, so shift back before the lookup.
    return ''.join(indx_to_char[indx - 1] for indx in label)

In [10]:
import json
from sklearn.model_selection import train_test_split

# Labelled real-world strings; each entry is read below through its 'path' and
# 'context' keys. The encoding is given explicitly (as for the charset file) so
# that non-ASCII labels do not depend on the platform's default locale.
with open('../datasets/labeled_strings.json', 'r', encoding='utf-8') as f:
    string_data = json.load(f)

def my_stratify_value(data, num_of_bins):
    '''
    Assign every example to a string-length bin, for use as a stratification key.

    :param data: sequence of examples; the text of each one is read from its 'context' key
    :param num_of_bins: number of equal-width bins spanning [min length, max length + 1]
    :return: array with the bin index of every example
    '''
    str_lens = [len(example['context']) for example in data]
    # num_of_bins bins need num_of_bins + 1 edges.
    bin_edges = np.linspace(min(str_lens), max(str_lens)+1, num_of_bins+1)
    return np.digitize(str_lens, bin_edges)

def stratify_split_train_test(data, portion, func):
    '''
    Stratified train/test split that lowers the number of bins until the split works.

    Starts with 100 bins and retries with one bin fewer every time
    train_test_split raises ValueError. If the split still fails with a single
    bin, the error cannot be caused by the binning (e.g. an invalid `portion`),
    so the original ValueError is re-raised instead of retrying with a
    meaningless bin count.

    :param data: dataset to split
    :param portion: forwarded to train_test_split as test_size
    :param func: function which return value (based on data example) to be used in stratification of dataset split;
                 called as func(data, num_of_bins)
    :return: the result of train_test_split
    :raises ValueError: if the split fails even with one bin
    '''
    num_of_bins = 100
    while True:
        stratify_values = func(data, num_of_bins)
        try:
            return train_test_split(data, test_size=portion, stratify=stratify_values)
        except ValueError:
            if num_of_bins <= 1:
                # Nothing left to lower; surface the real cause to the caller.
                raise
            print('number of bins {} is too hight for stratification. Lowering to {}'.format(num_of_bins, num_of_bins-1))
            num_of_bins -= 1

# Earlier alternative (plain random sample instead of a stratified split):
# string_data_tupels = [('../datasets/'+string_data[i]['path'], string_data[i]['context'])
#                       for i in np.random.choice(range(len(string_data)), size=1024, replace=False)]

# Keep only the held-out part (index 1) of the stratified split; VAL_SIZE is
# passed as the test size.
string_data_tupels = stratify_split_train_test(string_data, VAL_SIZE, func = my_stratify_value)[1]
# Turn every example into an (image path, text) pair.
string_data_tupels = [
    ('../datasets/'+example['path'], example['context'])
    for example in string_data_tupels
]
def map_func(image_path, string):
    """
    Load and preprocess one validation example (runs inside tf.py_func).

    :param image_path: UTF-8 encoded bytes with the path of the image file
    :param string: UTF-8 encoded bytes with the ground-truth text
    :return: (image, label) - float32 image with a trailing channel axis,
             grayscale, resized to IMAGE_HEIGHT rows and shifted by -0.5;
             int32 array of 1-based character indices
    """
    image_path = image_path.decode('utf-8')
    string = string.decode('utf-8')
    image = io.imread(image_path)
    image = rgb2gray(image)
    # Use the shared constant rather than a literal, so the produced height
    # always matches the IMAGE_HEIGHT declared in the dataset shapes.
    image = resize_height(image, IMAGE_HEIGHT)
    image = image - 0.5
    label = string_to_label(string)
    return np.expand_dims(image,-1).astype(np.float32), label.astype(np.int32)
def tf_py_map_func_wrapper(args):
    """
    Dataset.map adapter: run map_func through tf.py_func on one dataset element.

    `args` is indexed as args[0] (image path) and args[1] (text); the op yields
    a float32 and an int32 output.
    """
    image_path, text = args[0], args[1]
    output_types = (tf.float32, tf.int32)
    return tf.py_func(func=map_func, inp=(image_path, text), Tout=output_types)
    

number of bins 100 is too hight for stratification. Lowering to 99
number of bins 99 is too hight for stratification. Lowering to 98
number of bins 98 is too hight for stratification. Lowering to 97
number of bins 97 is too hight for stratification. Lowering to 96
number of bins 96 is too hight for stratification. Lowering to 95
number of bins 95 is too hight for stratification. Lowering to 94
number of bins 94 is too hight for stratification. Lowering to 93
number of bins 93 is too hight for stratification. Lowering to 92
number of bins 92 is too hight for stratification. Lowering to 91
number of bins 91 is too hight for stratification. Lowering to 90
number of bins 90 is too hight for stratification. Lowering to 89
number of bins 89 is too hight for stratification. Lowering to 88
number of bins 88 is too hight for stratification. Lowering to 87
number of bins 87 is too hight for stratification. Lowering to 86
number of bins 86 is too hight for stratification. Lowering to 85
number of

In [11]:
# Build the whole training/validation graph: input pipelines, a feedable
# iterator that switches between them, the model, and the bookkeeping ops.
graph = tf.Graph()
with graph.as_default():
    # from_generator is a static factory, so it is called on the class itself;
    # instantiating the abstract Dataset class first is unnecessary and fails on
    # TensorFlow versions where Dataset is a real abstract base class.
    training_dataset = tf.data.Dataset.from_generator(local_data_generator, output_types= (tf.float32, tf.int32),
                                              output_shapes = (tf.TensorShape([None, IMAGE_HEIGHT, None,1]), 
                                                                          (tf.TensorShape([None, None]))))
    # Each generator element is already a whole batch, so this buffers up to
    # BATCH_SIZE batches.
    training_dataset = training_dataset.prefetch(BATCH_SIZE)
    
    # Validation examples are fed at iterator-initialisation time as (path, text) string pairs.
    string_data_tupels_ph = tf.placeholder(tf.string, shape=[None, None])
    validation_dataset = tf.data.Dataset.from_tensor_slices(string_data_tupels_ph).repeat(-1)

    validation_dataset = validation_dataset.map(tf_py_map_func_wrapper, num_parallel_calls=VAL_BATCH_SIZE).prefetch(VAL_BATCH_SIZE)
    # Images are padded along the width and labels along their length.
    validation_dataset = validation_dataset.padded_batch(VAL_BATCH_SIZE, padded_shapes=([IMAGE_HEIGHT, None,1],[None]))
    validation_dataset = validation_dataset.prefetch(VAL_BATCH_SIZE)
    
    # Feedable iterator: the string handle fed at run time selects which
    # dataset (training or validation) the model reads from.
    handle = tf.placeholder(tf.string, shape=[], name='iterator_handler')
    iterator = tf.data.Iterator.from_string_handle(handle, training_dataset.output_types, training_dataset.output_shapes)
    features, labels = iterator.get_next()
    labels_sparce = tf.contrib.layers.dense_to_sparse(labels)
    
    training_iterator = training_dataset.make_initializable_iterator()    
    validation_iterator = validation_dataset.make_initializable_iterator()
    
    tf.train.create_global_step()

    # Integer switch (default 1 = training); feed 0 to run the model in inference mode.
    is_training = tf.placeholder_with_default(1, shape=[])
    model = OCRModel(charset = all_chars, input_image_batch=features, 
                     sequence_labels=labels_sparce, is_training=tf.cast(is_training, tf.bool), learning_rate=1e-5,) #lr lowered
    init = tf.global_variables_initializer()
    table_init = tf.tables_initializer()
    saver = tf.train.Saver()
    # One histogram summary per trainable variable.
    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    hists = [tf.summary.histogram(name=v.name, values=v.value()) for v in variables]
    merged_summary_hists = tf.summary.merge(hists)

INFO:tensorflow:Summary name CTC loss is illegal; using CTC_loss instead.
INFO:tensorflow:Summary name Levenshtein distance is illegal; using Levenshtein_distance instead.
INFO:tensorflow:Summary name Learning rate is illegal; using Learning_rate instead.
INFO:tensorflow:Summary name Max distance image is illegal; using Max_distance_image instead.
INFO:tensorflow:Summary name Min distance image is illegal; using Min_distance_image instead.
INFO:tensorflow:Summary name Max distance gt label is illegal; using Max_distance_gt_label instead.
INFO:tensorflow:Summary name Min distance gt label is illegal; using Min_distance_gt_label instead.
INFO:tensorflow:Summary name Max distance pred label is illegal; using Max_distance_pred_label instead.
INFO:tensorflow:Summary name Min distance pred label is illegal; using Min_distance_pred_label instead.
INFO:tensorflow:Summary name resnet_base/conv1/weights:0 is illegal; using resnet_base/conv1/weights_0 instead.
INFO:tensorflow:Summary name resnet_

INFO:tensorflow:Summary name resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv2/weights:0 is illegal; using resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv2/weights_0 instead.
INFO:tensorflow:Summary name resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv2/biases:0 is illegal; using resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv2/biases_0 instead.
INFO:tensorflow:Summary name resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv3/weights:0 is illegal; using resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv3/weights_0 instead.
INFO:tensorflow:Summary name resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv3/biases:0 is illegal; using resnet_base/resnet_v2_26/block3/unit_1/bottleneck_v2/conv3/biases_0 instead.
INFO:tensorflow:Summary name resnet_base/resnet_v2_26/block3/unit_2/bottleneck_v2/preact/beta:0 is illegal; using resnet_base/resnet_v2_26/block3/unit_2/bottleneck_v2/preact/beta_0 instead.
INFO:tensorflow:Summary name resnet_ba

In [None]:
# from tensorflow.python.client import timeline

# Training loop: trains on synthetic data, logs summaries every 1000 steps,
# evaluates on the held-out real data and checkpoints whenever the mean
# Levenshtein distance improves.
LOG_DIR = 'log'
with tf.Session(graph=graph) as sess:

    train_writer = tf.summary.FileWriter(os.path.join(LOG_DIR,'train'), sess.graph)
    test_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'test'))

    sess.run([init, training_iterator.initializer, validation_iterator.initializer, table_init],
             {string_data_tupels_ph: string_data_tupels})

    training_handle = sess.run(training_iterator.string_handle())
    validation_handle = sess.run(validation_iterator.string_handle())

    # Best validation distance seen so far in this run.
    pred_lev_dist = np.inf
    try:
        saver.restore(sess, os.path.join(LOG_DIR,"model_lev_dist_0.33237187478661784.ckpt"))
    except Exception as restore_error:
        # Best effort: continue from the freshly initialised weights, but say
        # why the restore failed. KeyboardInterrupt/SystemExit are not swallowed.
        print('cant restore')
        print(repr(restore_error)[:500])

    try:
        for num in range(int(1e10)):

            if num % 1000 != 0:
                sess.run([model.train_op], {handle: training_handle})
                continue

            # Logging step: a single training step that also fetches the train
            # summaries (instead of one plain step plus a second one).
            _, ms = sess.run([model.train_op, model.merged_summary_metrics],
                             {handle: training_handle})
            train_writer.add_summary(ms, num)
            train_writer.flush()

            # Validation summaries are computed in inference mode, consistent
            # with the validation metric loop below.
            ms, ms_img = sess.run([model.merged_summary_metrics, model.merged_summary_image],
                                  {handle: validation_handle, is_training: 0})
            test_writer.add_summary(ms, num)
            test_writer.add_summary(ms_img, num)
            test_writer.flush()

            lev_dist = []
            for _ in range(VAL_SIZE // VAL_BATCH_SIZE):
                pred_strings, gt_labels = sess.run([model.prediction_string, labels],
                                                   {handle: validation_handle, is_training: 0})
                # Drop the table's "blank" placeholder and upper-case the prediction.
                pred_strings = [pr.decode('utf-8').replace('blank','').upper() for pr in pred_strings]
                # Zero entries are batch padding, not labels.
                gt_string = [label_to_string(gl[gl!=0]) for gl in list(gt_labels)]
                lev_dist.append(np.mean([distance(ps, gs) for ps, gs in zip(pred_strings, gt_string)]))
            lev_dist = np.mean(lev_dist)
            print(lev_dist)
            if lev_dist < pred_lev_dist:
                pred_lev_dist = lev_dist
                saver.save(sess, os.path.join(LOG_DIR, "model_lev_dist_{}.ckpt".format(lev_dist)))
    finally:
        # The loop normally ends by interruption; make sure event files are closed.
        train_writer.close()
        test_writer.close()
            

INFO:tensorflow:Restoring parameters from log/model_lev_dist_0.33237187478661784.ckpt
0.3345634054894761
0.7771510369765138
0.48736453211373265
