In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2"
import tensorflow as tf
from tensorflow.contrib import layers as layers_lib
from tensorflow.contrib.slim.nets import resnet_v2, resnet_utils
from tensorflow.contrib.slim import batch_norm
resnet_v2_block = resnet_v2.resnet_v2_block
resnet_v2 = resnet_v2.resnet_v2

from tensorflow.contrib import slim
# Session on the default graph. Soft placement lets TensorFlow fall back to
# another device when an op cannot run on the one requested for it.
# (A previous variant capped GPU memory with
#  tf.GPUOptions(per_process_gpu_memory_fraction=0.05, allow_growth=True)
#  and log_device_placement=False; it is currently disabled.)
# NOTE: the training cell further down opens its own session on its own graph
# and rebinds the name `sess`.
config = tf.ConfigProto()
config.allow_soft_placement = True
sess = tf.Session(config=config)

import glob
import json
import cv2
import numpy as np
import json

import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters


In [2]:
# Source:
# https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow
def get_available_gpus():
    """
        Collect the device names of every local device whose type is 'GPU'.

        Returns:
            list of device-name strings, in the order TensorFlow reports them.
    """
    from tensorflow.python.client import device_lib

    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

In [3]:
import multiprocessing

# Number of logical CPUs; used as `num_parallel_calls` for the dataset map.
CPUS = multiprocessing.cpu_count()

# Side length (pixels) of the square canvas the strokes are rasterised onto.
BASE_SIZE = 256

# Batch sizes of the training / validation datasets. Every tower pulls its own
# batch, so one session step consumes len(GPUS) batches.
BATCH_SIZE = 256
VAL_BATCH_SIZE = 1024

# Validation budget: the training loop runs
# VAL_STEPS / (VAL_BATCH_SIZE * len(GPUS)) validation iterations per round.
VAL_STEPS = 1 << 16

# Device names of the GPUs visible to this process (one tower per entry).
GPUS = get_available_gpus()

In [4]:
def resnet_v2_26_base(inputs,
                 num_classes=None,
                 is_training=True, # True - so that batchnorm layers get updated
                 global_pool=True,
                 output_stride=16, # effective stride
                 reuse=None,
                 include_root_block=False, # root conv + max-pool are built by the caller instead
                 scope='resnet_v2_26'):
    """
    Build a slim `resnet_v2` from four bottleneck blocks.

    Slim's resnet_v2 only offers bottleneck units (three conv layers each).
    Blocks 1-3 use stride 2 to grow the receptive field; block 4 uses stride 1.

    NOTE(review): the unit counts (3, 4, 6, 3) are the same as slim's
    resnet_v2_50 layout, so the "26" in the name may be inaccurate - confirm.

    All arguments are forwarded unchanged to `resnet_v2`; the return value is
    whatever `resnet_v2` returns (the caller in this file takes element [0]).
    """
    # (scope name, base_depth, num_units, stride)
    block_specs = (
        ('block1', 64, 3, 2),
        ('block2', 128, 4, 2),
        ('block3', 256, 6, 2),
        ('block4', 512, 3, 1),
    )
    blocks = [
        resnet_v2_block(block_name, base_depth=depth, num_units=units, stride=stride)
        for block_name, depth, units, stride in block_specs
    ]
    return resnet_v2(
        inputs,
        blocks,
        num_classes,
        is_training,
        global_pool,
        output_stride,
        include_root_block,
        reuse=reuse,
        scope=scope)

def make_resnet(inputs, num_classes, is_training=True):
    '''
    Creates the classification network graph.

    A root 7x7 stride-2 convolution and a 3x3 stride-2 max-pool are applied to
    the image batch, followed by `resnet_v2_26_base` (output_stride=8). The
    backbone output is max-reduced over its two spatial axes, concatenated
    with the auxiliary features, and passed through
    dense(256, relu) -> batch_norm -> dense(num_classes, sigmoid).

    inputs: tuple (image_batch, countrycode, image_ratio, min_veloc, max_veloc);
            the four auxiliary tensors are concatenated along axis 1, so they
            must be rank-2 with the same batch dimension as the images.
    num_classes: width of the output layer.
    is_training: forwarded to the backbone and to batch_norm.
    output: tensor of size [batch, num_classes]

    NOTE(review): the last layer applies a sigmoid, yet callers in this file
    pass the result as `logits` to sparse_softmax_cross_entropy_with_logits.
    That bounds every "logit" to (0, 1). Removing the sigmoid changes what
    existing checkpoints compute, so it is left as is - confirm intent.
    '''
    image_batch, country_onehot, ratio, veloc_min, veloc_max = inputs

    with tf.variable_scope('resnet_base', values=[image_batch]):
        # NOTE(review): this arg_scope strips activation and normalisation from
        # every slim.conv2d created inside it; presumably that also reaches the
        # convolutions built inside resnet_v2 - verify.
        with slim.arg_scope([slim.conv2d], activation_fn=None, normalizer_fn=None):
            # root conv for resnet
            stem = resnet_utils.conv2d_same(image_batch, 64, 7, stride=2, scope='conv1')
            # max-pool kept to enlarge the receptive field
            stem = slim.max_pool2d(stem, [3, 3], stride=2, scope='pool1')
            # resnet_v2 returns a tuple; element 0 is the final tensor
            backbone_out = resnet_v2_26_base(stem, output_stride=8, is_training=is_training)[0]

    with tf.variable_scope('class_head', values=[backbone_out]):
        pooled = tf.reduce_max(backbone_out, axis=[1, 2])
        head_input = tf.concat(
            (pooled, country_onehot, ratio, veloc_min, veloc_max),
            axis=1,
            name='concat_aulux')
        hidden = tf.layers.dense(head_input, 256, activation=tf.nn.relu)
        hidden = batch_norm(hidden, is_training=is_training, activation_fn=None)
        return tf.layers.dense(hidden, num_classes, activation=tf.nn.sigmoid)

def get_training(net_logits, target_values, 
                   learning_rate=1e-4, decay_steps=2**16, decay_rate=0.9, decay_staircase=False, 
                   momentum=0.9):
    """
    Set up the training ops for a single-device model.
    Adapted from
    https://github.com/weinman/cnn_lstm_ctc_ocr/blob/master/src/model_fn.py

    Args:
        net_logits: network output, used as `logits` of the sparse softmax
            cross-entropy.
        target_values: integer class labels.
        learning_rate: constant learning rate handed to Adam.
        decay_steps, decay_rate, decay_staircase: accepted but currently
            unused; the exponential-decay schedule they configured is disabled.
        momentum: used as Adam's `beta1`.

    Returns:
        (train_op, mean loss tensor, learning rate) - the learning rate is the
        plain value that was passed in.
    """
    with tf.name_scope("train"):
        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=target_values, logits=net_logits)
        loss_mean = tf.reduce_mean(per_example_loss, name='cross_entropy')

        # Update batch norm stats [http://stackoverflow.com/questions/43234667]
        batch_norm_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(batch_norm_updates):
            # Constant rate; no decay schedule is applied.
            learning_rate_tensor = learning_rate
            adam = tf.train.AdamOptimizer(learning_rate=learning_rate_tensor, beta1=momentum)
            train_op = tf.contrib.layers.optimize_loss(
                loss=loss_mean,
                global_step=tf.train.get_global_step(),
                learning_rate=learning_rate_tensor,
                optimizer=adam,
                variables=trainable_vars)

    return train_op, loss_mean, learning_rate_tensor

# Op types that are placed on the parameter ("ps") device.
PS_OPS = [
    'Variable',
    'VariableV2',
    'AutoReloadVariable',
    'MutableHashTable',
    'MutableHashTableOfTensors',
    'MutableDenseHashTable',
]

# see https://github.com/tensorflow/tensorflow/issues/9517
def assign_to_device(device, ps_device):
    """Build a device function that routes variable-like ops to `ps_device`.

    Args:
        device: Device returned for every op whose type is not in PS_OPS.
        ps_device: Device returned for ops whose type is in PS_OPS.
            Example values are /GPU:0 and /CPU:0.

    Returns:
        A callable taking an op (or a NodeDef) and returning one of the two
        devices above, suitable for `tf.device(...)`.

    Which device is best for the shared variables depends on the platform and
    the model; start with CPU:0 and then try GPU:0 to see if it improves.
    """
    def _assign(op):
        # Accept either a raw NodeDef or an Operation carrying one.
        node_def = op.node_def if not isinstance(op, tf.NodeDef) else op
        return ps_device if node_def.op in PS_OPS else device

    return _assign

# Source:
# https://github.com/tensorflow/models/blob/master/tutorials/image/cifar10/cifar10_multi_gpu_train.py#L101
def average_gradients(tower_grads):
    """Calculate the average gradient for each shared variable across all towers.

    Note that this function provides a synchronization point across all towers.

    Args:
        tower_grads: List of lists of (gradient, variable) tuples. The outer
            list ranges over the devices, the inner list over the variables
            (in the same order for every tower).

    Returns:
        List of (gradient, variable) pairs where the gradient has been averaged
        across all towers. A variable for which no tower produced a gradient
        is returned as (None, variable), mirroring what `compute_gradients`
        itself reports for it.
    """
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Each grad_and_vars looks like:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        # The variables are shared across towers, so the first tower's
        # reference stands for all of them.
        shared_var = grad_and_vars[0][1]

        # compute_gradients yields None for variables the loss does not depend
        # on; tf.reduce_mean cannot consume None, so leave those out.
        grads = [g for g, _ in grad_and_vars if g is not None]
        if not grads:
            average_grads.append((None, shared_var))
            continue

        average_grads.append((tf.reduce_mean(grads, 0), shared_var))
    return average_grads

In [5]:
class MyModel(object):
    """Graph builder for the ResNet classifier (single-device or multi-tower).

    After `build_for_train` or `build_in_parallel_fashion` the instance exposes
    `train_op`, `loss`, `top_3_metric`, `learning_rate_tensor` and
    `merged_summary_metrics`.
    """

    def __init__(self, num_classes, is_training, input_shape=(None, None, None, 6), learning_rate=1e-4,
                decay_steps=2**16, decay_rate=0.9, decay_staircase=False, momentum=0.9):
        """Store the hyper-parameters; no network ops are created here.

        Args:
            num_classes: width of the network output.
            is_training: value or tensor; it is cast to tf.bool.
            input_shape: static shape set on the image batch
                (4 entries, 6 channels by default).
            learning_rate: constant learning rate for Adam.
            decay_steps, decay_rate, decay_staircase: stored, and forwarded to
                `get_training` by `build_for_train`; the decay schedule itself
                is disabled in this file.
            momentum: used as Adam's `beta1`.
        """
        self.num_classes = num_classes
        self.is_training = tf.cast(is_training, tf.bool)
        self.learning_rate = learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate 
        self.decay_staircase = decay_staircase
        self.momentum = momentum
        # Copy so that instances never share (or mutate) the default value.
        self.input_shape = list(input_shape)
        
    def build_for_train(self, input_image_batch, target_values,
                        countrycode=None, image_ratio=None, min_veloc=None, max_veloc=None):
        """Build the single-device training graph.

        Args:
            input_image_batch: image tensor; its static shape is set to
                `self.input_shape`.
            target_values: integer class labels.
            countrycode, image_ratio, min_veloc, max_veloc: the auxiliary
                tensors `make_resnet` concatenates to the image features.
                They are keyword-optional only for signature compatibility;
                all four must be supplied.

        Raises:
            TypeError: if any auxiliary tensor is missing. `make_resnet`
                unpacks five inputs, so a bare image batch cannot be used.
        """
        auxiliary = (('countrycode', countrycode), ('image_ratio', image_ratio),
                     ('min_veloc', min_veloc), ('max_veloc', max_veloc))
        missing = [arg_name for arg_name, value in auxiliary if value is None]
        if missing:
            raise TypeError(
                'build_for_train() needs the auxiliary inputs consumed by make_resnet(); '
                'missing: {}'.format(', '.join(missing)))

        input_image_batch.set_shape(self.input_shape)
        net = make_resnet((input_image_batch, countrycode, image_ratio, min_veloc, max_veloc),
                          self.num_classes, is_training=self.is_training)
        self.train_op, self.loss, self.learning_rate_tensor = get_training(net, target_values,
                                                self.learning_rate, self.decay_steps, 
                                                self.decay_rate, self.decay_staircase, self.momentum)
        top_3_metric = tf.cast(tf.nn.in_top_k(targets=target_values, predictions=net, k=3),tf.float32)
        self.top_3_metric = tf.reduce_mean(top_3_metric)
        with tf.name_scope('prediction_metrics'):
            tf.summary.scalar('sparse_softmax_cross_entropy_with_logits', self.loss)
            tf.summary.scalar('Top_3_metric', self.top_3_metric)
            tf.summary.scalar('Learning_rate', self.learning_rate_tensor)
        self.merged_summary_metrics = tf.summary.merge_all(scope='prediction_metrics')

        
    def build_for_prediction(self, input_image_batch):
        """Build a prediction graph (string decoding of the network output).

        NOTE(review): this method calls `make_ocr_net`, `get_prediction` and
        reads `self.table`, none of which are defined in the visible part of
        this notebook - it looks like a leftover from an OCR model and will
        fail with NameError/AttributeError unless those are provided
        elsewhere. Confirm before relying on it.
        """
        self.input_image_batch = input_image_batch
        net = make_ocr_net(self.input_image_batch, self.num_classes, is_training=self.is_training)
        self.net = net
        self.prediction = get_prediction(net, [tf.shape(self.net)[1]], merge_repeated=False) # tuple(decoded, prob). decoded - list of top paths. I use top1
        pred_dense = tf.sparse_to_dense(self.prediction[0][0].indices, self.prediction[0][0].dense_shape, 
                                               self.prediction[0][0].values)
        self.pred_dense = pred_dense
        self.prediction_string = tf.reduce_join(self.table.lookup(tf.cast(pred_dense, tf.int64)-1))
        
    def build_in_parallel_fashion(self, input_fn, devices, controller):
        '''
        Build one tower per device and a shared update on the controller.
        http://blog.s-schoener.com/2017-12-15-parallel-tensorflow-intro/

        input_fn: callable returning
            (images, labels, countrycode, image_ratio, min_veloc, max_veloc);
            it is called once per tower, so each tower gets its own batch.
        devices: device names, one tower each.
        controller: device that holds the variables and applies the averaged
            gradients.

        Reads the module-level `cat_to_id` to fix the country-code width.
        '''        
        # Per-tower gradients, losses and metrics
        tower_grads = []
        losses = []
        top_3_metrics = []

        # Constant learning rate (the exponential-decay schedule is disabled).
        self.learning_rate_tensor = self.learning_rate
        optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate_tensor,
                                            beta1=self.momentum )
        # Capture the current variable scope so that every tower after the
        # first one reuses the variables created by the first.
        with tf.variable_scope(tf.get_variable_scope()) as outer_scope:
            for i, device_name in enumerate(devices):
                name = 'tower_{}'.format(i)
                # Use the assign_to_device function to ensure that variables are created on the
                # controller.
                with tf.device(assign_to_device(device_name, controller)), tf.name_scope(name):

                    # Compute loss and gradients, but don't apply them yet
                    input_image_batch, target_values, countrycode, image_ratio, min_veloc, max_veloc = input_fn()
                    # Restore the static shapes needed by the layers below.
                    input_image_batch.set_shape(self.input_shape)
                    countrycode.set_shape([None, len(cat_to_id['country_codes'])])
                    image_ratio.set_shape([None, 1])
                    min_veloc.set_shape([None, 1])
                    max_veloc.set_shape([None, 1])
                    net_logits = make_resnet((input_image_batch, countrycode, image_ratio, min_veloc, max_veloc), self.num_classes, is_training=self.is_training)
                    loss_gpu_wise = tf.reduce_mean(
                        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_values, logits=net_logits), 
                        name='cross_entropy')
                    # Update batch norm stats [http://stackoverflow.com/questions/43234667]
                    # NOTE: no scope filter, so this also returns the update
                    # ops registered by the towers built before this one.
                    extra_update_ops = tf.get_collection( tf.GraphKeys.UPDATE_OPS )
                        
                    with tf.name_scope("compute_gradients"):
                        with tf.control_dependencies( extra_update_ops ):
                            # `compute_gradients` returns a list of (gradient, variable) pairs
                            grads = optimizer.compute_gradients(loss_gpu_wise)
                            tower_grads.append(grads)

                    losses.append(loss_gpu_wise)
                    top_3_metric_gpu_wise = tf.reduce_mean(tf.cast(tf.nn.in_top_k(targets=target_values, predictions=net_logits, k=3),tf.float32))
                    top_3_metrics.append(top_3_metric_gpu_wise)
                # After the first iteration, we want to reuse the variables.
                outer_scope.reuse_variables()

        # Apply the gradients on the controlling device
        with tf.name_scope("apply_gradients"), tf.device(controller):
            # Note that what we are doing here mathematically is equivalent to returning the
            # average loss over the towers and compute the gradients relative to that.
            # Unfortunately, this would place all gradient-computations on one device, which is
            # why we had to compute the gradients above per tower and need to average them here.

            # average_gradients turns the list of per-tower (gradient, variable)
            # lists into a single (gradient, variable) list.
            gradients = average_gradients(tower_grads)
            global_step = tf.train.get_global_step()
            self.train_op = optimizer.apply_gradients(gradients, global_step)
            self.loss = tf.reduce_mean(losses)
            self.top_3_metric = tf.reduce_mean(top_3_metrics)
            
            
            tf.summary.scalar('sparse_softmax_cross_entropy_with_logits', self.loss)
            tf.summary.scalar('Top_3_metric', self.top_3_metric)
            tf.summary.scalar('Learning_rate', self.learning_rate_tensor)
            self.merged_summary_metrics = tf.summary.merge_all(scope='apply_gradients')    

In [6]:
def preprocess_input(x):
    """Return a float32 copy of `x` rescaled as x / 127.5 - 1 (0..255 -> -1..1)."""
    pixels = x.astype(np.float32)
    np.divide(pixels, 127.5, out=pixels)
    np.subtract(pixels, 1., out=pixels)
    return pixels

In [7]:
def norm_mat_to_img(mat):
    """Min-max normalise `mat` to the 0..255 range and return it as uint8.

    The input is left untouched (the work happens on a copy). Integer input
    is converted to float first, so it no longer fails on in-place division.

    A matrix with no dynamic range (all values equal, e.g. an empty canvas)
    maps to all zeros instead of producing 0/0 = NaN; the same fallback is
    used when the range is not a positive number (NaN in the data). An empty
    array is returned as an empty uint8 array.
    """
    work = np.asarray(mat)
    if np.issubdtype(work.dtype, np.floating):
        work = work.copy()
    else:
        work = work.astype(np.float64)

    if work.size == 0:
        return work.astype(np.uint8)

    work -= work.min()
    peak = work.max()
    if not peak > 0:
        return np.zeros(work.shape, np.uint8)

    work /= peak
    work *= 255
    return work.astype(np.uint8)

def common_draw_cv2(raw_strokes, veloc, size=256, lw=6, time_color=True):
    """Rasterise a stroke drawing into a 6-channel image.

    Two BASE_SIZE x BASE_SIZE maps are built and each is colour-mapped with
    COLORMAP_JET (3 channels), then stacked along the last axis:
      * a velocity map - every stroke adds 1/v to the pixels it covers, so
        slower strokes weigh more;
      * a stroke-order map - stroke t is drawn with intensity
        255 - min(t, 10) * 13 (or 255 when `time_color` is False) and the
        per-pixel maximum over strokes is kept.

    Args:
        raw_strokes: sequence of strokes; stroke[0] / stroke[1] are the x / y
            coordinate sequences (presumably integer pixel positions inside
            the BASE_SIZE canvas - verify against the data).
        veloc: one velocity per stroke; must be non-zero.
        size: output side length; the image is resized when it differs from
            BASE_SIZE.
        lw: line width in pixels.
        time_color: encode stroke order in the intensity.

    Returns:
        float32 array of shape (size, size, 6) scaled by `preprocess_input`.
    """
    img_veloc = np.zeros((BASE_SIZE, BASE_SIZE), np.float32)
    img_strok_order = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)

    for t, (s, v) in enumerate(zip(raw_strokes, veloc)):
        img_blank = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
        color = 255 - min(t, 10) * 13 if time_color else 255
        xs, ys = s[0], s[1]
        for i in range(len(xs) - 1):
            cv2.line(img_blank, (xs[i], ys[i]), (xs[i + 1], ys[i + 1]), color, lw)
        img_strok_order = np.maximum(img_strok_order, img_blank)
        # Builtin `bool`: the `np.bool` alias was removed from NumPy.
        img_veloc += img_blank.astype(bool).astype(np.uint8) * (1 / v)  # slower is more important

    img_veloc = cv2.applyColorMap(norm_mat_to_img(img_veloc), cv2.COLORMAP_JET)
    img_strok_order = cv2.applyColorMap(img_strok_order, cv2.COLORMAP_JET)

    image = preprocess_input(np.concatenate((img_veloc, img_strok_order), axis=-1))
    if size != BASE_SIZE:
        # dsize is (width, height); the channel count is not part of it.
        return cv2.resize(image, (size, size))
    return image

In [8]:
def map_func(countrycode, drawing, label, veloc, image_ratio):
    """Turn one CSV record into the tensors consumed by the network.

    Args:
        countrycode: UTF-8 bytes, a key of cat_to_id['country_codes'].
        drawing: UTF-8 bytes holding the JSON-encoded strokes.
        label: UTF-8 bytes, a key of cat_to_id['labels'].
        veloc: UTF-8 bytes with the per-stroke velocities in a numpy-repr-like
            form (may contain `nan` / `inf` and an `[array(...)]` wrapper).
        image_ratio: scalar.

    Returns:
        (image float32, label int32, one-hot country code float32,
         image_ratio float32 [1], min velocity float32 [1],
         max velocity float32 [1])
    """
    drawing = json.loads(drawing.decode('utf-8'))

    # Rewrite the numpy-style repr into valid JSON; nan/inf become the
    # sentinel -1 that is repaired below.
    veloc_text = veloc.decode('utf-8')
    for old, new in (('nan', '-1'), ('inf', '-1'), ('[array(', ''), (')]', ''),
                     ('.,', ','), ('.]', ']')):
        veloc_text = veloc_text.replace(old, new)
    # Force a float dtype: after the clean-up above values such as "0." are
    # parsed as JSON integers, and in an integer array the 0.1 assigned below
    # would be truncated to 0 (giving 1/0 when the strokes are drawn).
    veloc = np.array(json.loads(veloc_text), dtype=np.float64)

    # Repair zero / invalid (-1) velocities.
    if len(veloc) == 1:
        veloc = [1]
    elif np.all(veloc == veloc[0]):
        veloc = np.ones(len(veloc))
    elif set(veloc) == set([0,-1]):
        veloc[veloc==-1] = 1
        veloc[veloc==0] = 0.1
    else:
        # invalid -> largest valid value, zero -> smallest non-zero value
        veloc[veloc==-1] = np.max(veloc[veloc!=-1])
        veloc[veloc==0] = np.min(veloc[veloc!=0])        
    min_veloc = np.expand_dims(np.min(veloc), 0)
    max_veloc = np.expand_dims(np.max(veloc), 0)

    veloc = list(veloc)
    
    label = cat_to_id['labels'][label.decode('utf-8')]
    countrycode = cat_to_id['country_codes'][countrycode.decode('utf-8')]
    countrycode = tf.keras.utils.to_categorical(countrycode, num_classes=len(cat_to_id['country_codes']))
    
    image_ratio = np.expand_dims(image_ratio, 0)

    image = common_draw_cv2(drawing, veloc, size=256, lw=6, time_color=True)
    return (image.astype(np.float32), 
            np.int32(label), 
            countrycode.astype(np.float32), 
            np.float32(image_ratio), 
            np.float32(min_veloc),
            np.float32(max_veloc))

def tf_py_map_func_wrapper(*args):
    """Wrap `map_func` in `tf.py_func`, feeding it the first five arguments.

    The six declared output dtypes follow the order of `map_func`'s return
    tuple: image, label, country code, image ratio, min and max velocity.
    """
    record_fields = tuple(args[position] for position in range(5))
    output_dtypes = (tf.float32, tf.int32, tf.float32, tf.float32, tf.float32, tf.float32)
    return tf.py_func(func=map_func, inp=record_fields, Tout=output_dtypes)

In [9]:
# Name -> integer id mappings; the 'labels' and 'country_codes' tables are used
# by the input pipeline and to size the network.
with open('config_label_country_code.json', 'r') as config_file:
    cat_to_id = json.load(config_file)

# Reverse lookup for labels (id -> name), with spaces turned into underscores.
id_to_cat_label = {
    label_id: label_name.replace(' ', '_')
    for label_name, label_id in cat_to_id['labels'].items()
}

In [12]:
graph = tf.Graph()
with graph.as_default():
    # Both pipelines read CSV columns 2, 3, 7, 8, 9 (four strings and one
    # float), decode each record with map_func, then batch and repeat forever.
    # prefetch() sits before batch(), so its buffer counts single examples.
    validation_dataset = (
        tf.data.experimental.CsvDataset(
            glob.glob('val_simplified_extend_clearn_split.csv'),
            [tf.string, tf.string, tf.string, tf.string, tf.float32],
            header=True,
            select_cols=[2, 3, 7, 8, 9],
        )
        .map(tf_py_map_func_wrapper, num_parallel_calls=CPUS)
        .prefetch(len(GPUS)*VAL_BATCH_SIZE)
        .batch(VAL_BATCH_SIZE)
        .repeat(-1)
    )
    training_dataset = (
        tf.data.experimental.CsvDataset(
            glob.glob('train_simplified_extend_clearn_split.csv'),
            [tf.string, tf.string, tf.string, tf.string, tf.float32],
            header=True,
            select_cols=[2, 3, 7, 8, 9],
        )
        # NOTE(review): presumably skips the records consumed by earlier
        # training runs before the restored checkpoint - confirm.
        .skip((19506+4822)*1024)
        .shuffle(int(1e4))
        .map(tf_py_map_func_wrapper, num_parallel_calls=CPUS)
        .prefetch(len(GPUS)*BATCH_SIZE)
        .batch(BATCH_SIZE)
        .repeat(-1)
    )

    # Feedable iterator: the string handle fed at run time selects whether
    # training or validation data flows through the same graph.
    handle = tf.placeholder(tf.string, shape=[], name='iterator_handler')
    iterator = tf.data.Iterator.from_string_handle(
        handle,
        training_dataset.output_types,
        training_dataset.output_shapes)

    def input_fn():
        """Return the next batch from the feedable iterator."""
        # remove any device specifications for the input data
        with tf.device(None):
            return iterator.get_next()

    training_iterator = training_dataset.make_initializable_iterator()
    validation_iterator = validation_dataset.make_initializable_iterator()

    tf.train.create_global_step()

    # Defaults to training mode; the validation loop feeds 0.
    is_training = tf.placeholder_with_default(1, shape=[], name='isTraining')
    model = MyModel(
        num_classes=len(cat_to_id['labels']),
        is_training=is_training,
        learning_rate=1e-3)
    model.build_in_parallel_fashion(input_fn, controller='/cpu:0', devices=GPUS)

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

In [None]:
# Total number of optimisation steps: ten passes over 5e7 samples, where one
# step consumes BATCH_SIZE samples on each GPU.
STEMPS_NUM = int(5e7/(BATCH_SIZE*len(GPUS)))*10
LOG_DIR = 'models/resnet_checkpoints'
config = tf.ConfigProto()
config.allow_soft_placement = True
# config.log_device_placement=True
with tf.Session(graph=graph, config=config) as sess:
    train_writer = tf.summary.FileWriter(os.path.join(LOG_DIR,'train'), sess.graph)
    test_writer = tf.summary.FileWriter(os.path.join(LOG_DIR, 'test'))

    try:
        sess.run([init, training_iterator.initializer, validation_iterator.initializer])
        training_handle = sess.run(training_iterator.string_handle())
        validation_handle = sess.run(validation_iterator.string_handle())

        # Best validation top-3 accuracy seen in THIS run. It starts at 0, so
        # the first validation round always writes a checkpoint.
        top_3_metric = 0

        # Continue from a previously saved model (overrides the fresh init).
        saver.restore(sess, os.path.join(LOG_DIR,"model_0.897216796875.ckpt"))
        for train_step in range(STEMPS_NUM):

            print(train_step, end='\r')
            _, ms = sess.run([model.train_op, model.merged_summary_metrics], {handle: training_handle,
                                                                             is_training: 1})
            if train_step%10==0:
                train_writer.add_summary(ms, train_step)
                train_writer.flush()

            if train_step%1000==0:
                # Fresh accumulators per round, so the logged numbers describe
                # the current model only and not every round since the start.
                validation_top_3_metric_list = []
                validation_loss_list = []
                for val_step in range(int(VAL_STEPS/(VAL_BATCH_SIZE*len(GPUS)))):
                    validation_loss, validation_top_3_metric_step = sess.run([model.loss, model.top_3_metric], {handle: validation_handle,
                                                                                                               is_training: 0})
                    validation_top_3_metric_list.append(validation_top_3_metric_step)
                    validation_loss_list.append(validation_loss)

                round_loss = np.mean(validation_loss_list)
                round_top_3 = np.mean(validation_top_3_metric_list)

                summary = tf.Summary()
                summary.value.add(tag="Validation_loss", simple_value=round_loss)
                summary.value.add(tag="Validation_Top3_metric", simple_value=round_top_3)
                test_writer.add_summary(summary, train_step)
                test_writer.flush()

                # Keep a checkpoint whenever the round average (not just the
                # last validation batch) beats the best value so far.
                if round_top_3 > top_3_metric:
                    top_3_metric = round_top_3
                    saver.save(sess, os.path.join(LOG_DIR, "model_{}.ckpt".format(top_3_metric)))
    finally:
        # Flush and release the event files even if training is interrupted.
        train_writer.close()
        test_writer.close()
            

INFO:tensorflow:Restoring parameters from models/resnet_checkpoints/model_0.897216796875.ckpt
1018