In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import matplotlib.image as mimg
import numpy as np

import cv2

import warnings
warnings.simplefilter("ignore", UserWarning)

%matplotlib inline

In [3]:
from PIL import Image, ImageColor, ImageFont, ImageDraw, ImageFilter

In [4]:
# Deep-learning stack: PyTorch (data pipeline / model) and TensorFlow (slim experiment further down).
import torch
import torch.nn as nn

# NOTE(review): `torch.functional` is a different module from `torch.nn.functional`,
# which is what the `F` alias conventionally refers to. `F` is not referenced in the
# visible cells — confirm which module was intended.
import torch.functional as F
# `V` is used below to wrap input tensors before they are fed to the model.
from torch.autograd import Variable as V
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchvision import transforms
import tensorflow as tf

# Image reader used by DataStream.__getitem__.
import skimage.io as io

# Short alias for the TF-Slim layers API used by the cnn()/crnn() cells.
slim = tf.contrib.slim


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [5]:
# from helpers import Converter, Resize, Normalize, 

In [6]:
print('tf:', tf.__version__, '\n\r', 'torch:', torch.__version__)

tf: 1.12.0 
 torch: 1.0.0


In [7]:
import pandas as pd

In [8]:
class DataStream(Dataset):
    """Dataset backed by a ';'-separated, header-less CSV.

    Column 0 holds the value handed to ``io.imread`` and column 1 holds the
    label that accompanies it.
    """

    def __init__(self, fname, transform=None):
        """Read the CSV at ``fname`` and keep the optional ``transform`` callable."""
        table = pd.read_csv(fname, sep=';', header=None)
        self.data = table
        self.imgs, self.labels = table[0], table[1]
        self.transform = transform

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.imgs)

    def __getitem__(self, idx):
        """Return ``{'img': ..., 'label': ...}`` for row ``idx``.

        When a truthy ``transform`` was supplied, its result is returned instead.
        """
        record = {'img': io.imread(self.imgs[idx]), 'label': self.labels[idx]}
        if not self.transform:
            return record
        return self.transform(record)

In [9]:
class Resize(object):
    """Sample transform that rescales the image with ``cv2.resize``."""

    def __init__(self, size):
        # Stored as-is and forwarded as the size argument of cv2.resize.
        self.size = size

    def __call__(self, sample):
        """Return a new sample dict holding the resized image and the original label."""
        resized = cv2.resize(sample['img'], self.size)
        return {'img': resized, 'label': sample['label']}

class ToTensorTarget(object):
    """Turn the sample's image into a tensor; the label is passed through untouched."""

    def __call__(self, sample):
        """Return a new dict whose 'img' is ``to_tensor`` applied to a copy of the image."""
        image = sample['img']
        target = sample['label']
        # Convert a copy so the caller's array is not the one handed to to_tensor.
        tensor = transforms.functional.to_tensor(image.copy())
        return {'img': tensor, 'label': target}
    
class NormalizeTarget(transforms.Normalize):
    """Dict-aware variant of ``transforms.Normalize``: normalizes 'img', keeps 'label'."""

    def __call__(self, sample):
        """Return a new sample whose image is normalized with this instance's mean/std."""
        normalized = transforms.functional.normalize(sample['img'], self.mean, self.std)
        return {'img': normalized, 'label': sample['label']}

In [10]:
# Per-sample preprocessing pipeline: resize -> convert to tensor -> normalize.
# (204, 32) is forwarded by Resize to cv2.resize as its size argument.
# The two 3-element lists are the mean and std handed to NormalizeTarget
# (presumably per-channel statistics of the training images — confirm their origin).
transform = transforms.Compose([Resize((204, 32)),
                               ToTensorTarget(),
                               NormalizeTarget([0.3956, 0.5763, 0.5616],
                                                [0.1535, 0.1278, 0.1299])])

In [11]:
# Dataset over ./data/data.csv with the preprocessing pipeline applied to every sample.
dr = DataStream('./data/data.csv', transform=transform)
# Shuffling loader that requests batches of 256 samples.
ds = DataLoader(dr, batch_size=256, shuffle=True)

In [12]:
sample = next(iter(ds))

In [27]:
from models import crnn
from configs import generator_cfg
from helpers import Converter

gen_cfg = generator_cfg()

In [28]:
converter = Converter(gen_cfg.alph)

In [29]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Second argument is the alphabet size plus one — presumably the extra class is the
# CTC blank; confirm against models/crnn.py.
model = crnn.CRNN(3, len(gen_cfg.alph) + 1, 256).to(device)

In [30]:
# Move the image batch to the model's device and wrap it with V (torch.autograd.Variable).
X = V(sample['img'].to(device))
# Encode the batch's labels with the project Converter; the two results are passed
# to the CTC criterion as targets and target lengths further down.
Y_labels, Y_lengths = converter.encode(sample['label'])

In [31]:
y_hat = model(X)

torch.Size([100, 512, 1, 52])
torch.Size([52, 100, 71])


In [33]:
criterion = nn.CTCLoss()

In [32]:
len(gen_cfg.alph) + 1

71

In [34]:
# One entry per batch element, each equal to the number of time steps in y_hat.
# The batch size is taken from y_hat instead of being hard-coded, so the cell works
# for any loader batch size (including a smaller final batch).
# Assumes y_hat is (time, batch, classes), as in the shapes printed by the model cell — confirm.
preds_size = torch.IntTensor(y_hat.shape[1]).fill_(y_hat.shape[0])

In [36]:
# Scale by the actual batch size (second axis of y_hat) rather than a hard-coded 100.
# NOTE(review): criterion is built as nn.CTCLoss() with default arguments, which may
# already reduce over the batch — confirm that this extra division is intended.
loss = criterion(y_hat, Y_labels, preds_size, Y_lengths) / y_hat.shape[1]

In [38]:
loss.backward()

In [41]:
converter.best_path_decode(y_hat)

['0e0e0e0e0',
 'esese0',
 'e0Re0R0Rse0',
 '0e0ese0e',
 'e0e0ese0',
 'e0e0b0e0',
 'edesesese',
 'e0seRed0eReR0R0R0',
 '0e0e0',
 'eReReRe',
 'ede0e0ese0',
 '0edsede',
 'e0s0e0s0',
 '0e0e0es0e',
 '0ReR',
 'e0eReje0',
 'eReReRe0',
 'e0e0e0e0e0',
 '0e0R0e0ReRQe0e',
 '0eReRe0e0',
 '0eQe0de0e',
 'e0e0e0',
 '0eRd.ede0',
 'e0',
 '0e0eRQReRe0',
 'e0e0ede0',
 '0eRe0',
 '0eResR0ese0',
 'eReRe0e0',
 'eReRe0e0',
 'eReReRe0',
 'eReReReRe0ese0',
 'e0e0e0',
 'es0ese0',
 'e0e0Re0',
 'ese0',
 'e',
 'ese0',
 'edese0',
 '0d0e0e0ede',
 'e0Rese0',
 'eRese0',
 '0e0e0de0e0',
 'e0es0se0',
 '0e0e0e',
 'e0e0e0e0e0',
 'eReReRese0',
 'eReReR0R',
 '0ReRe',
 'eReRe0',
 'eReR0e0e',
 '0e0eRe',
 'eReReReRe0',
 '0e0e0ese0',
 'eResResR0Re0',
 'eReRsRese0',
 'e0e0eRe',
 'es0ese0',
 'e0',
 'Re0',
 'eR0e',
 'ese0',
 'jRe0',
 'eRes0ese0',
 'R0R0R0',
 'e0ese0',
 'ebsbsesbsbese0',
 '0e',
 '0Re0esR0ReRe0',
 'e0de',
 'ese0bdse0e0Rdse0',
 '0e0esesRe0eRe',
 'seRQ0e',
 'eReRe',
 'ese0',
 'edsese0s0e0',
 'eReR0R0Rese0',
 'eRe',
 '0eR

In [48]:
converter.encode(sample['label'])

(tensor([10, 11,  7, 60, 59, 53, 58, 64, 13,  7, 22, 18,  7, 60, 59, 53, 58, 64,
         12, 62, 48, 19,  8, 30,  8, 45, 45, 56, 63, 19,  8, 24,  8, 19, 13, 19,
          8, 30,  8, 34,  8, 14, 64, 52, 45, 11, 58, 48, 19,  8, 32,  8, 10,  9,
         17,  9, 13, 26, 19,  8, 27,  8, 45, 45, 52, 10, 10,  7, 60, 59, 53, 58,
         64, 13, 64, 52, 45, 45, 52, 63, 10,  9,  7, 60, 59, 53, 58, 64, 16,  7,
         60, 59, 53, 58, 64, 14,  7, 38, 45, 45, 56, 53, 53, 63, 19, 45, 56, 46,
         59, 62, 51, 19, 19, 27, 27, 17,  7, 60, 59, 53, 58, 64, 18, 64, 52, 19,
          8, 36,  8, 21,  8, 37,  8, 12, 22, 11,  6, 13,  7, 48, 11, 19,  4, 31,
         19, 10, 11,  9,  7, 60, 59, 53, 58, 64, 45,  8, 62,  8, 19,  8, 31,  8,
         22,  8, 25,  8, 19, 45, 56, 49, 63, 65, 58, 48, 10, 17,  7, 60, 59, 53,
         58, 64, 12, 31,  4, 47, 19,  8, 24,  8, 19,  8, 31,  8, 13, 25, 30, 45,
         45, 52, 53, 58, 51, 19, 19, 19, 19, 16, 64, 52, 19,  8, 25,  8, 19, 19,
         24, 45,  5, 19, 19,

In [107]:
# Smoke test: push one random (1, C, H, W) input through a freshly initialised CRNN.
# NOTE(review): bare `CRNN` and `weights_init` are not defined or imported in the
# visible cells (only the `crnn` module is imported and used as `crnn.CRNN`), so this
# cell appears to rely on kernel state from cells that are not shown — confirm it
# still runs on a fresh kernel.
H = 32
W = 280
C = 1
net = CRNN(C, nc=32, nh=128)
net.apply(weights_init)
X = V(torch.randn(1, C, H, W))
Y_ = net(X)

torch.Size([1, 512, 1, 71])
torch.Size([71, 1, 32])


In [None]:
opt = nn.CTCLoss(

In [200]:
16 * 7

112

In [48]:
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` reads ``d['key']`` and ``d.key = value`` stores ``d['key'] = value``.
    Names that resolve through normal attribute lookup (dict methods such as
    ``keys`` or ``items``) are not looked up in the dictionary.
    """

    class _MissingKey(AttributeError, KeyError):
        """Error raised when attribute access names a key that is not present.

        Being an AttributeError lets ``hasattr`` and ``getattr(obj, name, default)``
        behave normally; being a KeyError keeps existing ``except KeyError``
        handlers working.
        """

    def __getattr__(self, name):
        """Return ``self[name]``; raise an AttributeError/KeyError hybrid if absent."""
        # Only invoked after normal attribute lookup has failed.
        try:
            return self[name]
        except KeyError:
            raise AttrDict._MissingKey(name) from None

    __setattr__ = dict.__setitem__
    
def define_config():
    """Build the settings object for the TensorFlow CRNN experiment.

    Returns:
        An ``AttrDict`` with ``n_classes``, ``lstm_size``, ``width`` and ``height``.
    """
    return AttrDict(
        n_classes=34,
        lstm_size=256,
        width=280,
        height=32,
    )

In [49]:
cfg

{'n_classes': 34, 'lstm_size': 256}

In [59]:
import os
# Lower TensorFlow's C++ log verbosity.
# NOTE(review): tensorflow was already imported in an earlier cell, so this setting
# presumably comes too late to affect the running kernel — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

# NOTE(review): tensorflow, numpy and `slim` are already imported/defined in earlier
# cells; they are repeated here.
import tensorflow as tf
from tensorflow.contrib.rnn import BasicLSTMCell

import numpy as np

# NOTE(review): eager execution is switched on after TensorFlow has already been
# imported and touched above — presumably this only works if no graph ops were
# created beforehand; confirm on a fresh kernel.
tf.enable_eager_execution()

slim = tf.contrib.slim
# Module-level settings read by crnn() (lstm_size, n_classes).
cfg = define_config()

def cnn(inputs, scope='vgg', is_training=True):
    """Convolutional feature extractor assembled from slim conv / max-pool layers.

    Five 3x3 convolution stages (64, 128, 2x256, 2x512, 512 filters), each
    convolution followed by batch norm; a 2x2 max-pool follows each of the
    first four stages, with pool3 and pool4 using ``stride=[2, 1]``.

    Args:
        inputs: image batch fed to the first convolution.
        scope: accepted by the signature but not used in the body.
        is_training: forwarded to batch norm as ``is_training``.

    Returns:
        The output tensor of the last convolution stage ('conv5').
    """
    bn_kwargs = {'is_training': is_training}
    # Every conv2d gets batch norm as its normalizer; batch_norm itself also
    # receives the same keyword defaults.
    with slim.arg_scope([slim.conv2d],
                        normalizer_fn=slim.batch_norm,
                        normalizer_params=bn_kwargs), \
            slim.arg_scope([slim.batch_norm], **bn_kwargs):
        features = slim.repeat(inputs, 1, slim.conv2d, 64, [3, 3], scope='conv1')
        features = slim.max_pool2d(features, [2, 2], scope='pool1')

        features = slim.repeat(features, 1, slim.conv2d, 128, [3, 3], scope='conv2')
        features = slim.max_pool2d(features, [2, 2], scope='pool2', padding='SAME')

        features = slim.repeat(features, 2, slim.conv2d, 256, [3, 3], scope='conv3')
        features = slim.max_pool2d(features, [2, 2], stride=[2, 1],
                                   scope='pool3', padding='SAME')

        features = slim.repeat(features, 2, slim.conv2d, 512, [3, 3], scope='conv4')
        features = slim.max_pool2d(features, [2, 2], stride=[2, 1],
                                   scope='pool4', padding='SAME')

        features = slim.repeat(features, 1, slim.conv2d, 512, [3, 3], scope='conv5')
        return features


def crnn(images, is_training=True):
    """CRNN: CNN features -> stacked bidirectional LSTM -> per-step class scores.

    Reads ``cfg.lstm_size`` and ``cfg.n_classes`` from the module-level ``cfg``.

    Args:
        images: image batch passed to ``cnn``.
        is_training: forwarded to ``cnn``; also selects the dropout keep
            probability (0.7 when training, 1.0 otherwise).

    Returns:
        Time-major tensor ``[width(time), batch, cfg.n_classes]``. The variable is
        named ``logprob`` but no softmax is applied in this function: the values
        are the outputs of the final linear layer.
    """
    dropout_keep_prob = 0.7 if is_training else 1.0
    cnn_net = cnn(images, is_training=is_training)
    with tf.variable_scope('Reshaping_cnn'):
        shape = cnn_net.get_shape().as_list()  # [batch, height, width, features]
        # The static batch size is None when the input's batch dimension is unknown
        # (e.g. a placeholder); a None inside a reshape target is rejected, so fall
        # back to the runtime batch size in that case.
        batch_size = shape[0] if shape[0] is not None else tf.shape(cnn_net)[0]
        transposed = tf.transpose(cnn_net, perm=[0, 2, 1, 3],
                                  name='transposed')  # [batch, width, height, features]
        conv_reshaped = tf.reshape(transposed, [batch_size, -1, shape[1] * shape[3]],
                                   name='reshaped')  # [batch, width, height x features]

    list_n_hidden = [cfg.lstm_size, cfg.lstm_size]

    with tf.name_scope('deep_bidirectional_lstm'):
        # Forward direction cells
        fw_cell_list = [BasicLSTMCell(nh, forget_bias=1.0) for nh in list_n_hidden]
        # Backward direction cells
        bw_cell_list = [BasicLSTMCell(nh, forget_bias=1.0) for nh in list_n_hidden]

        lstm_net, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(fw_cell_list,
                                                                        bw_cell_list,
                                                                        conv_reshaped,
                                                                        dtype=tf.float32
                                                                        )
        # Dropout layer (a no-op when keep_prob is 1.0, i.e. outside training)
        lstm_net = tf.nn.dropout(lstm_net, keep_prob=dropout_keep_prob)

    with tf.variable_scope('fully_connected'):
        # The linear layer acts on the last axis — presumably the result is already
        # [batch, width, n_classes]; the reshape below pins that layout explicitly.
        fc_out = slim.layers.linear(lstm_net, cfg.n_classes)

        lstm_out = tf.reshape(fc_out, [batch_size, -1, cfg.n_classes],
                              name='lstm_out')  # [batch, width, n_classes]

        # Swap batch and time axis
        logprob = tf.transpose(lstm_out, [1, 0, 2],
                               name='transpose_time_major')  # [width(time), batch, n_classes]

        return logprob


def create_loss(sparse_code_target, logprob, seq_len_inputs):
    """Mean CTC loss over the batch.

    Args:
        sparse_code_target: sparse label tensor; its ``dense_shape[1]`` is read.
        logprob: network output handed to ``tf.nn.ctc_loss`` as ``inputs``.
        seq_len_inputs: per-example input lengths; cast to int32 for the loss.

    Returns:
        ``tf.reduce_mean`` of the per-example CTC losses.
    """
    longest_label = sparse_code_target.dense_shape[1]
    longest_input = tf.reduce_max(tf.cast(seq_len_inputs, tf.int64))
    # NOTE(review): tf.less_equal only produces a boolean tensor; used as a control
    # dependency it presumably does not raise when the comparison is False — confirm
    # whether an assertion was intended here.
    fits = tf.less_equal(longest_label, longest_input)
    with tf.control_dependencies([fits]):
        per_example = tf.nn.ctc_loss(
            labels=sparse_code_target,
            inputs=logprob,
            sequence_length=tf.cast(seq_len_inputs, tf.int32),
            ignore_longer_outputs_than_inputs=True,
        )
        mean_loss = tf.reduce_mean(per_example)
    return mean_loss

In [60]:
ret = crnn(np.random.rand(1,224,64,3).astype(np.float32))

W0211 16:57:17.981990 140354946066176 tf_logging.py:161] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7fa648325710>: Note that this cell is not optimized for performance. Please use tf.contrib.cudnn_rnn.CudnnLSTM for better performance on GPU.
W0211 16:57:17.984064 140354946066176 tf_logging.py:161] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7fa648325940>: Note that this cell is not optimized for performance. Please use tf.contrib.cudnn_rnn.CudnnLSTM for better performance on GPU.
W0211 16:57:17.985587 140354946066176 tf_logging.py:161] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7fa648325208>: Note that this cell is not optimized for performance. Please use tf.contrib.cudnn_rnn.CudnnLSTM for better performance on GPU.
W0211 16:57:17.987194 140354946066176 tf_logging.py:161] <tensorflow.python.ops.rnn_cell_impl.BasicLSTMCell object at 0x7fa6483255c0>: Note that this cell is not optimized for performance. Please use tf.contrib.cud

In [61]:
ret.shape

TensorShape([Dimension(16), Dimension(1), Dimension(34)])

In [None]:
def encode(text):
    if isinstance(text, str):
        te