In [1]:
%matplotlib inline
import os
import sys
import pylab
import random
from random import randint, uniform
from skimage.util import crop
from skimage import transform
import numpy as np
import pandas as pd
import cPickle as pkl
from lasagne import layers
from bs4 import BeautifulSoup as bs
from lasagne import updates
import lasagne as nn
from theano.tensor.nnet import softmax
from scipy.misc import imread, imresize
from nolearn.lasagne import NeuralNet, BatchIterator
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, accuracy_score

repo_location = '/workspace/.project/project/'
data_root = os.path.join(os.path.expanduser('~') + repo_location + 'datasets/')
script_root = os.path.join(os.path.expanduser('~') + repo_location + 'scripts/')
model_root = os.path.join(os.path.expanduser('~') + repo_location + 'models/')

Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled)


In [2]:
# Load dataset
train_soup = bs(open(data_root + 'icdar03/train/char/char.xml').read(), 'lxml-xml')
test_soup = bs(open(data_root + 'icdar03/test/char/char.xml').read(), 'lxml-xml')

X_train = []
y_train = []
X_test = []
y_test = []

for image in train_soup('image'):
    try:
        img = imread(data_root + 'icdar03/train/char/' + image['file'])
        X_train.append(img)
        y_train.append(image['tag'])
    except:
        pass
    
for image in test_soup('image'):
    try:
        img = imread(data_root + 'icdar03/test/char/' + image['file'])
        X_test.append(img)
        y_test.append(image['tag'])
    except:
        pass

    
data_train = pd.DataFrame({'image' : X_train, 'label' : y_train})
data_test = pd.DataFrame({'image' : X_test, 'label' : y_test})

print 'Loaded icdar03'

Loaded icdar03


In [3]:
# Reshape images to 64x64 and convert to grayscale
data_train_x = np.zeros((data_train['image'].count(), 1, 64, 64))
data_train_y = data_train['label'].values
data_test_x = np.zeros((data_test['image'].count(), 1, 64, 64))
data_test_y = data_test['label'].values

for idx, img in enumerate(data_train['image']):
    img = imresize(img, (64, 64))
    if len(img.shape) == 3:
        data_train_x[idx, ...] = img.dot([0.299, 0.587, 0.144])
    else:
        data_train_x[idx, ...] = img
        
for idx, img in enumerate(data_test['image']):
    img = imresize(img, (64, 64))
    if len(img.shape) == 3:
        data_test_x[idx, ...] = img.dot([0.299, 0.587, 0.144])
    else:
        data_test_x[idx, ...] = img
        
data_train_x = data_train_x.astype('float32')
data_test_x = data_test_x.astype('float32')
print 'icdar03 reshaped and grayscaled'

icdar03 reshaped and grayscaled


In [4]:
# Normalize by MuSigma
data_train_x /= data_train_x.std(axis = None)
data_train_x -= data_train_x.mean()

data_test_x /= data_test_x.std(axis = None)
data_test_x -= data_test_x.mean()

In [5]:
print data_train_x.shape, data_train_y.shape, data_test_x.shape, data_test_y.shape

(6185, 1, 64, 64) (6185,) (5430, 1, 64, 64) (5430,)


In [6]:
class TransIterator(BatchIterator):
    def fast_warp(self, img, tf, output_shape, mode='nearest'):
        return transform._warps_cy._warp_fast(img, tf.params, output_shape=output_shape, mode=mode)
    
    def transform(self, Xb, yb):
        Xb, yb = super(TransIterator, self).transform(Xb, yb)
        
        Xb_aug = np.empty(shape = (Xb.shape[0], 1, 64, 64), dtype = 'float32')
        yb_aug = yb

        # random rotations betweein -5 and 5 degrees
        dorotate = randint(-5,5)

        # random translations
        trans_1 = randint(-10,10)
        trans_2 = randint(-10,10)

        # random zooms
        zoom = uniform(0.8, 1.2)

        # shearing
        shear_deg = uniform(-10, 10)

        # set the transform parameters for skimage.transform.warp
        # have to shift to center and then shift back after transformation otherwise
        # rotations will make image go out of frame
        center_shift   = np.array((64, 64)) / 2. - 0.5
        tform_center   = transform.SimilarityTransform(translation=-center_shift)
        tform_uncenter = transform.SimilarityTransform(translation=center_shift)

        tform_aug = transform.AffineTransform(rotation = np.deg2rad(dorotate),
                                              scale =(1/zoom, 1/zoom),
                                              shear = np.deg2rad(shear_deg),
                                              translation = (trans_1, trans_2))

        tform = tform_center + tform_aug + tform_uncenter
        
        for j in range(Xb.shape[0]):
            Xb_aug[j][0] = self.fast_warp(Xb[j][0], tform,
                                          output_shape = (64, 64))

        return Xb_aug, yb_aug

In [11]:
# setting nn 
net = NeuralNet(
    layers = [
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('dropout4', layers.DropoutLayer),
        ('conv5', layers.Conv2DLayer),
        ('conv6', layers.Conv2DLayer),
        ('pool7', layers.MaxPool2DLayer),
        ('dropout8', layers.DropoutLayer),
        ('hidden13', layers.DenseLayer),
        ('dropout14', layers.DropoutLayer),
        ('hidden15', layers.DenseLayer),
        ('dropout16', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],

    input_shape = (None, 1, 64, 64),
    conv1_num_filters = 128, conv1_filter_size = (3, 3),
    conv2_num_filters = 128, conv2_filter_size = (3, 3),
    pool3_pool_size = (2, 2),
    dropout4_p = 0.2,
    conv5_num_filters = 256, conv5_filter_size = (3, 3),
    conv6_num_filters = 256, conv6_filter_size = (3, 3),
    pool7_pool_size = (2, 2),
    dropout8_p = 0.2,
    hidden13_num_units = 1024,
    dropout14_p = 0.5,
    hidden15_num_units = 1024,
    dropout16_p = 0.5,
    output_num_units = 75, output_nonlinearity = softmax,

    batch_iterator_train = TransIterator(batch_size = 256),
    batch_iterator_test = BatchIterator(batch_size = 256),

    update = updates.adam,

    use_label_encoder = True,
    regression = False,
    max_epochs = 300,
    verbose = 1,
)

In [12]:
# train nn
#net.load_params_from(os.path.join(model_root, 'recog_for_icdar.pkl')); # or load a pretrained model!
net.fit(data_train_x, data_train_y);

# Neural Network with 46463947 learnable parameters

## Layer information

  #  name       size
---  ---------  ---------
  0  input      1x64x64
  1  conv1      128x62x62
  2  conv2      128x60x60
  3  pool3      128x30x30
  4  dropout4   128x30x30
  5  conv5      256x28x28
  6  conv6      256x26x26
  7  pool7      256x13x13
  8  dropout8   256x13x13
  9  hidden13   1024
 10  dropout14  1024
 11  hidden15   1024
 12  dropout16  1024
 13  output     75

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  ------
      1       [36m4.07641[0m       [32m3.84068[0m      1.06138      0.05156  51.14s
      2       [36m3.80502[0m       [32m3.77123[0m      1.00896      0.06221  52.71s
      3       [36m3.72778[0m       [32m3.76573[0m      0.98992      0.06651  52.60s
      4       [36m3.60068[0m       [32m3.55215[0m      1.01366      0.10490  52.67s
      5       [36m3.37870[0m       [32m3.20961[0m

In [13]:
pred = net.predict(data_test_x)
print accuracy_score(data_test_y, pred)

0.79576427256


In [14]:
print classification_report(data_test_y, pred)

             precision    recall  f1-score   support

          !       0.31      0.50      0.38         8
          "       0.00      0.00      0.00         1
          &       1.00      0.57      0.73         7
          '       0.40      0.25      0.31         8
          (       0.00      0.00      0.00         1
          )       0.50      1.00      0.67         1
          ,       0.00      0.00      0.00         6
          -       0.50      0.75      0.60         4
          .       0.38      0.55      0.44        11
          0       1.00      0.04      0.08        46
          1       0.76      0.48      0.59        46
          2       0.82      0.94      0.88        49
          3       0.71      0.59      0.65        17
          4       0.78      0.58      0.67        24
          5       0.77      0.34      0.48        29
          6       0.90      0.60      0.72        15
          7       0.50      0.30      0.37        10
          8       1.00      0.67      0.80   

  'precision', 'predicted', average, warn_for)


In [15]:
net.save_params_to(os.path.join(model_root, 'recog_for_icdar_1.pkl'))