In [1]:
%matplotlib inline
import os
import sys
import pylab
import random
from random import randint, uniform
from skimage.util import crop
from skimage import transform
import numpy as np
import pandas as pd
import cPickle as pkl
from lasagne import layers
from bs4 import BeautifulSoup as bs
from lasagne import updates
import lasagne as nn
from theano.tensor.nnet import softmax
from scipy.misc import imread, imresize
from nolearn.lasagne import NeuralNet, BatchIterator
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report, accuracy_score

repo_location = '/workspace/.project/project/'
data_root = os.path.join(os.path.expanduser('~') + repo_location + 'datasets/')
script_root = os.path.join(os.path.expanduser('~') + repo_location + 'scripts/')
model_root = os.path.join(os.path.expanduser('~') + repo_location + 'models/')

Using gpu device 0: GeForce GTX TITAN X (CNMeM is disabled)


In [2]:
# Load dataset
train_soup = bs(open(data_root + 'icdar03/train/char/char.xml').read(), 'lxml-xml')
test_soup = bs(open(data_root + 'icdar03/test/char/char.xml').read(), 'lxml-xml')

X_train = []
y_train = []
X_test = []
y_test = []

for image in train_soup('image'):
    try:
        img = imread(data_root + 'icdar03/train/char/' + image['file'])
        X_train.append(img)
        y_train.append(image['tag'])
    except:
        pass
    
for image in test_soup('image'):
    try:
        img = imread(data_root + 'icdar03/test/char/' + image['file'])
        X_test.append(img)
        y_test.append(image['tag'])
    except:
        pass

    
data_train = pd.DataFrame({'image' : X_train, 'label' : y_train})
data_test = pd.DataFrame({'image' : X_test, 'label' : y_test})

# drop extra labels
data_train = data_train.loc[~data_train['label'].isin([':', '-', '.', '\'', '!', '(', '"', ')', '&', '?', u'\xa3', u'\xc9', u'\xd1', u'\xe9', ','])]
data_test = data_test.loc[~data_test['label'].isin([':', '-', '.', '\'', '!', '(', '"', ')', '&', '?', u'\xa3', u'\xc9', u'\xd1', u'\xe9', ','])]

print 'Loaded icdar03'

Loaded icdar03


In [3]:
# Reshape images to 32x32 and convert to grayscale
data_train_x = np.zeros((data_train['image'].count(), 1, 32, 32))
data_train_y = data_train['label'].values
data_test_x = np.zeros((data_test['image'].count(), 1, 32, 32))
data_test_y = data_test['label'].values

for idx, img in enumerate(data_train['image']):
    img = imresize(img, (32, 32))
    if len(img.shape) == 3:
        data_train_x[idx, ...] = img.dot([0.299, 0.587, 0.144])
    else:
        data_train_x[idx, ...] = img
        
for idx, img in enumerate(data_test['image']):
    img = imresize(img, (32, 32))
    if len(img.shape) == 3:
        data_test_x[idx, ...] = img.dot([0.299, 0.587, 0.144])
    else:
        data_test_x[idx, ...] = img
        
data_train_x = data_train_x.astype('float32')
data_test_x = data_test_x.astype('float32')
print 'icdar03 reshaped and grayscaled'

icdar03 reshaped and grayscaled


In [4]:
# Normalize by MuSigma
data_train_x /= data_train_x.std(axis = None)
data_train_x -= data_train_x.mean()

data_test_x /= data_test_x.std(axis = None)
data_test_x -= data_test_x.mean()

In [5]:
print data_train_x.shape, data_train_y.shape, data_test_x.shape, data_test_y.shape

(6113, 1, 32, 32) (6113,) (5379, 1, 32, 32) (5379,)


In [6]:
class TransIterator(BatchIterator):
    def fast_warp(self, img, tf, output_shape, mode='nearest'):
        return transform._warps_cy._warp_fast(img, tf.params, output_shape=output_shape, mode=mode)
    
    def transform(self, Xb, yb):
        Xb, yb = super(TransIterator, self).transform(Xb, yb)
        
        Xb_aug = np.empty(shape = (Xb.shape[0], 1, 32, 32), dtype = 'float32')
        yb_aug = yb

        # random rotations betweein -5 and 5 degrees
        dorotate = randint(-5,5)

        # random translations
        trans_1 = randint(-3,3)
        trans_2 = randint(-3,3)

        # random zooms
        zoom = uniform(0.8, 1.2)

        # shearing
        shear_deg = uniform(-10, 10)

        # set the transform parameters for skimage.transform.warp
        # have to shift to center and then shift back after transformation otherwise
        # rotations will make image go out of frame
        center_shift   = np.array((32, 32)) / 2. - 0.5
        tform_center   = transform.SimilarityTransform(translation=-center_shift)
        tform_uncenter = transform.SimilarityTransform(translation=center_shift)

        tform_aug = transform.AffineTransform(rotation = np.deg2rad(dorotate),
                                              scale =(1/zoom, 1/zoom),
                                              shear = np.deg2rad(shear_deg),
                                              translation = (trans_1, trans_2))

        tform = tform_center + tform_aug + tform_uncenter
        
        for j in range(Xb.shape[0]):
            Xb_aug[j][0] = self.fast_warp(Xb[j][0], tform,
                                          output_shape = (32, 32))

        return Xb_aug, yb_aug

In [7]:
# setting nn 
net = NeuralNet(
    layers = [
        ('input', layers.InputLayer),
        ('conv1', layers.Conv2DLayer),
        ('conv2', layers.Conv2DLayer),
        ('pool3', layers.MaxPool2DLayer),
        ('dropout4', layers.DropoutLayer),
        ('conv5', layers.Conv2DLayer),
        ('conv6', layers.Conv2DLayer),
        ('pool7', layers.MaxPool2DLayer),
        ('dropout8', layers.DropoutLayer),
        ('conv9', layers.Conv2DLayer),
        ('conv10', layers.Conv2DLayer),
        ('dropout12', layers.DropoutLayer),
        ('hidden13', layers.DenseLayer),
        ('dropout14', layers.DropoutLayer),
        ('hidden15', layers.DenseLayer),
        ('dropout16', layers.DropoutLayer),
        ('output', layers.DenseLayer),
    ],

    input_shape = (None, 1, 32, 32),
    conv1_num_filters = 128, conv1_filter_size = (3, 3),
    conv2_num_filters = 128, conv2_filter_size = (3, 3),
    pool3_pool_size = (2, 2),
    dropout4_p = 0,
    conv5_num_filters = 256, conv5_filter_size = (3, 3),
    conv6_num_filters = 256, conv6_filter_size = (3, 3),
    pool7_pool_size = (2, 2),
    dropout8_p = 0.2,
    conv9_num_filters = 512, conv9_filter_size = (3, 3),
    conv10_num_filters = 512, conv10_filter_size = (3, 3),
    dropout12_p = 0.2,
    hidden13_num_units = 1024,
    dropout14_p = 0.5,
    hidden15_num_units = 1024,
    dropout16_p = 0.5,
    output_num_units = 62, output_nonlinearity = softmax,

    batch_iterator_train = TransIterator(batch_size = 2500),
    batch_iterator_test = BatchIterator(batch_size = 2500),

    update = updates.adam,

    use_label_encoder = True,
    regression = False,
    max_epochs = 300,
    verbose = 1,
)

In [8]:
# train nn
#net.load_params_from(os.path.join(model_root, 'recog_for_icdar.pkl')); # or load a pretrained model!
net.fit(data_train_x, data_train_y);

# Neural Network with 6212542 learnable parameters

## Layer information

  #  name       size
---  ---------  ---------
  0  input      1x32x32
  1  conv1      128x30x30
  2  conv2      128x28x28
  3  pool3      128x14x14
  4  dropout4   128x14x14
  5  conv5      256x12x12
  6  conv6      256x10x10
  7  pool7      256x5x5
  8  dropout8   256x5x5
  9  conv9      512x3x3
 10  conv10     512x1x1
 11  dropout12  512x1x1
 12  hidden13   1024
 13  dropout14  1024
 14  hidden15   1024
 15  dropout16  1024
 16  output     62

  epoch    train loss    valid loss    train/val    valid acc  dur
-------  ------------  ------------  -----------  -----------  ------
      1       [36m4.04503[0m       [32m7.41551[0m      0.54548      0.05297  11.51s
      2       7.08244       [32m4.07214[0m      1.73924      0.04414  11.31s
      3       4.07716       4.11162      0.99162      0.03852  11.44s
      4       4.10915       4.10749      1.00041      0.03451  12.05s
      5       4.10416       4.0



In [9]:
pred = net.predict(data_test_x)
print accuracy_score(data_test_y, pred)

0.793084216397


In [10]:
print classification_report(data_test_y, pred)

             precision    recall  f1-score   support

          0       0.67      0.04      0.08        46
          1       0.74      0.63      0.68        46
          2       0.87      0.96      0.91        49
          3       0.86      0.71      0.77        17
          4       0.90      0.75      0.82        24
          5       0.79      0.38      0.51        29
          6       1.00      0.60      0.75        15
          7       0.56      0.50      0.53        10
          8       0.57      0.67      0.62         6
          9       0.71      0.33      0.45        15
          A       0.95      0.86      0.90       223
          B       0.76      0.81      0.78        47
          C       0.89      0.75      0.81       153
          D       0.74      0.85      0.79        74
          E       0.87      0.90      0.89       322
          F       0.90      0.82      0.86        76
          G       0.84      0.90      0.87        63
          H       0.91      0.89      0.90   

  'precision', 'predicted', average, warn_for)


In [11]:
net.save_params_to(os.path.join(model_root, 'recog_for_icdar.pkl'))