# Chinese Name OCR based on CRNN

In [1]:
from keras.layers.convolutional import Conv2D,MaxPooling2D,ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Reshape,Masking,Lambda,Permute
from keras.layers import Input,Dense,Flatten
from keras.preprocessing.sequence import pad_sequences
from keras.layers.recurrent import GRU,LSTM
from keras.layers.wrappers import Bidirectional
from keras.models import Model
from keras import backend as K
from keras.preprocessing import image
from keras.optimizers import Adam,SGD,Adadelta
from keras import losses
from keras.layers.wrappers import TimeDistributed
from keras.callbacks import EarlyStopping,ModelCheckpoint,TensorBoard
from keras.utils import plot_model
from matplotlib import pyplot as plt

import numpy as np 
import os
from PIL import Image,ImageDraw,ImageFont 
import json
import threading
import pandas as pd
from opencc import OpenCC 

import tensorflow as tf  
import keras.backend.tensorflow_backend as K  


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
name_corpus = pd.read_csv("./input/chinese_names_big5.csv")

In [3]:
name_corpus.columns=['name_sim_chi','len','name_tra_chi']
name_corpus.to_csv("./input/chinese_names_big5.csv",index=False)
name_corpus.drop_duplicates(inplace=True)

In total, there are 1,203,132 Chinese names with maximum name length of 3. There are 2,361 distinct characters.

In [4]:
print("total name is {}".format(len(name_corpus)))

print("possible name length is ")
name_corpus['len'].unique()

total name is 1203132
possible name length is 


array([2, 3])

Interesting to see the distribution of Chinese surname

In [5]:
name_set=list(set(name_corpus['name_tra_chi']))

char_list=[]

for name in name_set:
    char_list.extend(list(name))
    
char_to_id = {j:i for i,j in enumerate(char_list)}
id_to_char = {i:j for i,j in enumerate(char_list)}

char_df = pd.DataFrame(data=char_list)
char_df.columns=['char']
char_df['count']=1
char_stat = char_df.groupby('char').sum().sort_values(by='count',ascending=False)
char_stat

Unnamed: 0_level_0,count
char,Unnamed: 1_level_1
王,55563
李,53945
張,51714
陳,45452
劉,43730
文,36625
林,34617
華,31981
楊,30116
明,30015


In [6]:
## initialize gloabl variables

maxlabellength = 3
img_h = 32
img_w = 248
nclass = len(char_stat)
rnnunit=256
batch_size =64

Copy MingLiu font from Windows/fonts to Ubuntu /usr/share/fonts and update system font cache

- sudo mkfontscale (if package missing need to do sudo apt-get installttf-mscorefonts-installer)
- sudo mkfontdir (if package missing need to do sudo apt-get install fontconfig)
- sudo fc-cache -fv( refresh system font cache)

In [7]:
font=ImageFont.truetype('/usr/share/fonts/truetype/windows/mingliu0.ttf',24) 

Generate training image for Chinese name in MingLiu font
 - training image by random generation
 - validation image by random generation

In [8]:
def generate_image_sample(n=100,image_path='./train/train_',label_path = "./train/train_label.csv"):
    sample_name=name_corpus.sample(n)
    for index, row in sample_name.iterrows():

        img = img = Image.new('L',(img_w,img_h),(255))
        draw = ImageDraw.Draw(img)  
        name = row['name_tra_chi']
        label = ""
        for chr in name:
            label = label + chr +" "
        draw.text((0,5),label.strip() ,fill=(0),font=font)  
        img.save(image_path+str(index)+'.png')

    sample_name.reset_index().to_csv(label_path,index=False)  
    return sample_name

In [11]:
train_image = generate_image_sample(1000,'./train/train_','train/train_label.csv')
y_train = train_image['name_tra_chi']

In [12]:
validate_image = generate_image_sample(100,'./validate/valid_','validate/valid_label.csv')
y_valid = validate_image['name_tra_chi']

design of the deep learning model: VGG + Bidirectional LSTM + CTC

In [13]:
input = Input(shape=(img_h,None,1),name='the_input')

m = Conv2D(64,kernel_size=(3,3),activation='relu',padding='same',name='conv1')(input)
m = MaxPooling2D(pool_size=(2,2),strides=(2,2),name='pool1')(m)
m = Conv2D(128,kernel_size=(3,3),activation='relu',padding='same',name='conv2')(m)
m = MaxPooling2D(pool_size=(2,2),strides=(2,2),name='pool2')(m)
m = Conv2D(256,kernel_size=(3,3),activation='relu',padding='same',name='conv3')(m)
m = BatchNormalization(axis=3)(m)
m = Conv2D(256,kernel_size=(3,3),activation='relu',padding='same',name='conv4')(m)

m = ZeroPadding2D(padding=(0,1))(m)
m = MaxPooling2D(pool_size=(2,2),strides=(2,1),padding='valid',name='pool3')(m)

m = Conv2D(512,kernel_size=(3,3),activation='relu',padding='same',name='conv5')(m)
m = BatchNormalization(axis=3)(m)
m = Conv2D(512,kernel_size=(3,3),activation='relu',padding='same',name='conv6')(m)

m = ZeroPadding2D(padding=(0,1))(m)
m = MaxPooling2D(pool_size=(2,2),strides=(2,1),padding='valid',name='pool4')(m)
m = Conv2D(512,kernel_size=(2,2),activation='relu',padding='valid',name='conv7')(m)

m = BatchNormalization(axis=3)(m)
m = Permute((2,1,3),name='permute')(m)
m = TimeDistributed(Flatten(),name='timedistrib')(m)

m = Bidirectional(GRU(rnnunit,return_sequences=True,implementation=2),name='blstm1')(m)
#m = Bidirectional(LSTM(rnnunit,return_sequences=True),name='blstm1')(m)
m = Dense(rnnunit,name='blstm1_out',activation='linear',)(m)
#m = Bidirectional(LSTM(rnnunit,return_sequences=True),name='blstm2')(m)
m = Bidirectional(GRU(rnnunit,return_sequences=True,implementation=2),name='blstm2')(m)
y_pred = Dense(nclass,name='blstm2_out',activation='softmax')(m)

basemodel = Model(inputs=input,outputs=y_pred)
basemodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, 32, 248, 1)        0         
_________________________________________________________________
conv1 (Conv2D)               (None, 32, 248, 64)       640       
_________________________________________________________________
pool1 (MaxPooling2D)         (None, 16, 124, 64)       0         
_________________________________________________________________
conv2 (Conv2D)               (None, 16, 124, 128)      73856     
_________________________________________________________________
pool2 (MaxPooling2D)         (None, 8, 62, 128)        0         
_________________________________________________________________
conv3 (Conv2D)               (None, 8, 62, 256)        295168    
_________________________________________________________________
batch_normalization_1 (Batch (None, 8, 62, 256)        1024      
__________

In [14]:
def ctc_lambda_func(args):
    y_pred,labels,input_length,label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)

In [15]:
labels = Input(name='the_labels',shape=[maxlabellength],dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length]) 

model = Model(inputs=[input, labels, input_length, label_length], outputs=loss_out)

adadelta = Adadelta()
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer=adadelta,metrics=['accuracy'])
checkpoint = ModelCheckpoint(r'weights-{epoch:02d}.hdf5',save_weights_only=True)
earlystop = EarlyStopping(patience=10)
tensorboard = TensorBoard(r'crnn/logs',write_graph=True)

In [None]:
def image_name(index):
    
    return "./train/train_"+str(index)+".png"

def generate_image_from_file(path,batch_size=64,maxlabellength=3):
    
   
    images = pd.read_csv(path)
    images['file_name']=images['index'].apply(image_name)
    
    #print(images.head())
    
    x = np.zeros((batch_size, img_h, img_w, 1), dtype=np.float)
    labels = np.ones([batch_size,maxlabellength])
    input_length = np.zeros([batch_size,1])
    label_length = np.zeros([batch_size,1])
    
    samples = images.sample(batch_size).reset_index()

   
    while 1:
       
        for i,row in samples.iterrows():
            img1 = Image.open(row['file_name'])
            img = np.array(img1,'f')/255.0-0.5
           
            x[i] = np.expand_dims(img,axis=2)
            name = row['name_tra_chi']
            label_length[i] = len(name)        
            input_length[i] = img_w//4+1
            labels[i,:len(name)] = [char_to_id[i] for i in name]
        

        inputs = {'the_input': x,
                 'the_labels': labels,
                 'input_length': input_length,
                 'label_length': label_length,
                }
        outputs = {'ctc': np.zeros([batch_size])} 
        yield (inputs,outputs)            


In [None]:
model.fit_generator(generate_image_from_file('./train/train_label.csv',batch_size=batch_size),\
                    steps_per_epoch=len(train_image), \
                    validation_data =generate_image_from_file('./validate/valid_label.csv',batch_size=batch_size) ,\
                    validation_steps = len(validate_image),\
                    epochs=10,\
                    verbose=1,\
                    callbacks =[earlystop,checkpoint,tensorboard])

Epoch 1/10
  63/1000 [>.............................] - ETA: 18265s - loss: 29.0915 - acc: 0.7138

In [None]:
model.save_weights('crnn_model_weights.h5')
model.to_json('model_structure.json')

In [None]:
#model.predict(x, batch_size=None, verbose=0, steps=None)