In [1]:
import re
import numpy as np

from itertools import chain
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot

Using TensorFlow backend.


# Build mapping table

In [None]:
# Code-point ranges whose characters each get their own vocabulary entry.
complete_kor_range = range(0xac00, 0xd7a4)  # precomposed Hangul syllables U+AC00..U+D7A3
complete_eng_range = range(0x0041, 0x007b)  # 'A'..'z', which also spans the six symbols between 'Z' and 'a'

numbers = '0123456789'
tokens = ['<unk>', '<e>']
pad_token = '<pad>'

# Build mapping table between chr & idx.
# Index 0 is reserved for padding: the sequences are later padded with
# pad_sequences, whose fill value is 0, so no real character may share that
# index (previously the digit '0' did).
idx_to_chr = [pad_token]
idx_to_chr.extend(numbers)
idx_to_chr.extend(tokens)
idx_to_chr.extend(chr(code) for code in chain(complete_kor_range, complete_eng_range))

# Reverse lookup: symbol -> position in idx_to_chr.
chr_to_idx = {symbol: index for index, symbol in enumerate(idx_to_chr)}

# Preprocess data

In [None]:
# Accumulators: one encoded character sequence / one label sequence per text line.
X_train, y_train = [], []

# Length every sequence is padded or truncated to.
max_len = 200


def encode(c):
    """Map a single character to its integer vocabulary index.

    Args:
        c: The character to look up in ``chr_to_idx``.

    Returns:
        The index stored for ``c``, or the index of the ``'<unk>'`` token
        when ``c`` has no entry in the table.

    Raises:
        TypeError: If ``c`` is unhashable (no longer silently mapped to
            ``'<unk>'``).
    """
    try:
        return chr_to_idx[c]
    except KeyError:
        # Only a missing entry falls back to <unk>; any other error is a
        # programming mistake and must surface instead of being swallowed.
        return chr_to_idx['<unk>']


def get_label(s):
    """Build the spacing labels for a sentence.

    The string is walked from index 0. For each visited character a label is
    emitted: 1 if the next character is a space ``' '`` (that space is then
    skipped), otherwise 0. A final 0 is always appended once the end of the
    string is reached, so an empty string yields ``[0]``.

    For input whose words are separated by single spaces and that has no
    leading/trailing space, the result has exactly one label per non-space
    character.

    Args:
        s: The sentence to label.

    Returns:
        A list of 0/1 integers.

    Raises:
        TypeError: If ``s`` has no length (e.g. ``None``); such input is no
            longer silently labelled as ``[0]``.
    """
    label = []
    length = len(s)
    i = 0
    # Explicit bounds check instead of catching the IndexError raised by
    # s[i + 1]; a blanket except used to hide unrelated errors as well.
    while i + 1 < length:
        if s[i + 1] == ' ':
            label.append(1)
            i += 2  # jump over the space that was just accounted for
        else:
            label.append(0)
            i += 1

    # Nothing follows the last position, so it is labelled 0.
    label.append(0)
    return label


regex = re.compile(r'\s+')
with open('test.txt', 'r', encoding='UTF-8-SIG') as f:
    for line in f:
        # Collapse every whitespace run to one space, then trim and lowercase.
        line = regex.sub(' ', line).strip().lower()
        if not line:
            continue  # nothing left on this line

        # Labels are derived while the single spaces are still present;
        # the characters are encoded after all whitespace has been removed.
        spacing = get_label(line)
        chars = [encode(c) for c in regex.sub('', line)]

        X_train.append(chars)
        y_train.append(spacing)

# Pad or cut each sample at its end so every row has exactly max_len entries.
X_train = pad_sequences(X_train, maxlen=max_len, padding='post', truncating='post')
y_train = pad_sequences(y_train, maxlen=max_len, padding='post', truncating='post')

print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)

# Define model

In [None]:
from keras import layers
from keras import optimizers
from keras import models

In [None]:
def conv_block(x, filter_nums, filter_sizes):
    """Apply several parallel 1-D convolutions to the same input.

    One ``Conv1D`` layer (stride 1, ``'same'`` padding, ReLU activation) is
    created per ``(filter count, kernel size)`` pair and applied to ``x``.
    Pairs are formed with ``zip``, so surplus entries of the longer sequence
    are ignored.

    Args:
        x: Input passed to every convolution.
        filter_nums: Number of filters for each convolution.
        filter_sizes: Kernel size for each convolution.

    Returns:
        A list with the output of each convolution, in pair order.
    """
    return [
        layers.Conv1D(filters=n_filters,
                      kernel_size=width,
                      padding='same',
                      activation='relu',
                      strides=1)(x)
        for n_filters, width in zip(filter_nums, filter_sizes)
    ]

In [None]:
# Input: one row of max_len character indices; each index becomes a 100-dim embedding.
inputs = layers.Input(shape=(max_len,))
x = layers.Embedding(len(idx_to_chr), 100, input_length=max_len)(inputs)

# Five parallel convolutions with different kernel widths, concatenated
# along the feature axis.
blocks = conv_block(x,
                    filter_nums=(256, 256, 256, 256, 256),
                    filter_sizes=(4, 6, 8, 10, 12))
x = layers.Concatenate()(blocks)

# NOTE(review): this statement was `layers.(layers.LSTM(...)(x)` -- the wrapper
# name was missing and the parentheses were unbalanced, so the cell could not
# be parsed. Bidirectional is assumed to be the intended wrapper; confirm.
x = layers.Bidirectional(layers.LSTM(100,
                                     dropout=0.3,
                                     recurrent_dropout=0.3,
                                     return_sequences=True))(x)
x = layers.LSTM(50,
                dropout=0.1,
                recurrent_dropout=0.1,
                return_sequences=True)(x)

# Per-timestep classifier ending in a single sigmoid unit per position.
x = layers.TimeDistributed(layers.Dense(300, activation='relu'))(x)
x = layers.Dropout(0.3)(x)
x = layers.TimeDistributed(layers.Dense(150, activation='relu'))(x)
x = layers.TimeDistributed(layers.Dense(1, activation='sigmoid'))(x)
# Drop the trailing unit axis so the output lines up with the padded labels;
# tied to max_len instead of a hard-coded 200.
x = layers.Reshape((max_len,))(x)

model = models.Model(inputs, x)
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

# Train model

In [None]:
# 30 epochs in mini-batches of 30, with 10% of the samples held out for validation.
model.fit(
    X_train,
    y_train,
    batch_size=30,
    epochs=30,
    validation_split=0.1,
)