In [54]:
import pandas as pd
import unicodedata
import re
import random, io
from MeCab import Tagger
import numpy as np
random.seed(1)
np.random.seed(1)

In [76]:
from sklearn.preprocessing import OneHotEncoder
import keras
from keras import optimizers
from keras import backend as K
from keras import regularizers
from keras import initializers, constraints, layers, callbacks
from keras.models import Sequential
from keras.models import Model, load_model
from keras.layers import Dense, Activation, Dropout, Flatten
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.layers import Input, BatchNormalization, LSTM, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, Add
from keras.layers import GlobalAveragePooling1D, concatenate, SpatialDropout1D
from keras.utils import plot_model, to_categorical
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import Callback
from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.engine import InputSpec, Layer
from keras.optimizers import Adam

In [10]:
def read_excel(excel_file):
    sentences_sum = []
    sentences_tax = []
    sentences_total = []

    df = pd.read_excel(excel_file, sheet_name=0, keep_default_na=False)
    for row in range(0, df.shape[0]):

        # read and normalize unicode
        content_of_sum =  unicodedata.normalize('NFKC', df.loc[row, "sum detail of account "].lower())
        content_of_tax = unicodedata.normalize('NFKC', df.loc[row, "tax"].lower())
        content_of_total = unicodedata.normalize('NFKC', df.loc[row, "total amount "].lower())

        # replace space, ()〈〉 and split string
        sentences_sum.extend(re.sub('[ ]', '', content_of_sum).split('\n'))
        sentences_tax.extend(re.sub('[ ]', '', content_of_tax).split('\n'))
        sentences_total.extend(re.sub('[ ]', '', content_of_total).split('\n'))

    # Remove null
    sentences_sum = [phrase for phrase in sentences_sum if phrase]
    sentences_tax = [phrase for phrase in sentences_tax if phrase]
    sentences_total = [phrase for phrase in sentences_total if phrase]

    # get unique
    sentences_sum = list(set(sentences_sum))
    sentences_tax = list(set(sentences_tax))
    sentences_total = list(set(sentences_total))

    data = []

    data.extend([phrase, '1'] for phrase in sentences_sum)
    data.extend([phrase, '2'] for phrase in sentences_tax)
    data.extend([phrase, '3'] for phrase in sentences_total)
    return np.array(data)

In [11]:
class Tokenizer(Tagger):
    word_index = {}
    max_len = 0

    def __init__(self):
        Tagger.__init__(self)

    def tokenize(self, sequence):
        '''Tokenize a string using mecab'''
        sequence = sequence.replace(' ', '')
        sequence = ''.join([unicodedata.normalize('NFKC', char) for char in sequence])
        sequence = self.parse(sequence).splitlines()
        sequence = [line.split('\t')[0] for line in sequence]
        sequence = [token for token in sequence if token != 'EOS']
        sequence = [list(token) if token.isnumeric() else [token] for token in sequence]
        sequence = [token for sublist in sequence for token in sublist]
        return sequence

    def fit_on_texts(self, sentences):
        all_tokens = []
        for sentence in sentences:
            tokens = self.tokenize(sentence)
            all_tokens.extend(tokens)
            if len(tokens) > self.max_len:
                self.max_len = len(tokens)
        unique_tokens = np.unique(all_tokens)

        for i, token in enumerate(unique_tokens):
            self.word_index[token] = i + 1

    def texts_to_sequences(self, sentences):
        all_tokens = []
        for sentence in sentences:
            tokens = self.tokenize(sentence)
            tokenized = []
            for token in tokens:
                tokenized.append(self.word_index[token])
            all_tokens.append(tokenized)
        return all_tokens

In [20]:
def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.asarray(tokens[1:], dtype='float32')
    return data

In [39]:
def build_embedding_matrix(embedding_index, word_index, max_features, embed_size=300):
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.zeros((nb_words + 1, embed_size))
    for word, i in word_index.items():
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix

In [14]:
def create_data_for_other_class(list_words, max_len, number_sample, class_name):
    data = []
    list_words = np.array(list_words)
    for i in range(0, number_sample):
        random_size = random.randint(1, max_len)
        indices = np.random.randint(len(list_words), size=random_size)
        tokens = list_words[indices]
        sentences = ''.join(tokens)
        data.append([sentences, class_name])
    return np.array(data)

In [15]:
excel_file = '/Users/anh/Downloads/listkey.xlsx'
fasttext_file = '/Users/anh/Downloads/cc.ja.300.vec'

**Read data and preprocess**

In [16]:
excel_data = read_excel(excel_file)

**Load fasttext vector**

In [21]:
vector = load_vectors(fasttext_file)
list_words = list(vector.keys())

**Create data for class 4**

In [23]:
max_len = 20
num_sample = 50
data_other_class = create_data_for_other_class(list_words, max_len, num_sample, '4')

**Combine data**

In [30]:
data = np.concatenate((excel_data, data_other_class), axis=0)
X_data = data[:, :-1]
y = data[:, -1:].flatten()

**Tokenize**

In [38]:
sentences = X_data.flatten()
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenized = tokenizer.texts_to_sequences(sentences)
X_train = pad_sequences(tokenized, maxlen=max_len)

**Create embedding matrix**

In [40]:
embed_size = 300
word_index = tokenizer.word_index
max_features = len(word_index)
embedding_matrix = build_embedding_matrix(vector, word_index, max_features, embed_size)

**Convert y to one hot vector**

In [49]:
ohe = OneHotEncoder(sparse=False)
y_ohe = ohe.fit_transform(y.reshape(-1, 1))

**Build model 1**

In [87]:
# define callbacks
file_path = "Model1.hdf5"
check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                              save_best_only=True, mode="min")
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)
callbacks_list = [check_point, early_stop]

# train
model = Sequential()
model.add(Embedding(max_features+1, embed_size,
                    weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Conv1D(64, 3, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dense(4, activation='sigmoid'))
adam = optimizers.Adam(lr=0.001, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

# model training
history = model.fit(X_train, y_ohe, batch_size=32, epochs=100, callbacks=callbacks_list,
                    validation_split=0.1, shuffle=True, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_19 (Embedding)     (None, 20, 300)           153000    
_________________________________________________________________
conv1d_36 (Conv1D)           (None, 20, 64)            57664     
_________________________________________________________________
global_max_pooling1d_24 (Glo (None, 64)                0         
_________________________________________________________________
dense_34 (Dense)             (None, 4)                 260       
Total params: 210,924
Trainable params: 57,924
Non-trainable params: 153,000
_________________________________________________________________
Train on 185 samples, validate on 21 samples
Epoch 1/100
 32/185 [====>.........................] - ETA: 6s - loss: 0.7478 - acc: 0.3438Epoch 00000: val_loss improved from inf to 0.63883, saving model to Model1.hdf5
Epoch 2/100
 32/185 [====>.....................

Epoch 27/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0720 - acc: 1.0000Epoch 00026: val_loss improved from 0.20013 to 0.19503, saving model to Model1.hdf5
Epoch 28/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0784 - acc: 0.9844Epoch 00027: val_loss improved from 0.19503 to 0.19029, saving model to Model1.hdf5
Epoch 29/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.1007 - acc: 0.9844Epoch 00028: val_loss improved from 0.19029 to 0.18600, saving model to Model1.hdf5
Epoch 30/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0533 - acc: 0.9844Epoch 00029: val_loss improved from 0.18600 to 0.18146, saving model to Model1.hdf5
Epoch 31/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0852 - acc: 0.9766Epoch 00030: val_loss improved from 0.18146 to 0.17739, saving model to Model1.hdf5
Epoch 32/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.1087 - acc: 0.9844Epoch 00031: val_loss improv

Epoch 55/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0734 - acc: 0.9688Epoch 00054: val_loss improved from 0.10653 to 0.10459, saving model to Model1.hdf5
Epoch 56/100
Epoch 57/100
Epoch 58/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0178 - acc: 1.0000Epoch 00057: val_loss improved from 0.10046 to 0.09858, saving model to Model1.hdf5
Epoch 59/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0883 - acc: 0.9531Epoch 00058: val_loss improved from 0.09858 to 0.09677, saving model to Model1.hdf5
Epoch 60/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0433 - acc: 0.9844Epoch 00059: val_loss improved from 0.09677 to 0.09494, saving model to Model1.hdf5
Epoch 61/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0815 - acc: 0.9688Epoch 00060: val_loss improved from 0.09494 to 0.09306, saving model to Model1.hdf5
Epoch 62/100
Epoch 63/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0464 - 

 32/185 [====>.........................] - ETA: 0s - loss: 0.0346 - acc: 0.9844Epoch 00082: val_loss improved from 0.06583 to 0.06490, saving model to Model1.hdf5
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0713 - acc: 0.9688Epoch 00086: val_loss improved from 0.06209 to 0.06115, saving model to Model1.hdf5
Epoch 88/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0270 - acc: 0.9844Epoch 00087: val_loss improved from 0.06115 to 0.06037, saving model to Model1.hdf5
Epoch 89/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0174 - acc: 1.0000Epoch 00088: val_loss improved from 0.06037 to 0.05946, saving model to Model1.hdf5
Epoch 90/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0233 - acc: 0.9844Epoch 00089: val_loss improved from 0.05946 to 0.05868, saving model to Model1.hdf5
Epoch 91/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0748 - acc: 0.9531Ep

**Build model 2**

In [65]:
# define callbacks
file_path = "Model2.hdf5"
check_point = ModelCheckpoint(file_path, monitor="val_loss", verbose=1,
                              save_best_only=True, mode="min")
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=3)
callbacks_list = [check_point, early_stop]

# train
model = Sequential()
model.add(Embedding(max_features+1, embed_size,
                    weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Conv1D(64, 3, activation='relu', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(64, 3, activation='relu', padding='same'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(1e-4)))
model.add(Dense(4, activation='sigmoid'))  # multi-label (k-hot encoding)

adam = optimizers.Adam(lr=0.001, decay=0.0)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

# model training
history = model.fit(X_train, y_ohe, batch_size=32, epochs=100, callbacks=callbacks_list,
                    validation_split=0.1, shuffle=True, verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 20, 300)           153000    
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 20, 64)            57664     
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 10, 64)            0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 10, 64)            12352     
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 64)                0         
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)                0         
_________________________________________________________________
dense_23 (Dense)             (None, 32)                2080      
__________

Epoch 25/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.1074 - acc: 0.9688Epoch 00024: val_loss improved from 0.14654 to 0.13477, saving model to Model2.hdf5
Epoch 26/100
Epoch 27/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0558 - acc: 0.9922Epoch 00026: val_loss improved from 0.12422 to 0.11196, saving model to Model2.hdf5
Epoch 28/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0707 - acc: 0.9844Epoch 00027: val_loss improved from 0.11196 to 0.10761, saving model to Model2.hdf5
Epoch 29/100
Epoch 30/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0910 - acc: 0.9609Epoch 00029: val_loss improved from 0.10198 to 0.09894, saving model to Model2.hdf5
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.1146 - acc: 0.9453Epoch 00036: val_loss improved from 0.07957 to 0.07095, saving model to Model2.hdf5
Epoch 38/100
 32

Epoch 54/100
Epoch 55/100
Epoch 56/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0213 - acc: 1.0000Epoch 00055: val_loss did not improve
Epoch 57/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0222 - acc: 1.0000Epoch 00056: val_loss did not improve
Epoch 58/100
 32/185 [====>.........................] - ETA: 0s - loss: 0.0639 - acc: 0.9688Epoch 00057: val_loss did not improve


**Build model 3**

In [92]:
file_path = "Model3.hdf5"
check_point = ModelCheckpoint(file_path, monitor = "val_loss", verbose = 1,
                              save_best_only = True, mode = "min")
early_stop = EarlyStopping(monitor = "val_loss", mode = "min", patience = 3)

def build_model1(lr=0.0, lr_d=0.0, units=0, spatial_dr=0.0, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32):
    inp = Input(shape = (max_len,))
    x = Embedding(max_features+1, embed_size, weights = [embedding_matrix], trainable = False)(inp)
    x1 = SpatialDropout1D(spatial_dr)(x)

    x_gru = Bidirectional(GRU(units, return_sequences = True))(x1)
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool1_gru = GlobalAveragePooling1D()(x1)
    max_pool1_gru = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_gru)
    avg_pool3_gru = GlobalAveragePooling1D()(x3)
    max_pool3_gru = GlobalMaxPooling1D()(x3)
    
    x_lstm = Bidirectional(LSTM(units, return_sequences = True))(x1)
    x1 = Conv1D(conv_size, kernel_size=kernel_size1, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool1_lstm = GlobalAveragePooling1D()(x1)
    max_pool1_lstm = GlobalMaxPooling1D()(x1)
    
    x3 = Conv1D(conv_size, kernel_size=kernel_size2, padding='valid', kernel_initializer='he_uniform')(x_lstm)
    avg_pool3_lstm = GlobalAveragePooling1D()(x3)
    max_pool3_lstm = GlobalMaxPooling1D()(x3)
    
    
    x = concatenate([avg_pool1_gru, max_pool1_gru, avg_pool3_gru, max_pool3_gru,
                    avg_pool1_lstm, max_pool1_lstm, avg_pool3_lstm, max_pool3_lstm])
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(dense_units, activation='relu') (x))
    x = BatchNormalization()(x)
    x = Dropout(dr)(Dense(int(dense_units / 2), activation='relu') (x))
    x = Dense(4, activation = "sigmoid")(x)
    model = Model(inputs = inp, outputs = x)
    model.summary()
    model.compile(loss = "binary_crossentropy", optimizer = Adam(lr = lr, decay = lr_d), metrics = ["accuracy"])
    history = model.fit(X_train, y_ohe, batch_size = 32, epochs = 100, validation_split=0.1, 
                        verbose = 1, callbacks = [check_point, early_stop])

In [93]:
model1 = build_model1(lr = 1e-3, lr_d = 0, units = 64, spatial_dr = 0.5, kernel_size1=3, kernel_size2=2, dense_units=128, dr=0.1, conv_size=32)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_4 (InputLayer)             (None, 20)            0                                            
____________________________________________________________________________________________________
embedding_20 (Embedding)         (None, 20, 300)       153000      input_4[0][0]                    
____________________________________________________________________________________________________
spatial_dropout1d_4 (SpatialDrop (None, 20, 300)       0           embedding_20[0][0]               
____________________________________________________________________________________________________
bidirectional_5 (Bidirectional)  (None, 20, 128)       140160      spatial_dropout1d_4[0][0]        
___________________________________________________________________________________________

Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100


In [100]:
model1_file_path = 'Model1.hdf5'
model1 = load_model(model1_file_path)
score1 = model1.evaluate(X_train, y_ohe, verbose = 1)
print(score1)

 32/206 [===>..........................] - ETA: 8s[0.037539691130802466, 0.98300970873786409]


In [102]:
model2_file_path = 'Model2.hdf5'
model2 = load_model(model2_file_path)
score = model2.evaluate(X_train, y_ohe, verbose = 1)
print(score2)

 32/206 [===>..........................] - ETA: 9s[0.038978623533711849, 0.98300970873786409]


In [101]:
model3_file_path = 'Model3.hdf5'
model3 = load_model(model3_file_path)
score3 = model3.evaluate(X_train, y_ohe, verbose = 1)
print(score3)


