In [1]:
import urllib.request

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the raw dataset, relative to this notebook.
data_path = "../data/"
# The file is read with the latin1 codec.
df = pd.read_csv(data_path + "ner_dataset.csv", encoding="latin1")

# Number of rows, followed by a preview of the first five rows.
print(df.shape[0])
print(df.head())

1048575
    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


# Data Preprocessing

In [3]:
# Corpus overview: distinct sentence labels, distinct words and distinct tags.
print(df["Sentence #"].nunique(), df["Word"].nunique(), df["Tag"].nunique())
# Tag distribution: one row per tag with its number of occurrences.
print(df.groupby("Tag").size().reset_index(name="counts"))

47959 35177 17
      Tag  counts
0   B-art     402
1   B-eve     308
2   B-geo   37644
3   B-gpe   15870
4   B-nat     201
5   B-org   20143
6   B-per   16990
7   B-tim   20333
8   I-art     297
9   I-eve     253
10  I-geo    7414
11  I-gpe     198
12  I-nat      51
13  I-org   16784
14  I-per   17251
15  I-tim    6528
16      O  887908


In [4]:
# Only the first token of each sentence carries a "Sentence #" label; the
# following rows are NaN, so propagate the last seen value downwards.
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated in
# recent pandas releases and emits a FutureWarning.
# NOTE(review): this forward-fills every column, not only "Sentence #" -- a NaN
# in Word/POS/Tag would silently inherit the previous row's value; confirm
# that this is acceptable for this dataset.
df = df.ffill()
print(df.isnull().values.any())  # False once every gap has been filled
print(df.tail(10))

False
              Sentence #       Word  POS    Tag
1048565  Sentence: 47958     impact   NN      O
1048566  Sentence: 47958          .    .      O
1048567  Sentence: 47959     Indian   JJ  B-gpe
1048568  Sentence: 47959     forces  NNS      O
1048569  Sentence: 47959       said  VBD      O
1048570  Sentence: 47959       they  PRP      O
1048571  Sentence: 47959  responded  VBD      O
1048572  Sentence: 47959         to   TO      O
1048573  Sentence: 47959        the   DT      O
1048574  Sentence: 47959     attack   NN      O


  df = df.fillna(method="ffill") # fill NaN with previous value


In [5]:
# Lowercase every word so that differently-cased spellings share one entry.
df = df.assign(Word=df["Word"].str.lower())
print(df["Sentence #"].nunique(), df["Word"].nunique(), df["Tag"].nunique())

47959 31817 17


In [6]:
def func(temp):
    """Return the rows of one sentence as a list of ``(word, tag)`` tuples.

    ``temp`` is the sub-frame of a single "Sentence #" group; only its
    "Word" and "Tag" columns are read.
    """
    return list(zip(temp["Word"].tolist(), temp["Tag"].tolist()))


# Build one list of (word, tag) pairs per sentence.
# The needed columns are selected explicitly so that apply() does not operate
# on the grouping column (deprecated behaviour in newer pandas releases).
# Groups are returned in sorted order of the "Sentence #" label, not file order.
tagged_sentences = list(df.groupby("Sentence #")[["Word", "Tag"]].apply(func))

In [7]:
# Sanity check: show the (word, tag) pairs of the first grouped sentence.
print(tagged_sentences[0])

[('thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('london', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('british', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [8]:
# Separate every tagged sentence into two parallel lists:
# `sentences[k]` holds the words and `tags[k]` the matching tags.
sentences = [[word for word, _ in pairs] for pairs in tagged_sentences]
tags = [[label for _, label in pairs] for pairs in tagged_sentences]

# Preview the first sentence and its tags.
print(sentences[0])
print(tags[0])

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [9]:
# Token counts per sentence; used to choose a padding length below.
sentence_lengths = [len(s) for s in sentences]
max_sentence_len = max(sentence_lengths)
avg_sentence_len = sum(sentence_lengths) / len(sentence_lengths)
print("max sentence length: {}, average sentence length: {}".format(max_sentence_len, avg_sentence_len))

max sentence length: 104, average sentence length: 21.863987989741236


# Tokenizing

In [10]:
# --- word side ---------------------------------------------------------------
# Words outside the fitted vocabulary are mapped to the "OOV" token.
sen_tokenizer = Tokenizer(oov_token="OOV")
sen_tokenizer.fit_on_texts(sentences)
X_data = sen_tokenizer.texts_to_sequences(sentences)

word_index = sen_tokenizer.word_index
index_word = sen_tokenizer.index_word
# +1 leaves room for id 0, which is later used as the padding value.
vocab_size = len(word_index) + 1

# --- tag side ----------------------------------------------------------------
# lower=False keeps the original case of the tags (e.g. "B-geo", "O").
tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)
y_data = tag_tokenizer.texts_to_sequences(tags)
tag_size = len(tag_tokenizer.word_index) + 1

In [11]:
max_len = 70 # hyperparameter: every sentence is padded/truncated to this many tokens

# Pad with zeros at the end of each sentence.
# NOTE: `truncating` is left at the Keras default ("pre"), so sentences longer
# than max_len lose their leading tokens; words and tags are cut identically,
# which keeps them aligned.
X_data_pad = pad_sequences(X_data, padding="post", maxlen=max_len)
y_data_pad = pad_sequences(y_data, padding="post", maxlen=max_len)

X_train, X_test, y_train, y_test = train_test_split(X_data_pad, y_data_pad, test_size=0.2, random_state=777)

# Pass num_classes explicitly: without it to_categorical infers the width from
# the largest label present in each array, so the train and test encodings
# could end up with different widths (or a width other than tag_size) whenever
# a split happens not to contain the highest tag id.
y_train_encod = to_categorical(y_train, num_classes=tag_size)
y_test_encod = to_categorical(y_test, num_classes=tag_size)

print(X_train.shape, y_train_encod.shape)
print(X_test.shape, y_test_encod.shape)

(38367, 70) (38367, 70, 18)
(9592, 70) (9592, 70, 18)


# Tokenizing with Character-Embedding

In [12]:
max_char_len = 15 # hyperparameter: characters kept per word

# Distinct words of the corpus (a set drops the duplicates).
words = list(set(df["Word"].values))
# Every distinct character appearing in any word, in sorted order.
chars = sorted({char for word in words for char in word})

print(chars)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '\x85', '\x91', '\x92', '\x93', '\x94', '\x96', '\x97', '\xa0', '°', 'é', 'ë', 'ö', 'ü']


In [13]:
# Real characters are numbered from 2 upwards, in the sorted order of `chars`;
# ids 1 and 0 are reserved for the two special entries added below.
char_index = dict(zip(chars, range(2, len(chars) + 2)))
char_index["OOV"] = 1 # OOV: Out of Vocabulary
char_index["PAD"] = 0 # PAD: Padding

# Reverse lookup: id -> character (or "OOV" / "PAD").
index_char = {idx: symbol for symbol, idx in char_index.items()}

In [14]:
X_char = []
# Characters missing from char_index fall back to the reserved "OOV" id instead
# of raising KeyError (the entry was defined but never used before).
oov_char_id = char_index["OOV"]

# Per sentence: encode each word as character ids, then pad/truncate every
# word at its end to exactly max_char_len ids.
for sentence in sentences:
    char_indices = [[char_index.get(char, oov_char_id) for char in word] for word in sentence]
    char_indices_padded = pad_sequences(char_indices, maxlen=max_char_len, truncating="post", padding="post", value=0)
    X_char.append(char_indices_padded)

# Pad every sentence with all-zero words at the end up to max_len words,
# mirroring the word-level padding.
X_char = pad_sequences(X_char, maxlen=max_len, padding="post", value=0)

In [15]:
# Split the character tensor with the same test_size and random_state as the
# word-level split (X_train, X_test, y_train, y_test) so that the rows of
# X_char_* stay aligned with the rows of X_* / y_*.
X_char_train, X_char_test, _, _ = train_test_split(X_char, y_data_pad, random_state=777, test_size=0.2)

# Spot check on the first word of the first training sentence: the word looked
# up by id and the characters looked up by id should spell the same thing.
first_word_id = X_train[0][0]
print(X_train[0])
print(index_word[first_word_id])
print(' '.join(index_char[index] for index in X_char_train[0][0]))

[ 150  928  361   17 2624    9 4130 3566    9    8 2893 1250  880  107
    3    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
soldiers
s o l d i e r s PAD PAD PAD PAD PAD PAD PAD


In [17]:
# Shapes of the (train, test) halves for word ids, character ids and tag ids.
for train_part, test_part in ((X_train, X_test), (X_char_train, X_char_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

(38367, 70) (9592, 70)
(38367, 70, 15) (9592, 70, 15)
(38367, 70) (9592, 70)


# 1. BiLSTM + CNN

In [27]:
import tensorflow as tf
from keras.layers import Embedding, Input, TimeDistributed, Dropout, concatenate, Bidirectional, LSTM, Conv1D, Dense, MaxPooling1D, Flatten
from keras import Model
from keras.initializers import RandomUniform
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.optimizers import Adam
from seqeval.metrics import f1_score, classification_report
from keras_crf import CRFModel

In [20]:
# Directory prefix under which model checkpoints are written and read back.
model_path = "../model/"

## Modeling

In [19]:
# Hyperparameters of the BiLSTM + character-CNN tagger.
embedding_dim = 128  # size of each word embedding vector
char_embedding_dim = 64  # size of each character embedding vector
dropout_rate = 0.5  # rate shared by every Dropout layer below
hidden_units = 256  # LSTM units per direction
num_filters = 30 # number of Conv1D kernels
kernel_size = 3  # Conv1D kernel width

# Word branch: word ids -> word embeddings. The sentence length is left
# unspecified (None).
word_input = Input(shape=(None,), dtype="int32", name="word_input")
word_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="word_embedding")(word_input)

# Character branch: max_char_len character ids per word -> character
# embeddings, applied to every word through TimeDistributed.
char_input = Input(shape=(None, max_char_len,), dtype="int32", name="char_input")
char_embedding = TimeDistributed(Embedding(input_dim=len(char_index), output_dim=char_embedding_dim, name="char_embedding",embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)))(char_input) # weights initialised uniformly in [-0.5, 0.5]
char_dropout = Dropout(rate=dropout_rate)(char_embedding)

# Character CNN: convolve over the characters of each word, max-pool across
# all max_char_len positions and flatten, giving one feature vector per word.
char_cnn_embedding = TimeDistributed(Conv1D(filters=num_filters, kernel_size=kernel_size, padding="same", activation="tanh", strides=1))(char_dropout)
char_cnn_embedding = TimeDistributed(MaxPooling1D(max_char_len))(char_cnn_embedding)
char_cnn_embedding = TimeDistributed(Flatten())(char_cnn_embedding)
char_cnn_embedding = Dropout(rate=dropout_rate)(char_cnn_embedding)

# Concatenate the word embedding with the character-CNN features of each word,
# run a BiLSTM over the sentence and classify every token with a softmax.
output = concatenate([word_embedding, char_cnn_embedding])
output = Bidirectional(LSTM(units=hidden_units, return_sequences=True))(output)
output = Dropout(rate=dropout_rate)(output)
output = TimeDistributed(Dense(tag_size, activation="softmax"))(output)

model = Model(inputs=[word_input, char_input], outputs=[output])
model.summary()

# The targets are one-hot tag vectors, hence categorical cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer="nadam", metrics=["accuracy"])

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 char_input (InputLayer)     [(None, None, 15)]           0         []                            
                                                                                                  
 time_distributed (TimeDist  (None, None, 15, 64)         4736      ['char_input[0][0]']          
 ributed)                                                                                         
                                                                                                  
 dropout (Dropout)           (None, None, 15, 64)         0         ['time_distributed[0][0]']    
                                                                                                  
 time_distributed_1 (TimeDi  (None, None, 15, 30)         5790      ['dropout[0][0]']         

In [21]:
# callbacks
# Stop training once the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor="val_loss", patience=3, mode="min", verbose=1)
# Save the full model each time the validation accuracy reaches a new best.
mc = ModelCheckpoint(
    model_path + "ner_char_embedding_bilstm_cnn.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    verbose=1,
)

In [22]:
# Train on (word ids, character ids) against the one-hot tags; a validation
# split of 0.1 feeds the two callbacks.
history = model.fit(
    [X_train, X_char_train],
    y_train_encod,
    batch_size=128,
    epochs=15,
    validation_split=0.1,
    callbacks=[es, mc],
)

Epoch 1/15
Epoch 1: val_accuracy improved from -inf to 0.97804, saving model to ../model/ner_char_embedding_bilstm_cnn.h5
Epoch 2/15


  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.97804 to 0.98545, saving model to ../model/ner_char_embedding_bilstm_cnn.h5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.98545 to 0.98692, saving model to ../model/ner_char_embedding_bilstm_cnn.h5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.98692 to 0.98732, saving model to ../model/ner_char_embedding_bilstm_cnn.h5
Epoch 5/15
Epoch 5: val_accuracy did not improve from 0.98732
Epoch 6/15
Epoch 6: val_accuracy did not improve from 0.98732
Epoch 7/15
Epoch 7: val_accuracy improved from 0.98732 to 0.98736, saving model to ../model/ner_char_embedding_bilstm_cnn.h5
Epoch 7: early stopping


## Evaluating with F1-score

In [23]:
# Reload the best checkpoint written by ModelCheckpoint.
loaded_model = load_model(model_path + "ner_char_embedding_bilstm_cnn.h5")

i = 13 # index of the test sample to inspect
# Predict on that single sample (as a batch of one) with the reloaded
# checkpoint. Previously the checkpoint was loaded but the in-memory `model`
# was used for prediction, leaving `loaded_model` unused.
y_predicted = loaded_model.predict([np.array([X_test[i]]), np.array([X_char_test[i]])])

y_predicted = np.argmax(y_predicted, axis=-1) # probability vectors -> integer tag ids
labels = np.argmax(y_test_encod[i], -1) # one-hot gold tags -> integer tag ids

# Table header: word | actual tag | predicted tag.
print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print(35 * "-")

for word, tag, pred in zip(X_test[i], labels, y_predicted[0]):
    if word != 0: # skip PAD positions
        print("{:17}: {:7} {}".format(sen_tokenizer.index_word[word], tag_tokenizer.index_word[tag], tag_tokenizer.index_word[pred]))

단어             |실제값  |예측값
-----------------------------------
the              : O       O
statement        : O       O
came             : O       O
as               : O       O
u.n.             : B-org   B-org
secretary-general: I-org   I-org
kofi             : B-per   B-per
annan            : I-per   I-per
met              : O       O
with             : O       O
officials        : O       O
in               : O       O
amman            : B-geo   B-geo
to               : O       O
discuss          : O       O
wednesday        : B-tim   B-tim
's               : O       O
attacks          : O       O
.                : O       O


In [24]:
def sequences_to_tag(sequences, index_to_tag=None):
    """Convert per-token score vectors into tag strings.

    Args:
        sequences: iterable of sequences; each element of a sequence is a
            probability vector or one-hot vector over tag ids.
        index_to_tag: optional mapping from integer tag id to tag string.
            Defaults to ``tag_tokenizer.index_word``.

    Returns:
        A list with one list of tag strings per input sequence. Id 0 (the
        padding id) is reported as "O".
    """
    if index_to_tag is None:
        index_to_tag = tag_tokenizer.index_word

    result = []
    for sequence in sequences:
        word_sequence = []
        for pred in sequence:
            # Position of the largest entry, e.g. [0, 0, 1, 0, 0] -> 2.
            pred_index = int(np.argmax(pred))
            # Padding has no tag of its own; treat it as "outside".
            # (The former .replace("PAD", "O") was dead code: no tag in the
            # mapping contains "PAD".)
            word_sequence.append("O" if pred_index == 0 else index_to_tag[pred_index])
        result.append(word_sequence)
    return result

In [25]:
# Score the whole test set with the reloaded best checkpoint (`loaded_model`),
# not with the in-memory `model`, so the reported numbers belong to the
# checkpoint that was saved to disk.
y_predicted = loaded_model.predict([X_test, X_char_test])
pred_tags = sequences_to_tag(y_predicted)
test_tags = sequences_to_tag(y_test_encod)

# Entity-level metrics from seqeval.
print("F1-score: {:.1%}".format(f1_score(test_tags, pred_tags)))
print(classification_report(test_tags, pred_tags))

F1-score: 78.7%


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00        63
         eve       0.64      0.27      0.38        52
         geo       0.82      0.85      0.84      7620
         gpe       0.94      0.94      0.94      3145
         nat       0.58      0.19      0.29        37
         org       0.59      0.57      0.58      4033
         per       0.72      0.71      0.72      3545
         tim       0.86      0.84      0.85      4067

   micro avg       0.79      0.78      0.79     22562
   macro avg       0.64      0.55      0.57     22562
weighted avg       0.79      0.78      0.78     22562


# 2. BiLSTM + CNN + CRF

## Modeling

In [53]:
# Hyperparameters of the BiLSTM + character-CNN + CRF tagger.
embedding_dim = 128  # size of each word embedding vector
char_embedding_dim = 64  # size of each character embedding vector
dropout_rate = 0.5  # rate shared by every Dropout layer below
hidden_units = 256  # LSTM units per direction
num_filters = 30 # number of Conv1D kernels
kernel_size = 3  # Conv1D kernel width

# Word branch: word ids -> word embeddings.
word_input = Input(shape=(None,), dtype="int32", name="word_input")
word_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="word_embedding")(word_input)

# Character branch: max_char_len character ids per word -> character
# embeddings, applied to every word through TimeDistributed.
char_input = Input(shape=(None, max_char_len,), dtype="int32", name="char_input")
char_embedding = TimeDistributed(Embedding(input_dim=len(char_index), output_dim=char_embedding_dim, name="char_embedding",embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5)))(char_input) # weights initialised uniformly in [-0.5, 0.5]
char_dropout = Dropout(rate=dropout_rate)(char_embedding)

# Character CNN: convolve over the characters of each word, max-pool across
# all max_char_len positions and flatten, giving one feature vector per word.
char_cnn_embedding = TimeDistributed(Conv1D(filters=num_filters, kernel_size=kernel_size, padding="same", activation="tanh", strides=1))(char_dropout)
char_cnn_embedding = TimeDistributed(MaxPooling1D(max_char_len))(char_cnn_embedding)
char_cnn_embedding = TimeDistributed(Flatten())(char_cnn_embedding)
char_cnn_embedding = Dropout(rate=dropout_rate)(char_cnn_embedding)

# Concatenate word and character features, run a BiLSTM over the sentence and
# project every token to tag_size scores.
# NOTE(review): relu makes these per-tag scores non-negative before they reach
# the CRF; confirm this is intended rather than a linear projection.
output = concatenate([word_embedding, char_cnn_embedding])
output = Bidirectional(LSTM(units=hidden_units, return_sequences=True))(output)
output = Dropout(rate=dropout_rate)(output)
output = TimeDistributed(Dense(tag_size, activation="relu"))(output)

# Wrap the base network with keras_crf's CRFModel.
model = Model(inputs=[word_input, char_input], outputs=[output])
model = CRFModel(model, tag_size)
model.summary()

# No loss is passed to compile() here.
model.compile(optimizer=Adam(0.001), metrics=["accuracy"])

Model: "crf_model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 char_input (InputLayer)     [(None, None, 15)]           0         []                            
                                                                                                  
 time_distributed_10 (TimeD  (None, None, 15, 64)         4736      ['char_input[0][0]']          
 istributed)                                                                                      
                                                                                                  
 dropout_6 (Dropout)         (None, None, 15, 64)         0         ['time_distributed_10[0][0]'] 
                                                                                                  
 time_distributed_11 (TimeD  (None, None, 15, 30)         5790      ['dropout_6[0][0]'] 



In [56]:
# callbacks
# Stop training once the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor="val_loss", patience=3, mode="min", verbose=1)
# Save weights only, each time the validation decode accuracy reaches a new best.
mc = ModelCheckpoint(
    model_path + "ner_char_embedding_bilstm_cnn_crf.ckpt",
    monitor="val_decode_sequence_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [57]:
# The CRF model is trained on integer tag ids (y_train), not one-hot vectors.
history = model.fit(
    [X_train, X_char_train],
    y_train,
    batch_size=128,
    epochs=15,
    validation_split=0.1,
    callbacks=[es, mc],
)

Epoch 1/15
Epoch 1: val_decode_sequence_accuracy improved from -inf to 0.97300, saving model to ../model/ner_char_embedding_bilstm_cnn_crf.ckpt
Epoch 2/15
Epoch 2: val_decode_sequence_accuracy improved from 0.97300 to 0.98363, saving model to ../model/ner_char_embedding_bilstm_cnn_crf.ckpt
Epoch 3/15
Epoch 3: val_decode_sequence_accuracy improved from 0.98363 to 0.98534, saving model to ../model/ner_char_embedding_bilstm_cnn_crf.ckpt
Epoch 4/15
Epoch 4: val_decode_sequence_accuracy did not improve from 0.98534
Epoch 5/15
Epoch 5: val_decode_sequence_accuracy improved from 0.98534 to 0.98542, saving model to ../model/ner_char_embedding_bilstm_cnn_crf.ckpt
Epoch 6/15
Epoch 6: val_decode_sequence_accuracy did not improve from 0.98542
Epoch 7/15
Epoch 7: val_decode_sequence_accuracy improved from 0.98542 to 0.98560, saving model to ../model/ner_char_embedding_bilstm_cnn_crf.ckpt
Epoch 8/15
Epoch 8: val_decode_sequence_accuracy improved from 0.98560 to 0.98583, saving model to ../model/ner_

## Evaluating with F1-score

In [58]:
# Restore the best weights saved by ModelCheckpoint.
model.load_weights(model_path + "ner_char_embedding_bilstm_cnn_crf.ckpt")

i = 13 # index of the test sample to inspect
# Predict on that single sample (as a batch of one); element 0 of the CRF
# model's prediction output is kept as the predicted tag ids.
word_batch = np.array([X_test[i]])
char_batch = np.array([X_char_test[i]])
y_predicted = model.predict([word_batch, char_batch])[0]
labels = np.argmax(y_test_encod[i], -1) # one-hot gold tags -> integer tag ids

# Table header: word | actual tag | predicted tag.
print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print("-" * 35)

for word, tag, pred in zip(X_test[i], labels, y_predicted[0]):
    if word == 0: # skip PAD positions
        continue
    print("{:17}: {:7} {}".format(sen_tokenizer.index_word[word], tag_tokenizer.index_word[tag], tag_tokenizer.index_word[pred]))

단어             |실제값  |예측값
-----------------------------------
the              : O       O
statement        : O       O
came             : O       O
as               : O       O
u.n.             : B-org   B-org
secretary-general: I-org   I-org
kofi             : B-per   B-per
annan            : I-per   I-per
met              : O       O
with             : O       O
officials        : O       O
in               : O       O
amman            : B-geo   B-geo
to               : O       O
discuss          : O       O
wednesday        : B-tim   B-tim
's               : O       O
attacks          : O       O
.                : O       O


In [59]:
# Run the CRF model on the whole test set and keep element 0 of its output,
# which is treated as the predicted tag ids below.
crf_outputs = model.predict([X_test, X_char_test])
y_predicted = crf_outputs[0]
# Preview the first two predicted sequences.
print(y_predicted[:2])

[[ 1  3 10  1  2  1  1  1  1  1  1  1  1  1  1  1  1  1  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 1  1  1  1  1  1  3  1  1  1  1  1  1  1  2  9  9  1  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
   0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [60]:
def sequences_to_tag_for_crf(sequences, index_to_tag=None):
    """Convert sequences of integer tag ids into tag strings.

    Args:
        sequences: iterable of sequences of integer tag ids.
        index_to_tag: optional mapping from integer tag id to tag string.
            Defaults to ``tag_tokenizer.index_word``.

    Returns:
        A list with one list of tag strings per input sequence. Id 0 (the
        padding id) is reported as "O".
    """
    if index_to_tag is None:
        index_to_tag = tag_tokenizer.index_word

    # Padding has no tag of its own; treat it as "outside". (The former
    # .replace("PAD", "O") was dead code: no tag in the mapping contains "PAD".)
    return [
        ["O" if tag_id == 0 else index_to_tag[int(tag_id)] for tag_id in sequence]
        for sequence in sequences
    ]

pred_tags = sequences_to_tag_for_crf(y_predicted)
# The CRF pipeline works on integer tag ids, so the gold tags are decoded
# directly from y_test rather than from the one-hot y_test_encod (the previous
# code contradicted its own comment by still passing y_test_encod).
test_tags = sequences_to_tag_for_crf(y_test)

# Entity-level metrics from seqeval.
print("F1-score: {:.1%}".format(f1_score(test_tags, pred_tags)))
print(classification_report(test_tags, pred_tags))

F1-score: 79.9%
              precision    recall  f1-score   support

         art       0.00      0.00      0.00        63
         eve       0.48      0.27      0.35        52
         geo       0.83      0.86      0.84      7620
         gpe       0.96      0.93      0.94      3145
         nat       0.50      0.19      0.27        37
         org       0.68      0.55      0.61      4033
         per       0.76      0.69      0.72      3545
         tim       0.86      0.84      0.85      4067

   micro avg       0.82      0.78      0.80     22562
   macro avg       0.63      0.54      0.57     22562
weighted avg       0.81      0.78      0.79     22562


# 3. BiLSTM-BiLSTM + CRF

## Modeling

In [79]:
# Hyperparameters of the BiLSTM (characters) + BiLSTM (words) + CRF tagger.
embedding_dim = 128  # size of each word embedding vector
char_embedding_dim = 64  # size of each character embedding vector
dropout_rate = 0.3  # rate shared by both Dropout layers below
hidden_units = 256  # LSTM units per direction (both LSTMs)

# Word branch: word ids -> word embeddings; batch size and sentence length are
# left unspecified.
word_input = Input(batch_shape=(None,None), dtype="int32", name="word_input")
word_embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim, name="word_embedding")(word_input)

# Character branch: character ids -> character embeddings; all three input
# dimensions are left unspecified.
char_input = Input(batch_shape=(None, None, None), dtype="int32", name="char_input")
char_embedding = Embedding(input_dim=len(char_index), output_dim=char_embedding_dim, name="char_embedding",embeddings_initializer=RandomUniform(minval=-0.5, maxval=0.5))(char_input) # weights initialised uniformly in [-0.5, 0.5]
# Character-level BiLSTM applied to every word through TimeDistributed; with
# return_sequences=False it is many-to-one, i.e. the character vectors of a
# word are reduced to a single vector for that word.
word_embedding_char = TimeDistributed(Bidirectional(LSTM(units=hidden_units, return_sequences=False)))(char_embedding) # many-to-one

# Concatenate the word embedding with the character-derived word vector, run a
# BiLSTM over the sentence and project every token to tag_size scores.
# NOTE(review): relu makes these per-tag scores non-negative before they reach
# the CRF; confirm this is intended rather than a linear projection.
output = concatenate([word_embedding, word_embedding_char])
output = Dropout(rate=dropout_rate)(output)
output = Bidirectional(LSTM(units=hidden_units, return_sequences=True))(output)
output = Dropout(rate=dropout_rate)(output)
output = TimeDistributed(Dense(tag_size, activation="relu"))(output)

# Wrap the base network with keras_crf's CRFModel.
model = Model(inputs=[word_input, char_input], outputs=[output])
model = CRFModel(model, tag_size)
model.summary()

# No loss is passed to compile() here.
model.compile(optimizer=Adam(0.001), metrics=["accuracy"])

Model: "crf_model_5"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 char_input (InputLayer)     [(None, None, None)]         0         []                            
                                                                                                  
 word_input (InputLayer)     [(None, None)]               0         []                            
                                                                                                  
 char_embedding (Embedding)  (None, None, None, 64)       4736      ['char_input[0][0]']          
                                                                                                  
 word_embedding (Embedding)  (None, None, 128)            4072832   ['word_input[0][0]']          
                                                                                        



#### word_embedding_char = TimeDistributed(Bidirectional(LSTM(units=hidden_units, return_sequences=False)))(char_embedding)
 - many-to-one because return_sequences=False and TimeDistributed
 - many-to-one으로 하면 TimeDistributed(BiLSTM)의 출력 shape이 (batch_size, 문장 길이, 2 * hidden_units)가 되어서 (양방향이므로 hidden_units의 2배) word_embedding과 마지막 축 기준으로 concat할 수 있음
 - 하나의 단어 벡터를 의미한다. (문자 벡터 * 단어의 문자 갯수(batch) -> 단어 벡터)


In [73]:
# Stop training once the validation loss has not improved for 3 epochs.
es = EarlyStopping(monitor="val_loss", patience=3, mode="min", verbose=1)
# Save weights only, each time the validation decode accuracy reaches a new best.
mc = ModelCheckpoint(
    model_path + "ner_char_embedding_bilstm_crf.ckpt",
    monitor="val_decode_sequence_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [74]:
# The CRF model is trained on integer tag ids (y_train), not one-hot vectors.
history = model.fit(
    [X_train, X_char_train],
    y_train,
    batch_size=128,
    epochs=15,
    validation_split=0.1,
    callbacks=[es, mc],
)

Epoch 1/15
Epoch 1: val_decode_sequence_accuracy improved from -inf to 0.97605, saving model to ../model/ner_char_embedding_bilstm_crf.ckpt
Epoch 2/15
Epoch 2: val_decode_sequence_accuracy improved from 0.97605 to 0.98460, saving model to ../model/ner_char_embedding_bilstm_crf.ckpt
Epoch 3/15
Epoch 3: val_decode_sequence_accuracy improved from 0.98460 to 0.98662, saving model to ../model/ner_char_embedding_bilstm_crf.ckpt
Epoch 4/15
  4/270 [..............................] - ETA: 6:50 - decode_sequence_accuracy: 0.9894 - loss: 2.2092

KeyboardInterrupt: 

## Evaluating with F1-score

In [76]:
# Restore the best weights saved by ModelCheckpoint.
model.load_weights(model_path + "ner_char_embedding_bilstm_crf.ckpt")

i = 13 # index of the test sample to inspect
# Predict on that single sample (as a batch of one); element 0 of the CRF
# model's prediction output is kept as the predicted tag ids.
word_batch = np.array([X_test[i]])
char_batch = np.array([X_char_test[i]])
y_predicted = model.predict([word_batch, char_batch])[0]
labels = np.argmax(y_test_encod[i], -1) # one-hot gold tags -> integer tag ids

# Table header: word | actual tag | predicted tag.
print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print("-" * 35)

for word, tag, pred in zip(X_test[i], labels, y_predicted[0]):
    if word == 0: # skip PAD positions
        continue
    print("{:17}: {:7} {}".format(sen_tokenizer.index_word[word], tag_tokenizer.index_word[tag], tag_tokenizer.index_word[pred]))

단어             |실제값  |예측값
-----------------------------------
the              : O       O
statement        : O       O
came             : O       O
as               : O       O
u.n.             : B-org   B-org
secretary-general: I-org   I-org
kofi             : B-per   B-per
annan            : I-per   I-per
met              : O       O
with             : O       O
officials        : O       O
in               : O       O
amman            : B-geo   B-geo
to               : O       O
discuss          : O       O
wednesday        : B-tim   B-tim
's               : O       O
attacks          : O       O
.                : O       O


In [78]:
# Element 0 of the CRF model's prediction output is used as the predicted tag ids.
y_predicted = model.predict([X_test, X_char_test])[0]
pred_tags = sequences_to_tag_for_crf(y_predicted)
# The CRF pipeline works on integer tag ids, so the gold tags are decoded
# directly from y_test rather than from the one-hot y_test_encod (the previous
# code contradicted its own comment by still passing y_test_encod).
test_tags = sequences_to_tag_for_crf(y_test)

# Entity-level metrics from seqeval.
print("F1-score: {:.1%}".format(f1_score(test_tags, pred_tags)))
print(classification_report(test_tags, pred_tags))

F1-score: 80.0%


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00        63
         eve       1.00      0.04      0.07        52
         geo       0.80      0.89      0.84      7620
         gpe       0.94      0.94      0.94      3145
         nat       0.00      0.00      0.00        37
         org       0.70      0.49      0.58      4033
         per       0.76      0.76      0.76      3545
         tim       0.85      0.85      0.85      4067

   micro avg       0.81      0.79      0.80     22562
   macro avg       0.63      0.50      0.51     22562
weighted avg       0.80      0.79      0.79     22562
