In [1]:
import urllib.request

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [2]:
# Load the NER dataset from the data directory (the file is read as latin1).
data_path = "../data/"
csv_file = data_path + "ner_dataset.csv"
df = pd.read_csv(csv_file, encoding="latin1")

# Row count first, then a preview of the first five rows.
print(df.shape[0])
print(df.head())

1048575
    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


# Data Preprocessing

In [3]:
# Number of distinct sentence ids, words and tags, followed by the row count per tag.
n_unique = [df[col].nunique() for col in ("Sentence #", "Word", "Tag")]
print(*n_unique)
tag_counts = df.groupby("Tag").size().reset_index(name="counts")
print(tag_counts)

47959 35177 17
      Tag  counts
0   B-art     402
1   B-eve     308
2   B-geo   37644
3   B-gpe   15870
4   B-nat     201
5   B-org   20143
6   B-per   16990
7   B-tim   20333
8   I-art     297
9   I-eve     253
10  I-geo    7414
11  I-gpe     198
12  I-nat      51
13  I-org   16784
14  I-per   17251
15  I-tim    6528
16      O  887908


In [4]:
# Fill every NaN with the previous row's value, so rows that lack a
# "Sentence #" label inherit the one above them.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling,
# which triggers a warning on recent pandas versions.
df = df.ffill()
print(df.isnull().values.any())  # False means no missing values remain
print(df.tail(10))

  df = df.fillna(method="ffill") # fill NaN with previous value


False
              Sentence #       Word  POS    Tag
1048565  Sentence: 47958     impact   NN      O
1048566  Sentence: 47958          .    .      O
1048567  Sentence: 47959     Indian   JJ  B-gpe
1048568  Sentence: 47959     forces  NNS      O
1048569  Sentence: 47959       said  VBD      O
1048570  Sentence: 47959       they  PRP      O
1048571  Sentence: 47959  responded  VBD      O
1048572  Sentence: 47959         to   TO      O
1048573  Sentence: 47959        the   DT      O
1048574  Sentence: 47959     attack   NN      O


In [5]:
# Lowercase every word, then re-check the distinct sentence / word / tag counts.
df["Word"] = df["Word"].str.lower()
counts_after_lowercase = [df[col].nunique() for col in ("Sentence #", "Word", "Tag")]
print(*counts_after_lowercase)

47959 31817 17


In [6]:
def func(temp):
    """Pair each word of one sentence group with its tag, in row order."""
    words = temp["Word"].values.tolist()
    labels = temp["Tag"].values.tolist()
    return list(zip(words, labels))


# One list of (word, tag) pairs per sentence.
# NOTE(review): groupby sorts its keys by default, so sentences presumably come back in
# string order of "Sentence #" rather than file order - confirm if the order matters.
tagged_sentences = list(df.groupby("Sentence #").apply(func))

In [7]:
# Inspect the first sentence as a list of (word, tag) pairs.
first_tagged_sentence = tagged_sentences[0]
print(first_tagged_sentence)

[('thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('london', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('british', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [8]:
# Separate the (word, tag) pairs into two parallel lists of lists:
# sentences[k] holds the words of sentence k, tags[k] the matching tags.
sentences = [[word for word, _ in pairs] for pairs in tagged_sentences]
tags = [[label for _, label in pairs] for pairs in tagged_sentences]

for first_item in (sentences[0], tags[0]):
    print(first_item)

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [9]:
# Sentence lengths in tokens; the maximum and mean help when choosing the padding length.
sentence_lengths = [len(s) for s in sentences]
max_sentence_len = max(sentence_lengths)
avg_sentence_len = sum(sentence_lengths) / len(sentence_lengths)
summary_template = "max sentence length: {}, average sentence length: {}"
print(summary_template.format(max_sentence_len, avg_sentence_len))

max sentence length: 104, average sentence length: 21.863987989741236


# Tokenizing

In [10]:
# Word tokenizer: unseen words map to the dedicated "OOV" entry.
sen_tokenizer = Tokenizer(oov_token="OOV")
sen_tokenizer.fit_on_texts(sentences)
X_data = sen_tokenizer.texts_to_sequences(sentences)

# Tag tokenizer: lower=False keeps the original case of labels such as "B-geo" / "O".
tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)
y_data = tag_tokenizer.texts_to_sequences(tags)

# +1 on both sizes leaves room for index 0, which is used as the padding value later on.
vocab_size = len(sen_tokenizer.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1

# OOV index, the two sizes, then the first encoded sentence and its encoded tags.
for value in (sen_tokenizer.word_index["OOV"], vocab_size, tag_size, X_data[0], y_data[0]):
    print(value)

1
31819
18
[254, 6, 967, 16, 1795, 238, 468, 7, 523, 2, 129, 5, 61, 9, 571, 2, 833, 6, 186, 90, 22, 15, 56, 3]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1]


In [11]:
max_len = 70 # hyperparameter

# Pad (with 0, at the end) or cut every sequence to exactly max_len positions.
# NOTE(review): the longest sentence (104 tokens) exceeds max_len, and pad_sequences truncates
# from the front by default (truncating="pre") while padding here is "post" - confirm that
# dropping the beginning of long sentences is intended.
X_data_pad = pad_sequences(X_data, padding="post", maxlen=max_len)
y_data_pad = pad_sequences(y_data, padding="post", maxlen=max_len)

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_data_pad, y_data_pad, test_size=0.2, random_state=0)

# Pin the one-hot width to tag_size. Without num_classes, to_categorical infers the width from
# the largest id present in each array, so train and test could end up with different widths
# if one split happened to miss the highest tag id.
y_train_encod = to_categorical(y_train, num_classes=tag_size)
y_test_encod = to_categorical(y_test, num_classes=tag_size)

print(X_train.shape, y_train_encod.shape)
print(X_test.shape, y_test_encod.shape)

(38367, 70) (38367, 70, 18)
(9592, 70) (9592, 70, 18)


# Modeling

In [21]:
from keras import Model
from keras.layers import Dense, LSTM, Input, Bidirectional, TimeDistributed, Embedding, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras_crf import CRFModel
from seqeval.metrics import f1_score, classification_report

In [22]:
# hyperparameter
embedding_dims = 128
hidden_units = 64
dropout_ratio = 0.3

# Build the BiLSTM network that feeds the CRF. Every intermediate gets its own name:
# the previous version bound the input tensor to `input` (shadowing the Python builtin)
# and reused `model` for tensors, the base Model and the final CRFModel alike.
word_ids = Input(shape=(max_len,))
embedded = Embedding(input_dim=vocab_size, output_dim=embedding_dims, input_length=max_len)(word_ids)
encoded = Bidirectional(LSTM(units=hidden_units, return_sequences=True))(embedded)
encoded = Dropout(dropout_ratio)(encoded)
token_scores = TimeDistributed(Dense(tag_size, activation="relu"))(encoded)
base_model = Model(word_ids, token_scores)

# Wrap the base network with a CRF layer; `model` is the object used for training and prediction.
model = CRFModel(model=base_model, units=tag_size) # CRF layer
model.summary()

model.compile(optimizer=Adam(0.001), metrics=['accuracy']) # loss function is included in CRF layer

Model: "crf_model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_5 (InputLayer)        [(None, 70)]                 0         []                            
                                                                                                  
 embedding_4 (Embedding)     (None, 70, 128)              4072832   ['input_5[0][0]']             
                                                                                                  
 bidirectional_4 (Bidirecti  (None, 70, 128)              98816     ['embedding_4[0][0]']         
 onal)                                                                                            
                                                                                                  
 dropout_4 (Dropout)         (None, 70, 128)              0         ['bidirectional_4[0]



#### No need to specify a loss for CRFModel; the model computes the CRF loss by itself.
#### https://pypi.org/project/keras-crf/

In [23]:
model_path = "../model/"
checkpoint_file = model_path + 'ner_bilstm_crf_model.ckpt'

# Stop training once val_loss has failed to improve for 3 consecutive epochs.
es = EarlyStopping(monitor='val_loss', patience=3, mode='min', verbose=1)

# Save weights only, and only when val_decode_sequence_accuracy reaches a new maximum.
# Note that the two callbacks watch different quantities.
mc = ModelCheckpoint(
    checkpoint_file,
    monitor='val_decode_sequence_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [24]:
# Train on the integer-encoded labels (y_train rather than y_train_encod): the CRF does not
# need one-hot targets. 10% of the training data is held out for validation.
fit_options = dict(batch_size=128, epochs=15, validation_split=0.1, verbose=1, callbacks=[es, mc])
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/15
Epoch 1: val_decode_sequence_accuracy improved from -inf to 0.95959, saving model to ../model/ner_bilstm_crf_model.ckpt
Epoch 2/15
Epoch 2: val_decode_sequence_accuracy improved from 0.95959 to 0.97711, saving model to ../model/ner_bilstm_crf_model.ckpt
Epoch 3/15
Epoch 3: val_decode_sequence_accuracy improved from 0.97711 to 0.98275, saving model to ../model/ner_bilstm_crf_model.ckpt
Epoch 4/15
Epoch 4: val_decode_sequence_accuracy improved from 0.98275 to 0.98419, saving model to ../model/ner_bilstm_crf_model.ckpt
Epoch 5/15
Epoch 5: val_decode_sequence_accuracy improved from 0.98419 to 0.98484, saving model to ../model/ner_bilstm_crf_model.ckpt
Epoch 6/15
Epoch 6: val_decode_sequence_accuracy improved from 0.98484 to 0.98523, saving model to ../model/ner_bilstm_crf_model.ckpt
Epoch 7/15
Epoch 7: val_decode_sequence_accuracy improved from 0.98523 to 0.98564, saving model to ../model/ner_bilstm_crf_model.ckpt
Epoch 8/15
Epoch 8: val_decode_sequence_accuracy did not improve 

# Evaluate

In [37]:
# Restore the weights written by the checkpoint callback.
model.load_weights(model_path +'ner_bilstm_crf_model.ckpt' )

i = 13 # index of the test sample to inspect
# Element 0 of the predict() result holds the decoded tag ids, one row per input sentence.
y_predicted = model.predict(np.array([X_test[i]]))[0]
# Gold labels are already integer-encoded in y_test; no need to decode the one-hot copy.
labels = y_test[i]

print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print(35 * "-")

for word, tag, pred in zip(X_test[i], labels, y_predicted[0]):
    if word != 0: # skip PAD positions
        # Index 0 has no entry in index_word, so a padding-class prediction on a real token
        # would raise KeyError; show it as "PAD" instead so the mistake stays visible.
        pred_tag = tag_tokenizer.index_word.get(pred, "PAD")
        print("{:17}: {:7} {}".format(sen_tokenizer.index_word[word], tag_tokenizer.index_word[tag], pred_tag))

단어             |실제값  |예측값
-----------------------------------
jordan           : B-org   B-gpe
's               : O       O
king             : B-per   B-per
abdullah         : I-per   I-per
joined           : O       O
a                : O       O
prayer           : O       O
service          : O       O
in               : O       O
the              : O       O
red              : B-geo   B-geo
sea              : I-geo   I-geo
port             : O       O
of               : O       O
aqaba            : B-geo   B-geo
.                : O       O


In [43]:
# Run the model over the whole test set; element 0 of the result is the matrix of
# predicted tag ids (one row per sentence). Print the first two rows as a sanity check.
prediction_outputs = model.predict(X_test)
y_predicted = prediction_outputs[0]
print(y_predicted[:2])

[[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 2 9 8 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [44]:
def sequences_to_tag_for_crf(sequences, index_word=None):
    """Convert sequences of integer tag ids into sequences of tag strings.

    Args:
        sequences: Iterable of sequences of integer tag ids (for example a
            2-D array or a list of lists).
        index_word: Optional mapping from tag id to tag string. When omitted,
            the module-level ``tag_tokenizer.index_word`` is used.

    Returns:
        A list containing one list of tag strings per input sequence. Id 0
        (padding) and a tag named exactly "PAD" are both reported as "O".

    Raises:
        KeyError: If a non-zero id has no entry in the mapping.
    """
    if index_word is None:
        index_word = tag_tokenizer.index_word

    def to_tag(pred_index):
        # Id 0 is the padding value and has no entry in the mapping.
        if pred_index == 0:
            return "O"
        tag = index_word[pred_index]
        # Exact match only: a substring replace would also rewrite any tag that
        # merely contains "PAD" (e.g. "B-PAD" -> "B-O").
        return "O" if tag == "PAD" else tag

    return [[to_tag(pred_index) for pred_index in sequence] for sequence in sequences]

# Convert predicted and gold tag ids to tag strings. The gold side uses the
# integer-encoded y_test (not y_test_encod), because the CRF works without one-hot encoding.
pred_tags = sequences_to_tag_for_crf(y_predicted)
test_tags = sequences_to_tag_for_crf(y_test)

# Overall F1 followed by the per-entity-type report.
overall_f1 = f1_score(test_tags, pred_tags)
print("F1-score: {:.1%}".format(overall_f1))
report_text = classification_report(test_tags, pred_tags)
print(report_text)

F1-score: 79.2%


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         art       0.00      0.00      0.00        91
         eve       0.75      0.14      0.23        65
         geo       0.82      0.85      0.83      7584
         gpe       0.94      0.93      0.94      3195
         nat       0.57      0.09      0.15        47
         org       0.64      0.58      0.61      4036
         per       0.74      0.70      0.72      3403
         tim       0.86      0.83      0.85      4149

   micro avg       0.80      0.78      0.79     22570
   macro avg       0.67      0.51      0.54     22570
weighted avg       0.80      0.78      0.79     22570
