In [1]:
import urllib.request

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [2]:
# Directory holding the dataset, relative to this notebook.
data_path = "../data/"
ner_csv = data_path + "ner_dataset.csv"
# The file is decoded as Latin-1 (encoding="latin1").
df = pd.read_csv(ner_csv, encoding="latin1")

# Row count, followed by a peek at the first five rows.
print(len(df))
print(df.head(5))

1048575
    Sentence #           Word  POS Tag
0  Sentence: 1      Thousands  NNS   O
1          NaN             of   IN   O
2          NaN  demonstrators  NNS   O
3          NaN           have  VBP   O
4          NaN        marched  VBN   O


# Data Preprocessing

In [3]:
# Dataset overview: distinct sentences / words / tags, then the frequency of each tag.
n_sentences = df['Sentence #'].nunique()
n_words = df['Word'].nunique()
n_tags = df['Tag'].nunique()
print(n_sentences, n_words, n_tags)

tag_counts = df.groupby('Tag').size().reset_index(name='counts')
print(tag_counts)

47959 35177 17
      Tag  counts
0   B-art     402
1   B-eve     308
2   B-geo   37644
3   B-gpe   15870
4   B-nat     201
5   B-org   20143
6   B-per   16990
7   B-tim   20333
8   I-art     297
9   I-eve     253
10  I-geo    7414
11  I-gpe     198
12  I-nat      51
13  I-org   16784
14  I-per   17251
15  I-tim    6528
16      O  887908


In [4]:
# Rows that continue a sentence have NaN in "Sentence #" (only the first row of each
# sentence is labelled), so forward-fill every NaN with the previous row's value.
# DataFrame.ffill() replaces fillna(method="ffill"), which pandas has deprecated.
df = df.ffill()
# Confirm no NaN is left, then inspect the end of the frame.
print(df.isnull().values.any())
print(df.tail(10))

  df = df.fillna(method="ffill") # fill NaN with previous value


False
              Sentence #       Word  POS    Tag
1048565  Sentence: 47958     impact   NN      O
1048566  Sentence: 47958          .    .      O
1048567  Sentence: 47959     Indian   JJ  B-gpe
1048568  Sentence: 47959     forces  NNS      O
1048569  Sentence: 47959       said  VBD      O
1048570  Sentence: 47959       they  PRP      O
1048571  Sentence: 47959  responded  VBD      O
1048572  Sentence: 47959         to   TO      O
1048573  Sentence: 47959        the   DT      O
1048574  Sentence: 47959     attack   NN      O


In [5]:
# Lowercase every word so that case variants collapse into one vocabulary entry.
df['Word'] = df['Word'].str.lower()
# Distinct sentences / words / tags after lowercasing.
print(df['Sentence #'].nunique(), df['Word'].nunique(), df['Tag'].nunique())

47959 31817 17


In [6]:
def func(temp):
    """Return one sentence group as a list of (word, tag) pairs, in row order."""
    words = temp["Word"].values.tolist()
    labels = temp["Tag"].values.tolist()
    return list(zip(words, labels))


# One list of (word, tag) pairs per sentence, grouped on the "Sentence #" column.
tagged_sentences = list(df.groupby("Sentence #").apply(func))

In [7]:
# Sanity check: show the (word, tag) pairs of the first grouped sentence.
first_tagged_sentence = tagged_sentences[0]
print(first_tagged_sentence)

[('thousands', 'O'), ('of', 'O'), ('demonstrators', 'O'), ('have', 'O'), ('marched', 'O'), ('through', 'O'), ('london', 'B-geo'), ('to', 'O'), ('protest', 'O'), ('the', 'O'), ('war', 'O'), ('in', 'O'), ('iraq', 'B-geo'), ('and', 'O'), ('demand', 'O'), ('the', 'O'), ('withdrawal', 'O'), ('of', 'O'), ('british', 'B-gpe'), ('troops', 'O'), ('from', 'O'), ('that', 'O'), ('country', 'O'), ('.', 'O')]


In [8]:
# Split every tagged sentence into two parallel lists: its words and its tags.
sentences = [[word for word, _ in pairs] for pairs in tagged_sentences]
tags = [[label for _, label in pairs] for pairs in tagged_sentences]

# First sentence and its tags, to check that the two lists line up.
print(sentences[0])
print(tags[0])

['thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'london', 'to', 'protest', 'the', 'war', 'in', 'iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'british', 'troops', 'from', 'that', 'country', '.']
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [9]:
# Token count of every sentence; both statistics below are derived from it.
sentence_lengths = [len(tokens) for tokens in sentences]
max_sentence_len = max(sentence_lengths)
avg_sentence_len = sum(sentence_lengths) / len(sentence_lengths)
print("max sentence length: {}, average sentence length: {}".format(max_sentence_len, avg_sentence_len))

max sentence length: 104, average sentence length: 21.863987989741236


# Tokenizing

In [10]:
# Word side: words that were not seen while fitting are mapped to the "OOV" token.
sen_tokenizer = Tokenizer(oov_token="OOV")
sen_tokenizer.fit_on_texts(sentences)
X_data = sen_tokenizer.texts_to_sequences(sentences)

# Tag side: lower=False keeps the original case of the tags.
tag_tokenizer = Tokenizer(lower=False)
tag_tokenizer.fit_on_texts(tags)
y_data = tag_tokenizer.texts_to_sequences(tags)

# +1 because the tokenizers never assign index 0; it is used for padding later on.
vocab_size = len(sen_tokenizer.word_index) + 1
tag_size = len(tag_tokenizer.word_index) + 1

# Index of the OOV token, then both vocabulary sizes.
print(sen_tokenizer.word_index["OOV"])
print(vocab_size)
print(tag_size)

# Integer-encoded form of the first sentence and of its tags.
print(X_data[0])
print(y_data[0])

1
31819
18
[254, 6, 967, 16, 1795, 238, 468, 7, 523, 2, 129, 5, 61, 9, 571, 2, 833, 6, 186, 90, 22, 15, 56, 3]
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 8, 1, 1, 1, 1, 1]


In [11]:
max_len = 70 # hyperparameter

# Pad with 0 after the tokens so every sequence has exactly max_len entries.
# Sequences longer than max_len are cut; pad_sequences removes tokens from the
# front by default (truncating="pre"), and words and tags are cut identically.
X_data_pad = pad_sequences(X_data, padding="post", maxlen=max_len)
y_data_pad = pad_sequences(y_data, padding="post", maxlen=max_len)

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_data_pad, y_data_pad, test_size=0.2, random_state=0)

# Pin the one-hot width to tag_size. Without num_classes, to_categorical infers the
# width from the largest tag id present in each split, so a split that happens to
# lack the highest id would produce fewer columns than the model's output layer.
y_train_encod = to_categorical(y_train, num_classes=tag_size)
y_test_encod = to_categorical(y_test, num_classes=tag_size)

print(X_train.shape, y_train_encod.shape)
print(X_test.shape, y_test_encod.shape)

(38367, 70) (38367, 70, 18)
(9592, 70) (9592, 70, 18)


# Modeling

In [12]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding
from keras.optimizers import Adam

In [13]:
# hyperparameter
embedding_dim = 128
hidden_units = 256

# Embedding -> bidirectional LSTM -> per-timestep softmax over the tag vocabulary.
model = Sequential([
    # mask_zero=True makes downstream layers treat index 0 (padding) as masked.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True),
    # return_sequences=True keeps one output vector per token instead of one per sentence.
    Bidirectional(LSTM(units=hidden_units, return_sequences=True)),
    # The same Dense classifier is applied independently at every timestep.
    TimeDistributed(Dense(tag_size, activation="softmax")),
])
model.summary()

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer=Adam(0.001), metrics=["accuracy"])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 70, 128)           4072832   
                                                                 
 bidirectional (Bidirection  (None, 70, 512)           788480    
 al)                                                             
                                                                 
 time_distributed (TimeDist  (None, 70, 18)            9234      
 ributed)                                                        
                                                                 
Total params: 4870546 (18.58 MB)
Trainable params: 4870546 (18.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________




 #### If mask_zero is set to True, as a consequence, index 0 cannot be used in the vocabulary (input_dim should equal size of vocabulary + 1).

In [14]:
# Train for 6 epochs in mini-batches of 128; 10% of the training data is held out for validation.
history = model.fit(
    X_train,
    y_train_encod,
    batch_size=128,
    epochs=6,
    validation_split=0.1,
)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [22]:
i = 13 # index of the test sample to inspect
y_predicted = model.predict(np.array([X_test[i]])) # predict on a batch holding only that sample
y_predicted = np.argmax(y_predicted, axis=-1) # probability vectors -> integer tag ids
labels = np.argmax(y_test_encod[i], -1) # one-hot labels -> integer tag ids

# Header columns: word | actual tag | predicted tag.
print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print(35 * "-")

for word, tag, pred in zip(X_test[i], labels, y_predicted[0]):
    if word != 0: # skip padding positions
        # Id 0 (padding) has no entry in index_word. The model may still emit it for a
        # real token, so fall back to "PAD" instead of raising KeyError.
        print("{:17}: {:7} {}".format(
            sen_tokenizer.index_word[word],
            tag_tokenizer.index_word.get(tag, "PAD"),
            tag_tokenizer.index_word.get(pred, "PAD"),
        ))

단어             |실제값  |예측값
-----------------------------------
jordan           : B-org   B-gpe
's               : O       O
king             : B-per   B-per
abdullah         : I-per   I-per
joined           : O       O
a                : O       O
prayer           : O       O
service          : O       O
in               : O       O
the              : O       O
red              : B-geo   B-geo
sea              : I-geo   I-geo
port             : O       O
of               : O       O
aqaba            : B-geo   B-geo
.                : O       O


# Evaluate with F1-score
## Precision vs Recall
 - the precision is the number of true positive results divided by the number of all positive results, including those not identified correctly. (=positive predictive value) (True라고 분류한 것 중에서 실제로 True인 것의 비율)
 - the recall is the number of true positive results divided by the number of all samples that should have been identified as positive. (=sensitivity) (실제 True인 것 중에서 모델이 True라고 예측한 것의 비율)
## F1-score
### https://en.wikipedia.org/wiki/F-score
 - the F1 score is the harmonic mean of precision and recall
 - the F1 score is the harmonic mean of the precision and recall, where an F1 score reaches its best value at 1 (perfect precision and recall) and worst at 0.
 - the more general F-beta score weights recall more than precision by a factor of beta. The F1 score is the special case beta == 1.0, which means recall and precision are equally important.
 - the support is the number of occurrences of each class in y_true.

In [25]:
from seqeval.metrics import f1_score, classification_report

In [44]:
def sequences_to_tag(sequences, index_word=None):
    """Convert sequences of per-token score vectors into tag strings.

    Args:
        sequences: Iterable of sequences. Each sequence holds one vector per
            token, either predicted probabilities or a one-hot row.
        index_word: Optional mapping from integer index to tag string. When
            omitted, ``tag_tokenizer.index_word`` is used.

    Returns:
        A list containing one list of tag strings per input sequence. Index 0
        (padding) is reported as "O".
    """
    if index_word is None:
        index_word = tag_tokenizer.index_word

    result = []
    for sequence in sequences:
        vectors = np.asarray(sequence)
        if vectors.size == 0:
            # Nothing to decode; keep the output aligned with the input.
            result.append([])
            continue
        # One argmax call per sequence instead of one per token. Each token's
        # vector is flattened first, which is what a bare np.argmax(vector) does.
        tag_ids = np.argmax(vectors.reshape(len(vectors), -1), axis=-1).tolist()
        # Index 0 is the padding id and has no tag of its own, so it is emitted as "O".
        result.append(["O" if tag_id == 0 else index_word[tag_id] for tag_id in tag_ids])
    return result

In [45]:
y_predicted = model.predict(X_test)

# Number of real tokens in each test sentence. Padding is index 0 and is appended
# after the tokens (padding="post"), so the first n positions are the real ones.
seq_lengths = (X_test != 0).sum(axis=1)

# Score only real tokens. Padded positions are masked during training, so whatever
# the model emits there is arbitrary and must not be counted as a prediction.
pred_tags = [seq[:n] for seq, n in zip(sequences_to_tag(y_predicted), seq_lengths)]
test_tags = [seq[:n] for seq, n in zip(sequences_to_tag(y_test_encod), seq_lengths)]

print("F1-score: {:.1%}".format(f1_score(test_tags, pred_tags)))
print(classification_report(test_tags, pred_tags))

F1-score: 78.3%
              precision    recall  f1-score   support

         art       0.00      0.00      0.00        91
         eve       0.68      0.29      0.41        65
         geo       0.82      0.85      0.84      7584
         gpe       0.96      0.92      0.94      3195
         nat       0.55      0.26      0.35        47
         org       0.59      0.59      0.59      4036
         per       0.72      0.70      0.71      3403
         tim       0.83      0.85      0.84      4149

   micro avg       0.78      0.79      0.78     22570
   macro avg       0.64      0.56      0.58     22570
weighted avg       0.78      0.79      0.78     22570
