In [1]:
import re
import urllib.request
import numpy as np

import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [4]:
data_path = "../data/"
urllib.request.urlretrieve("https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/12.%20RNN%20Sequence%20Labeling/dataset/train.txt", filename=data_path+"train_tag.txt")

# Parse the downloaded file into a list of sentences, where each sentence is a
# list of [lower-cased word, NER tag] pairs. Each data line holds
# space-separated attributes: the first is the word and the last is the tag.
tagged_sentences = []
sentence = []

# The context manager guarantees the file handle is closed, even on error.
with open(data_path+'train_tag.txt', 'r') as f:
    for line in f:
        # A blank (or whitespace-only) line or a -DOCSTART marker ends the
        # sentence collected so far.
        if not line.strip() or line.startswith('-DOCSTART'):
            if len(sentence) > 0:
                tagged_sentences.append(sentence)
                sentence = []
            continue
        splits = line.rstrip('\n').split(' ')  # attributes are separated by single spaces
        word = splits[0].lower()  # words are stored lower-cased
        sentence.append([word, splits[-1]])  # keep only the word and its NER tag

# Flush the last sentence when the file does not end with a blank line;
# otherwise it would be silently dropped.
if len(sentence) > 0:
    tagged_sentences.append(sentence)
    sentence = []

print(len(tagged_sentences))
print(tagged_sentences[0])

14041
[['eu', 'B-ORG'], ['rejects', 'O'], ['german', 'B-MISC'], ['call', 'O'], ['to', 'O'], ['boycott', 'O'], ['british', 'B-MISC'], ['lamb', 'O'], ['.', 'O']]


# Data Preprocessing

In [6]:
# Separate every tagged sentence into two parallel lists:
# one with its words and one with the matching NER tags.
sentences = [[word for word, _ in pairs] for pairs in tagged_sentences]
tags = [[label for _, label in pairs] for pairs in tagged_sentences]

print(sentences[0], tags[0], sep="\n")

['eu', 'rejects', 'german', 'call', 'to', 'boycott', 'british', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


In [7]:
# Sentence-length statistics: the longest sentence and the mean number of
# tokens per sentence.
sentence_lengths = [len(tokens) for tokens in sentences]
max_sentence_len = max(sentence_lengths)
avg_sentence_len = sum(sentence_lengths) / len(sentence_lengths)

print(max_sentence_len, avg_sentence_len)

113 14.501887329962253


In [12]:
# check length of sentences
def len_sen(num, data=None):
    """Return the percentage of sequences whose length is at most ``num``.

    Args:
        num: Candidate maximum length (e.g. a padding length).
        data: Iterable-with-len of sequences to measure. Defaults to the
            notebook-level ``sentences`` list, which is already defined when
            this cell runs, so the function no longer depends on a variable
            created in a later cell.

    Returns:
        A float in [0, 100]: the share of sequences that fit within ``num``
        tokens without truncation.

    Raises:
        ZeroDivisionError: if ``data`` is empty.
    """
    if data is None:
        data = sentences
    # Count the sequences that would be truncated at length `num`.
    cnt = sum(1 for sequence in data if len(sequence) > num)
    # Return (rather than print) so callers can print or reuse the value.
    return (1-(cnt/len(data))) * 100

In [15]:
# Padding length: about 99.98% of the sentences have at most 70 tokens.
max_len = 70
print(len_sen(max_len))

99.98575600028488
None


# Tokenizing

In [16]:
# hyperparameters
vocab_size = 4000  # vocabulary cap passed to the word tokenizer (most frequent words)

# Word tokenizer: words beyond the cap are encoded as the 'OOV' token.
sen_tokenizer = Tokenizer(oov_token='OOV', num_words=vocab_size)
sen_tokenizer.fit_on_texts(sentences)

# Tag tokenizer: no cap, every NER tag gets its own index.
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(tags)

# Number of distinct entries each tokenizer has seen (words, then tags).
for fitted_tokenizer in (sen_tokenizer, tag_tokenizer):
    print(len(fitted_tokenizer.word_index))

21010
9


In [17]:
# Integer-encode the words and the tags with their fitted tokenizers.
X_train = sen_tokenizer.texts_to_sequences(sentences)
y_train = tag_tokenizer.texts_to_sequences(tags)

print(X_train[0], y_train[0], sep="\n")

# Map the first encoded sentence back to text as a sanity check;
# less frequent words come back as 'OOV'.
decoded = [sen_tokenizer.index_word[token_id] for token_id in X_train[0]]
print(decoded)

[989, 1, 205, 629, 7, 3939, 216, 1, 3]
[4, 1, 7, 1, 1, 1, 7, 1, 1]
['eu', 'OOV', 'german', 'call', 'to', 'boycott', 'british', 'OOV', '.']


In [25]:
# padding: post-pad every sequence with 0 up to max_len
X_train_pad = pad_sequences(X_train, padding='post', maxlen=max_len)
y_train_pad = pad_sequences(y_train, padding='post', maxlen=max_len)

# split train and test data
X_train_pad_split, X_test_pad_split, y_train_pad_split, y_test_pad_split = train_test_split(X_train_pad, y_train_pad, test_size=0.2, random_state=777)

# one-hot encoding
# Fix the number of classes explicitly (all tags + index 0 used for padding).
# Without num_classes, to_categorical infers the width from the largest label
# present in each array, so the train and test encodings could end up with
# different widths if a split happens to miss the highest tag index.
num_tag_classes = len(tag_tokenizer.word_index) + 1
y_train_pad_encoded = to_categorical(y_train_pad_split, num_classes=num_tag_classes)
y_test_pad_encoded = to_categorical(y_test_pad_split, num_classes=num_tag_classes)

print(X_train_pad_split.shape, X_test_pad_split.shape)
print(y_train_pad_encoded.shape, y_test_pad_encoded.shape)

(11232, 70) (2809, 70)
(11232, 70, 10) (2809, 70, 10)


# Modeling

In [26]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Bidirectional, TimeDistributed
from keras.optimizers import Adam

In [27]:
# hyperparameters
embedding_dim = 128
hidden_units = 128

# Embedding -> bidirectional LSTM -> per-timestep softmax over the tag classes.
model = Sequential([
    # mask_zero=True: index 0 (the padding value) is treated as a masked position.
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, mask_zero=True),
    # return_sequences=True keeps one output per timestep, as needed for tagging.
    Bidirectional(LSTM(units=hidden_units, return_sequences=True)),
    # One unit per tag index, plus one for index 0.
    TimeDistributed(Dense(units=len(tag_tokenizer.word_index)+1, activation='softmax')),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 70, 128)           512000    
                                                                 
 bidirectional_1 (Bidirecti  (None, 70, 256)           263168    
 onal)                                                           
                                                                 
 time_distributed_1 (TimeDi  (None, 70, 10)            2570      
 stributed)                                                      
                                                                 
Total params: 777738 (2.97 MB)
Trainable params: 777738 (2.97 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
# Train the tagger; 20% of the training split is held out for validation.
history = model.fit(
    X_train_pad_split,
    y_train_pad_encoded,
    batch_size=128,
    epochs=8,
    validation_split=0.2,
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [30]:
# evaluate() returns [loss, accuracy] for this model (compiled with one
# metric); display only the accuracy on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test_pad_split, y_test_pad_encoded)
test_accuracy



0.9526053071022034

# Inference

In [31]:
i = 10  # index of the test sample to inspect

# Lookup tables: integer index -> word / NER tag.
index_to_word = sen_tokenizer.index_word
index_to_ner = tag_tokenizer.index_word

# Predict tag probabilities for the chosen sample (as a batch of one),
# then turn each probability vector into an integer label.
y_predicted = model.predict(np.array([X_test_pad_split[i]]))
y_predicted = np.argmax(y_predicted, axis=-1)

# Convert the one-hot ground truth back to integer labels.
labels = np.argmax(y_test_pad_encoded[i], -1)

# Header columns: word | actual | predicted.
print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print("-" * 35)

for word, tag, pred in zip(X_test_pad_split[i], labels, y_predicted[0]):
    if word == 0:  # skip PAD positions
        continue
    print(f"{index_to_word[word]:17}: {index_to_ner[tag].upper():7} {index_to_ner[pred].upper()}")


단어             |실제값  |예측값
-----------------------------------
sarah            : B-PER   B-PER
brady            : I-PER   I-PER
,                : O       O
whose            : O       O
republican       : B-MISC  B-MISC
husband          : O       O
was              : O       O
OOV              : O       O
OOV              : O       O
in               : O       O
an               : O       O
OOV              : O       O
attempt          : O       O
on               : O       O
president        : O       O
ronald           : B-PER   B-PER
reagan           : I-PER   I-PER
,                : O       O
took             : O       O
centre           : O       O
stage            : O       O
at               : O       O
the              : O       O
democratic       : B-MISC  B-MISC
national         : I-MISC  I-MISC
convention       : I-MISC  I-MISC
on               : O       O
monday           : O       O
night            : O       O
to               : O       O
OOV              : O       O
pre

#### 'O' is by far the most frequent tag, so the high accuracy is misleading: a model can score well just by predicting 'O', so accuracy alone does not show that the model is good.