In [1]:
import numpy as np
import matplotlib.pyplot as plt

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

import nltk
from sklearn.model_selection import train_test_split

In [5]:
# Make sure the NLTK 'treebank' corpus is available locally, then load it as
# a sequence of sentences, each one a list of (word, POS-tag) pairs.
nltk.download('treebank')
tagged_sentences = nltk.corpus.treebank.tagged_sents()

# Sanity check: number of sentences, followed by the first tagged sentence.
print(len(tagged_sentences), tagged_sentences[0], sep='\n')

[nltk_data] Downloading package treebank to
[nltk_data]     /Users/godpeny/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


3914
[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')]


# Data Preprocessing

In [11]:
# Unzip every tagged sentence into two parallel lists: its tokens and its tags.
unzipped = [tuple(zip(*pairs)) for pairs in tagged_sentences]
sentences = [list(words) for words, _ in unzipped]
tags = [list(labels) for _, labels in unzipped]

# Spot-check one sample: tokens and tags line up position by position.
for sample in (sentences[3], tags[3]):
    print(sample)

['A', 'form', 'of', 'asbestos', 'once', 'used', '*', '*', 'to', 'make', 'Kent', 'cigarette', 'filters', 'has', 'caused', 'a', 'high', 'percentage', 'of', 'cancer', 'deaths', 'among', 'a', 'group', 'of', 'workers', 'exposed', '*', 'to', 'it', 'more', 'than', '30', 'years', 'ago', ',', 'researchers', 'reported', '0', '*T*-1', '.']
['DT', 'NN', 'IN', 'NN', 'RB', 'VBN', '-NONE-', '-NONE-', 'TO', 'VB', 'NNP', 'NN', 'NNS', 'VBZ', 'VBN', 'DT', 'JJ', 'NN', 'IN', 'NN', 'NNS', 'IN', 'DT', 'NN', 'IN', 'NNS', 'VBN', '-NONE-', 'TO', 'PRP', 'RBR', 'IN', 'CD', 'NNS', 'IN', ',', 'NNS', 'VBD', '-NONE-', '-NONE-', '.']


In [12]:
# Sentence-length statistics (longest and mean), computed from a single pass
# over the tokenised sentences.
sentence_lengths = [len(sentence) for sentence in sentences]
max_len_sentence = max(sentence_lengths)
avg_len_sentence = sum(sentence_lengths) / len(sentence_lengths)

print(max_len_sentence, avg_len_sentence)

271 25.722023505365357


# Tokenizing

In [34]:
# Two independent vocabularies: one for words, one for POS tags.
# NOTE(review): despite their names, X_train / y_train hold the WHOLE corpus;
# the train/test split only happens further down.
# NOTE(review): the decoded sample in the inference cell shows lower-cased
# words, so Tokenizer's default lower-casing appears to be in effect -- confirm
# that dropping capitalisation is intended for POS tagging.

# words -> integer ids
sen_tokenizer = Tokenizer()
sen_tokenizer.fit_on_texts(sentences)
X_train = sen_tokenizer.texts_to_sequences(sentences)
sen_size = len(sen_tokenizer.word_index) + 1  # +1 leaves id 0 free for padding

# tags -> integer ids
tag_tokenizer = Tokenizer()
tag_tokenizer.fit_on_texts(tags)
y_train = tag_tokenizer.texts_to_sequences(tags)
tag_size = len(tag_tokenizer.word_index) + 1  # +1 leaves id 0 free for padding

# Peek at the first three encoded sentences and their encoded tags.
for encoded in (X_train, y_train):
    print(encoded[:3])

[[5601, 3746, 1, 2024, 86, 331, 1, 46, 2405, 2, 131, 27, 6, 2025, 332, 459, 2026, 3], [31, 3746, 20, 177, 4, 5602, 2915, 1, 2, 2916, 637, 147, 3], [2917, 5603, 1, 1136, 86, 331, 8, 602, 177, 4, 3747, 1046, 892, 893, 1, 34, 483, 9, 6, 2025, 332, 4, 51, 1047, 435, 2918, 3]]
[[3, 3, 8, 10, 6, 7, 8, 21, 13, 4, 1, 2, 4, 7, 1, 3, 10, 9], [3, 3, 17, 1, 2, 3, 3, 8, 4, 3, 19, 1, 9], [3, 3, 8, 10, 6, 7, 14, 7, 1, 2, 3, 3, 3, 3, 8, 11, 16, 5, 4, 7, 1, 2, 4, 7, 7, 1, 9]]


In [35]:
# check length of sentences
def len_sen(num, sequences=None):
    """Print and return the percentage of sequences no longer than ``num``.

    Useful for choosing a padding length: the result is the share of samples
    that would fit into ``num`` positions without being truncated.

    Args:
        num: Length threshold (candidate padding length).
        sequences: Iterable of sequences to measure. Defaults to the
            notebook-level ``X_train`` so existing ``len_sen(num)`` calls
            keep working unchanged.

    Returns:
        float: Percentage (0-100) of sequences whose length is <= ``num``.

    Raises:
        ValueError: If there are no sequences to measure.
    """
    if sequences is None:
        sequences = X_train  # backward-compatible default: the encoded corpus
    sequences = list(sequences)
    if not sequences:
        # The original failed here with an opaque ZeroDivisionError.
        raise ValueError("len_sen() needs at least one sequence")

    too_long = sum(1 for seq in sequences if len(seq) > num)
    coverage = (1 - (too_long / len(sequences))) * 100
    print(coverage)
    return coverage

In [36]:
# padding
max_len = 150
len_sen(max_len)  # share of sentences that fit in max_len tokens (99.97% for 150)

# Identical padding settings for inputs and labels keep them aligned
# position by position.
pad_options = dict(padding='post', maxlen=max_len)
X_train_pad = pad_sequences(X_train, **pad_options)
y_train_pad = pad_sequences(y_train, **pad_options)

99.97445068983137


In [37]:
# split train and test dataset
# The split is derived from the unsplit X_train / y_train instead of feeding
# X_train_pad / y_train_pad back into train_test_split: overwriting the inputs
# made this cell non-idempotent (running it twice split the already-reduced
# training subset again and silently shrank the data).
X_train_pad, X_test_pad, y_train_pad, y_test_pad = train_test_split(
    pad_sequences(X_train, padding='post', maxlen=max_len),
    pad_sequences(y_train, padding='post', maxlen=max_len),
    test_size=0.2,
    random_state=777,
)

# Every sample must land in exactly one of the two partitions.
assert len(X_train_pad) + len(X_test_pad) == len(X_train)
assert len(y_train_pad) + len(y_test_pad) == len(y_train)

print(X_train_pad.shape, y_train_pad.shape)
print(X_test_pad.shape, y_test_pad.shape)

(3131, 150) (3131, 150)
(783, 150) (783, 150)


# Modeling

In [38]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, InputLayer, Bidirectional, TimeDistributed, Embedding
from keras.optimizers import Adam

In [41]:
# hyperparameters
embedding_dims = 128
hidden_units = 128

# Many-to-many tagger: embedding -> bidirectional LSTM that emits one vector
# per timestep -> per-timestep softmax over the tag vocabulary.
model = Sequential([
    # mask_zero=True : id 0 is treated as padding
    Embedding(input_dim=sen_size, output_dim=embedding_dims, mask_zero=True),
    Bidirectional(LSTM(units=hidden_units, return_sequences=True)),
    TimeDistributed(Dense(tag_size, activation='softmax')),
])
model.compile(
    optimizer=Adam(0.001),
    loss='sparse_categorical_crossentropy',  # labels are integer tag ids, not one-hot
    metrics=['accuracy'],
)
model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 128)         1457664   
                                                                 
 bidirectional_1 (Bidirecti  (None, None, 256)         263168    
 onal)                                                           
                                                                 
 time_distributed_1 (TimeDi  (None, None, 47)          12079     
 stributed)                                                      
                                                                 
Total params: 1732911 (6.61 MB)
Trainable params: 1732911 (6.61 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


##### mask_zero : Boolean, whether or not the input value 0 is a special "padding" value that should be masked out. (0를 연산에서 제외)
##### TimeDistributed : A wrapper that applies the same layer to every timestep independently. (The Keras docs illustrate it with a Conv2D layer; in this notebook it wraps the Dense softmax output layer.) (LSTM을 다 대 다 구조로 사용하여 LSTM의 모든 시점에 대해서 출력층을 사용)
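##### SparseCategoricalCrossentropy : Use this crossentropy loss function when there are two or more label classes. We expect labels to be provided as integers (not one-hot), which is why the integer tag ids are passed to the model directly.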
##### CategoricalCrossentropy : Use this crossentropy loss function when there are two or more label classes. We expect labels to be provided in a one_hot representation. (https://www.tensorflow.org/api_docs/python/tf/keras/losses/SparseCategoricalCrossentropy)

In [42]:
# Train the tagger and keep the per-epoch metrics in `history`.
# NOTE(review): the held-out test split doubles as the validation set here.
history = model.fit(
    X_train_pad,
    y_train_pad,
    batch_size=128,
    epochs=7,
    validation_data=(X_test_pad, y_test_pad),
)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


# Inference

In [58]:
# Reverse lookups: integer id -> word / tag string.
index_to_word = sen_tokenizer.index_word
index_to_tag = tag_tokenizer.index_word

i = 10  # index of the test sample to inspect
# Per-token tag probabilities for that single sample (batch of size 1).
y_probs = model.predict(np.array([X_test_pad[i]]))
# Probability vectors -> integer tag ids. Kept under a separate name from
# y_probs so re-running lines of this cell never mixes the two.
y_predicted = np.argmax(y_probs, axis=-1)

print(X_test_pad.shape, y_test_pad.shape, y_predicted.shape)

print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print(35 * "-")

for word, tag, pred in zip(X_test_pad[i], y_test_pad[i], y_predicted[0]):
    if word != 0:  # skip PAD positions
        # Class 0 is a legal softmax output but has no entry in index_word
        # (tokenizer ids start at 1), so a plain [] lookup on the prediction
        # could raise KeyError; fall back to a visible placeholder instead.
        predicted_tag = index_to_tag.get(pred, '<pad>')
        print("{:17}: {:7} {}".format(index_to_word[word], index_to_tag[tag].upper(), predicted_tag.upper()))

(783, 150) (783, 150) (1, 150)
단어             |실제값  |예측값
-----------------------------------
in               : IN      IN
addition         : NN      NN
,                : ,       ,
buick            : NNP     NNP
is               : VBZ     VBZ
a                : DT      DT
relatively       : RB      RB
respected        : VBN     VBN
nameplate        : NN      NN
among            : IN      IN
american         : NNP     NNP
express          : NNP     NNP
card             : NN      NN
holders          : NNS     NNS
,                : ,       ,
says             : VBZ     VBZ
0                : -NONE-  -NONE-
*t*-1            : -NONE-  -NONE-
an               : DT      DT
american         : NNP     NNP
express          : NNP     NNP
spokeswoman      : NN      NN
.                : .       .
