<a href="https://colab.research.google.com/github/changyong93/Natural-language-processing-with-chat-bot/blob/main/%EB%94%A5%EB%9F%AC%EB%8B%9D%EC%9D%84_%EC%9D%B4%EC%9A%A9%ED%95%9C_%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC_%EC%9E%85%EB%AC%B8(13_1~2_NLP%EB%A5%BC_%EC%9C%84%ED%95%9C_%EC%8B%A0%EA%B2%BD%EB%A7%9D(CNN)_%EC%96%91%EB%B0%A9%ED%96%A5_LSTM%EC%9D%84_%EC%9D%B4%EC%9A%A9%ED%95%9C_%ED%92%88%EC%82%AC_%ED%83%9C%EA%B9%85.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 양방향 LSTM을 이용한 품사 태깅(Part-of-speech Tagging using Bi-LSTM)

## 품사 태깅 데이터에 대한 이해와 전처리

In [None]:
import nltk
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Fetch the 'treebank' corpus data that nltk.corpus.treebank reads later in this notebook.
nltk.download('treebank')

In [None]:
# Load NLTK's treebank corpus as sentences that are already tokenized and
# POS-tagged: each sentence is a sequence of (token, tag) pairs.
tagged_sentences = nltk.corpus.treebank.tagged_sents()

# Report how many tagged sentence samples are available.
sentence_count = len(tagged_sentences)
print("품사 태깅이 된 문장 개수: ", sentence_count)

In [None]:
# Inspect the first sample to see the (token, tag) pair layout.
print(tagged_sentences[0])

In [None]:
# Transpose every tagged sentence from [(token, tag), ...] into a
# (tokens, tags) pair in a single pass over the corpus.
token_tag_columns = [tuple(zip(*tagged_sentence)) for tagged_sentence in tagged_sentences]

# Parallel lists: sentences[k] holds the tokens of sample k and
# pos_tags[k] holds the tag of each of those tokens, in the same order.
sentences = [list(tokens) for tokens, _ in token_tag_columns]
pos_tags = [list(tags) for _, tags in token_tag_columns]

In [None]:
# Show the first sample after splitting: its tokens, then the matching tags.
print(sentences[0])
print(pos_tags[0])

In [None]:
# Number of tokens in each sentence, computed once and reused below.
sample_lengths = [len(sample) for sample in sentences]

# Longest sample and mean sample length.
print('샘플의 최대 길이 : %d' % max(sample_lengths))
print('샘플의 평균 길이 : %f' % (sum(sample_lengths)/len(sample_lengths)))

# Distribution of sample lengths, to help choose a padding length.
plt.hist(sample_lengths, bins=50)
plt.xlabel('length of samples')
plt.ylabel('number of samples')
plt.show()

In [None]:
def tokenize(samples):
    """Return a Keras ``Tokenizer`` fitted on *samples*.

    The tokenizer is created with its default settings and its
    vocabulary is built from ``samples`` via ``fit_on_texts``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(samples)
    return fitted

In [None]:
# One vocabulary for the input words and a separate one for the POS tags.
src_tokenizer, tar_tokenizer = (tokenize(corpus) for corpus in (sentences, pos_tags))

In [None]:
# The +1 leaves room for index 0, which this notebook uses as the padding value.
vocab_size = 1 + len(src_tokenizer.word_index)
tag_size = 1 + len(tar_tokenizer.word_index)

# Report both vocabulary sizes (words, then tags).
for size_template, size_value in (
    ('단어 집합의 크기 : {}', vocab_size),
    ('태깅 정보 집합의 크기 : {}', tag_size),
):
    print(size_template.format(size_value))

In [None]:
# Encode tokens and tags as integer-id sequences using the fitted vocabularies.
X_train, y_train = (
    src_tokenizer.texts_to_sequences(sentences),
    tar_tokenizer.texts_to_sequences(pos_tags),
)

In [None]:
# Fixed sequence length fed to the model.
max_len = 150

# Inputs and labels are padded with identical options so that every token
# position stays aligned with its tag. padding='post' appends the padding
# after the real tokens.
# NOTE(review): samples longer than max_len are cut by pad_sequences' default
# truncation setting — confirm that losing those tokens is acceptable.
padding_options = {'padding': 'post', 'maxlen': max_len}
X_train = pad_sequences(X_train, **padding_options)
y_train = pad_sequences(y_train, **padding_options)

In [None]:
# Hold out 20% of the padded samples as a test set (seeded with random_state=777).
# X_train / y_train are rebound to the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.2, random_state=777)

In [None]:
# One-hot encode the tag ids of both splits over tag_size classes, as required
# by the categorical cross-entropy loss used when compiling the model.
y_train, y_test = (
    to_categorical(tag_ids, num_classes=tag_size) for tag_ids in (y_train, y_test)
)

In [None]:
# Print the array shapes of the train/test inputs and labels as a sanity check.
for shape_template, array in (
    ('훈련 샘플 문장의 크기 : {}', X_train),
    ('훈련 샘플 레이블의 크기 : {}', y_train),
    ('테스트 샘플 문장의 크기 : {}', X_test),
    ('테스트 샘플 레이블의 크기 : {}', y_test),
):
    print(shape_template.format(array.shape))

## 양방향 LSTM(Bi-directional LSTM)으로 POS Tagger 만들기

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, Embedding, LSTM,TimeDistributed
from tensorflow.keras.optimizers import Adam

In [None]:
# POS tagger: embedding -> bidirectional LSTM -> per-token softmax over the tags.
model = Sequential([
    # mask_zero=True marks index 0 (the padding value) as masked.
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len, mask_zero=True),
    # return_sequences=True emits one output vector per timestep (many-to-many).
    Bidirectional(LSTM(256, return_sequences=True)),
    # Earlier many-to-many models wrapped this layer in TimeDistributed to pass
    # each timestep's hidden state to the output layer; that wrapper is no
    # longer needed, so Dense is used directly.
    Dense(tag_size, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer=Adam(0.001), metrics=['accuracy'])
model.summary()


In [None]:
# Train for 6 epochs with mini-batches of 128 samples.
# NOTE(review): the held-out test split is also passed as validation data, so the
# test accuracy reported afterwards is measured on data monitored during training.
model.fit(X_train, y_train, batch_size=128, epochs=6,  validation_data=(X_test, y_test))

In [None]:
# The model was compiled with metrics=['accuracy'], so element [1] of the
# evaluate() result is the accuracy (element [0] being the loss).
print("\n 테스트 정확도: %.4f" % (model.evaluate(X_test, y_test)[1]))

In [None]:
# Reverse lookups: integer id -> word / tag string.
index_to_word = src_tokenizer.index_word
index_to_tag = tar_tokenizer.index_word

# Index of the test sample to inspect.
i = 10

# predict() expects a batch, so wrap the single sample in a batch of one,
# then take the most probable tag id at every timestep.
y_pred = model.predict(np.array([X_test[i]]))
y_pred = np.argmax(y_pred, axis=-1)
# Recover the gold tag ids from the one-hot labels.
true = np.argmax(y_test[i], axis=-1)

# Table header: word | actual tag | predicted tag.
print("{:15}|{:5}|{}".format("단어", "실제값", "예측값"))
print(35 * "-")

for w, t, pred in zip(X_test[i], true, y_pred[0]):
    # Skip padding positions.
    if w == 0:
        continue
    # The softmax has tag_size outputs, including index 0 (padding), which has
    # no entry in index_to_tag. Fall back to a placeholder instead of raising
    # KeyError if the model predicts that class for a real token.
    predicted_tag = index_to_tag.get(pred, "pad")
    print("{:17}: {:7} {}".format(index_to_word[w], index_to_tag[t].upper(), predicted_tag.upper()))