- Sequence Labeling

  RNN 다 대 다 & bidirectional RNN -> 개체명 인식기, 품사 태거

  sequence labeling : 입력 시퀀스에 대해 레이블 시퀀스를 각각 부여

- bidirectional RNN을 이용한 품사 태깅



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy import datasets
import time
import random
# Everything in this notebook (iterators, model, loss) is placed on the CPU.
device = 'cpu'

# One field per column of the dataset: lower-cased tokens plus two tag sets.
# The tag fields are created without an unknown token.
text = data.Field(lower=True)
ud = data.Field(unk_token=None)
ptb = data.Field(unk_token=None)

# (attribute name, field) pairs; the attribute names are what each batch
# exposes later on (batch.text, batch.udtags, ...).
fields = (
    ("text", text),
    ("udtags", ud),
    ("ptbtags", ptb),
)

# NOTE: the name `train` is rebound to a function further down the notebook,
# so the dataset is only reachable under this name until that cell has run.
train, valid, test = datasets.UDPOS.splits(fields)

In [3]:
# Vocabulary construction.
# min_freq is handed to the text vocabulary as its minimum token frequency.
min_freq = 5

# The text vocabulary also loads the pretrained 100-d GloVe vectors.
text.build_vocab(
    train,
    min_freq=min_freq,
    vectors='glove.6B.100d',
)
ud.build_vocab(train)
ptb.build_vocab(train)

# Notebook display: the most frequent UD tags in the training split.
ud.vocab.freqs.most_common(20)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:17<00:00, 22225.98it/s]


[('NOUN', 34781),
 ('PUNCT', 23679),
 ('VERB', 23081),
 ('PRON', 18577),
 ('ADP', 17638),
 ('DET', 16285),
 ('PROPN', 12946),
 ('ADJ', 12477),
 ('AUX', 12343),
 ('ADV', 10548),
 ('CCONJ', 6707),
 ('PART', 5567),
 ('NUM', 3999),
 ('SCONJ', 3843),
 ('X', 847),
 ('INTJ', 688),
 ('SYM', 599)]

In [4]:
# Batch iterators for the three splits, all sharing one batch size and device.
batchsize = 64

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train, valid, test), batch_size=batchsize, device=device)

In [5]:
class RNNPOSTagger(nn.Module):
    """Sequence tagger: embedding -> LSTM -> per-position linear classifier.

    Dropout is applied to the embedded input and again to the LSTM outputs
    before the linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector (the LSTM input size).
            hidden_dim: LSTM hidden size per direction.
            output_dim: Number of scores produced for every position.
            n_layers: Number of stacked LSTM layers.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Probability used by the shared ``nn.Dropout`` module.
        """
        super().__init__()

        # A bidirectional LSTM concatenates both directions, so the linear
        # layer sees twice the hidden size in that case.
        num_directions = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
        )
        self.fc = nn.Linear(hidden_dim * num_directions, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return tag scores for every position of ``text``.

        ``text`` holds token indices. The LSTM is built without
        ``batch_first``, so PyTorch's default sequence-first layout applies.
        The result keeps the leading dimensions of the input and has
        ``output_dim`` as its last dimension.
        """
        rnn_input = self.dropout(self.embedding(text))

        # Only the per-step outputs are needed; final hidden/cell states are unused.
        rnn_output, _ = self.rnn(rnn_input)

        return self.fc(self.dropout(rnn_output))

In [6]:
# Hyper-parameters spelled out by name: 100-d embeddings, 128 hidden units per
# direction, a 2-layer bidirectional LSTM, and dropout 0.25.
model = RNNPOSTagger(
    vocab_size=len(text.vocab),
    embedding_dim=100,
    hidden_dim=128,
    output_dim=len(ud.vocab),
    n_layers=2,
    bidirectional=True,
    dropout=0.25,
).to(device)

In [7]:
# Start the embedding table from the vectors loaded with the text vocabulary.
pretrained_embeddings = text.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Indices of the special tokens (0 and 1 according to the original note).
unk = text.vocab.stoi[text.unk_token]
pad = text.vocab.stoi[text.pad_token]

# Overwrite the rows of both special tokens with 100-d zero vectors.
for special_idx in (unk, pad):
    model.embedding.weight.data[special_idx] = torch.zeros(100)

In [8]:
# Index of the padding tag; the loss is told to ignore targets with this value.
TAG_PAD_IDX = ud.vocab.stoi[ud.pad_token]

criterion = nn.CrossEntropyLoss(ignore_index=TAG_PAD_IDX)
criterion = criterion.to(device)

# Adam with its default settings over every model parameter.
optimizer = optim.Adam(model.parameters())

In [9]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """Return the accuracy of one mini-batch, ignoring padded positions.

    Args:
        preds: Score tensor with one row per position; the argmax over dim 1
            is taken as the predicted tag.
        y: 1-D tensor of gold tag indices, one per row of ``preds`` (the
            callers in this file flatten their tags with ``view(-1)``).
        tag_pad_idx: Tag index that marks padding; positions whose gold tag
            equals it are left out of the score.

    Returns:
        A float tensor of shape ``(1,)`` holding correct / counted positions.
        It lives on the same device as the inputs. The value is NaN when
        every position is padding (0 / 0).
    """
    keep = y != tag_pad_idx
    predicted = preds.argmax(dim=1)
    n_correct = (predicted[keep] == y[keep]).sum()

    # Derive the denominator from the mask so it shares the inputs' device.
    # A freshly built torch.FloatTensor is always a CPU tensor and cannot be
    # combined with non-CPU operands.
    n_counted = keep.sum()

    return (n_correct.float() / n_counted.float()).reshape(1)

In [10]:
def train(model, iter, optim, criterion, tag_pad_idx):
    """Run one training pass over ``iter`` and return per-batch averages.

    Args:
        model: Module called on ``batch.text`` to produce tag scores.
        iter: Sized iterable of batches exposing ``text`` and ``udtags``.
        optim: Optimizer; gradients are cleared before and a step is taken
            after every batch.
        criterion: Loss called with the flattened scores and flattened tags.
        tag_pad_idx: Padding tag index forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over ``len(iter)``.
    """
    model.train()

    loss_total = 0
    acc_total = 0

    for batch in iter:
        tokens, gold = batch.text, batch.udtags

        optim.zero_grad()
        scores = model(tokens)

        # Collapse every leading dimension so each row scores one position,
        # lining up with the flattened gold tags.
        flat_scores = scores.view(-1, scores.shape[-1])
        flat_gold = gold.view(-1)

        loss = criterion(flat_scores, flat_gold)
        acc = categorical_accuracy(flat_scores, flat_gold, tag_pad_idx)

        loss.backward()
        optim.step()

        loss_total += loss.item()
        acc_total += acc.item()

    n_batches = len(iter)
    return loss_total / n_batches, acc_total / n_batches

In [11]:
def evaluate(model, iterator, criterion, tag_pad_idx):
    """Score ``model`` on ``iterator`` without updating it.

    Args:
        model: Module called on ``batch.text`` to produce tag scores.
        iterator: Sized iterable of batches exposing ``text`` and ``udtags``.
        criterion: Loss called with the flattened scores and flattened tags.
        tag_pad_idx: Padding tag index forwarded to ``categorical_accuracy``.

    Returns:
        ``(mean loss, mean accuracy)``, each averaged over ``len(iterator)``.
    """
    model.eval()

    loss_total = 0
    acc_total = 0

    # Gradient tracking is switched off for the whole pass.
    with torch.no_grad():
        for batch in iterator:
            tokens, gold = batch.text, batch.udtags

            scores = model(tokens)

            # One row per position, matching the flattened gold tags.
            flat_scores = scores.view(-1, scores.shape[-1])
            flat_gold = gold.view(-1)

            loss = criterion(flat_scores, flat_gold)
            acc = categorical_accuracy(flat_scores, flat_gold, tag_pad_idx)

            loss_total += loss.item()
            acc_total += acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

In [12]:
# Train for 10 epochs, checkpointing whenever the validation loss improves.
best_valid_loss = float('inf')

for epoch in range(10):
    train_loss, train_acc = train(model, train_iter, optimizer, criterion, TAG_PAD_IDX)
    valid_loss, valid_acc = evaluate(model, valid_iter, criterion, TAG_PAD_IDX)

    # Only the weights with the lowest validation loss so far are kept on disk.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(
        f'Epoch: {epoch+1:02}',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

Epoch: 01
	Train Loss: 1.090 | Train Acc: 66.17%
	 Val. Loss: 0.660 |  Val. Acc: 79.91%
Epoch: 02
	Train Loss: 0.391 | Train Acc: 87.67%
	 Val. Loss: 0.528 |  Val. Acc: 83.50%
Epoch: 03
	Train Loss: 0.307 | Train Acc: 90.05%
	 Val. Loss: 0.486 |  Val. Acc: 84.46%
Epoch: 04
	Train Loss: 0.267 | Train Acc: 91.31%
	 Val. Loss: 0.459 |  Val. Acc: 85.07%
Epoch: 05
	Train Loss: 0.239 | Train Acc: 92.14%
	 Val. Loss: 0.456 |  Val. Acc: 85.14%
Epoch: 06
	Train Loss: 0.220 | Train Acc: 92.78%
	 Val. Loss: 0.428 |  Val. Acc: 85.58%
Epoch: 07
	Train Loss: 0.205 | Train Acc: 93.22%
	 Val. Loss: 0.412 |  Val. Acc: 86.29%
Epoch: 08
	Train Loss: 0.191 | Train Acc: 93.69%
	 Val. Loss: 0.419 |  Val. Acc: 86.10%
Epoch: 09
	Train Loss: 0.179 | Train Acc: 94.07%
	 Val. Loss: 0.409 |  Val. Acc: 87.02%
Epoch: 10
	Train Loss: 0.168 | Train Acc: 94.43%
	 Val. Loss: 0.400 |  Val. Acc: 88.02%


In [13]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss) so the test metrics describe the selected model
# rather than whatever the final epoch left in memory.
model.load_state_dict(torch.load('tut1-model.pt', map_location=device))

test_loss, test_acc = evaluate(model,test_iter, criterion,TAG_PAD_IDX)

print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.401 |  Test Acc: 87.30%


- Sequence to Sequence, seq2seq

  인코더와 디코더로 구성, 번역에서 주로 사용

  인코더 내부는 LSTM 또는 GRU 셀들로 구성

  인코더는 마지막 시점 RNN 셀의 은닉 상태를 디코더로 넘겨주는데, 이를 **컨텍스트 벡터**라 한다.

  - 테스트 단계

    디코더는 <sos>가 입력되면 다음에 등장할 확률이 높은 단어를 예측 -> <eos>가 다음 단어로 예측될 때까지, 예측한 단어를 다음 시점 RNN 셀의 입력으로 넣는 것을 반복

  - 훈련 단계와 교사 강요

    디코더에게 인코더가 보낸 컨텍스트 벡터와 실제 정답을 알려주며 훈련 : 교사 강요 teacher forcing

  - embedding layer

  - rnn 셀

  - 디코더

    인코더의 마지막 rnn 셀의 은닉 상태인 컨텍스트 벡터를 첫번째 은닉상태 값으로 사용. 다음에 등장할 단어 예측. 모든 단어로부터 softmax function을 이용하여 하나의 단어를 골라야 함.
 