In [43]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext.legacy import data, datasets
import time
import random

seed = 1004
random.seed(seed)
torch.manual_seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [51]:
# 필드 정의
text = data.Field(lower=True)
ud_tags = data.Field(unk_token = None)
ptb_tags = data.Field(unk_token = None)

fields = (('text', text), ('udtags', ud_tags), ('ptbtags', ptb_tags))

In [52]:
# dataset 만들기
train_data, valid_data, test_data = datasets.UDPOS.splits(fields)

In [53]:
print(f'훈련 샘플 개수 : {len(train_data)}')
print(f'검증 샘플 개수 : {len(valid_data)}')
print(f'테스트 샘플 개수 : {len(test_data)}')
# 훈련 데이터의 3개 필드 확인
print(train_data.fields)

훈련 샘플 개수 : 12543
검증 샘플 개수 : 2002
테스트 샘플 개수 : 2077
{'text': <torchtext.legacy.data.field.Field object at 0x000001F26E912788>, 'udtags': <torchtext.legacy.data.field.Field object at 0x000001F26E912F08>, 'ptbtags': <torchtext.legacy.data.field.Field object at 0x000001F26E912FC8>}


In [54]:
# 샘플1 의 text, udtags
print(vars(train_data.examples[0])['text'])
print(vars(train_data.examples[0])['udtags'])
# 샘플2 의 udtags
print(vars(train_data.examples[1])['udtags'])

['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.']
['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']
['PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'AUX', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADP', 'NOUN', 'PART', 'VERB', 'PUNCT', 'PUNCT']


## 단어 집합 생성

In [55]:
# 최소허용 빈도
min_freq = 5

# 사전 훈련된 워드 임베딩 Glove 다운로드
text.build_vocab(train_data, min_freq = min_freq, vectors = 'glove.6B.100d')
ud_tags.build_vocab(train_data)
ptb_tags.build_vocab(train_data)

# 상위 빈도수 단어 20개를 출력
print(text.vocab.freqs.most_common(20))

[('the', 9076), ('.', 8640), (',', 7021), ('to', 5137), ('and', 5002), ('a', 3782), ('of', 3622), ('i', 3379), ('in', 3112), ('is', 2239), ('you', 2156), ('that', 2036), ('it', 1850), ('for', 1842), ('-', 1426), ('have', 1359), ('"', 1296), ('on', 1273), ('was', 1244), ('with', 1216)]


- 토치텍스트는 기본적으로 빈도수가 가장 높은 단어부터 작은 숫자를 부여합니다.  
- 물론, `<unk>` 는 0번, <pad>는 1번으로 자동으로 부여되므로 제외

In [56]:
# 상위 정수 인덱스 10개 출력
print(text.vocab.itos[:10])
# 상위 빈도순 udtags 출력
print(ud_tags.vocab.freqs.most_common())

['<unk>', '<pad>', 'the', '.', ',', 'to', 'and', 'a', 'of', 'i']
[('NOUN', 34781), ('PUNCT', 23679), ('VERB', 23081), ('PRON', 18577), ('ADP', 17638), ('DET', 16285), ('PROPN', 12946), ('ADJ', 12477), ('AUX', 12343), ('ADV', 10548), ('CCONJ', 6707), ('PART', 5567), ('NUM', 3999), ('SCONJ', 3843), ('X', 847), ('INTJ', 688), ('SYM', 599)]


In [57]:
# 레이블에 속한 단어들의 분포를 출력해보자
def tag_percentage(tag_cnt) :
    total_cnt = sum([cnt for tag, cnt in tag_cnt]) 
    tag_cnt_per = [(tag, cnt, cnt/total_cnt) for tag, cnt in tag_cnt]
    return tag_cnt_per

print('Tag Occurences Percentage\n')
for tag, cnt, per in tag_percentage(ud_tags.vocab.freqs.most_common()) :
    print(f'{tag}\t{cnt}\t{per*100:4.1f}%')

Tag Occurences Percentage

NOUN	34781	17.0%
PUNCT	23679	11.6%
VERB	23081	11.3%
PRON	18577	 9.1%
ADP	17638	 8.6%
DET	16285	 8.0%
PROPN	12946	 6.3%
ADJ	12477	 6.1%
AUX	12343	 6.0%
ADV	10548	 5.2%
CCONJ	6707	 3.3%
PART	5567	 2.7%
NUM	3999	 2.0%
SCONJ	3843	 1.9%
X	847	 0.4%
INTJ	688	 0.3%
SYM	599	 0.3%


In [58]:
batch_size = 64

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size = batch_size,
        device = device)

In [59]:
batch = next(iter(train_iter))

In [60]:
print(batch)
batch.text.shape


[torchtext.legacy.data.batch.Batch of size 64 from UDPOS]
	[.text]:[torch.LongTensor of size 46x64]
	[.udtags]:[torch.LongTensor of size 46x64]
	[.ptbtags]:[torch.LongTensor of size 46x64]


torch.Size([46, 64])

In [61]:
class RNNPOSTagger(nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout) :
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional = bidirectional)
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    
    def forward(self, text) :
        embedded= self.dropout(self.embedding(text))
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        pred = self.fc(self.dropout(outputs))
        
        return pred

In [62]:
# params
input_dim = len(text.vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = len(ud_tags.vocab)
n_layers = 2
bidirectional = True
dropout = 0.25

model = RNNPOSTagger(
        input_dim, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional,
        dropout)

In [66]:
def count_parameters(model) :
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,027,510 trainable parameters


In [67]:
# 사전 훈련된 워드 임베딩 사용하기 ( glove)

pretrained_embeddings = text.vocab.vectors
print(pretrained_embeddings.shape)

torch.Size([3921, 100])


In [68]:
model.embedding.weight.data.copy_(pretrained_embeddings) # embedding vector value copy

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1020,  0.7700,  0.1169,  ..., -0.1416, -0.1932, -0.4225],
        [-0.0263,  0.0179, -0.5016,  ..., -0.8688,  0.9409, -0.2882],
        [ 0.1519,  0.4712,  0.0895,  ..., -0.4702, -0.3127,  0.1078]])

In [71]:
unk_idx = text.vocab.stoi[text.unk_token]
pad_idx = text.vocab.stoi[text.pad_token]
print(udk_idx)
print(pad_idx)

0
1


In [72]:
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim) # 0번 엠베딩 벡터에는 0값을 채움
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim) # 1번 엠베딩 벡터에는 1값을 채움
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1020,  0.7700,  0.1169,  ..., -0.1416, -0.1932, -0.4225],
        [-0.0263,  0.0179, -0.5016,  ..., -0.8688,  0.9409, -0.2882],
        [ 0.1519,  0.4712,  0.0895,  ..., -0.4702, -0.3127,  0.1078]])


## 옵티마이저와 비용함수 구현

In [73]:
tag_pad_idx = ud_tags.vocab.stoi[ud_tags.pad_token]
print(tag_pad_idx)

0


In [75]:
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=tag_pad_idx)

In [76]:
model  =model.to(device)
criterion = criterion.to(device)

In [77]:
pred = model(batch.text)
pred.shape

torch.Size([46, 64, 18])

In [78]:
pred = pred.view(-1, pred.shape[-1]) # 시퀀스 길이와 배치 길이를 펼쳐 줌
pred.shape

torch.Size([2944, 18])

In [79]:
batch.udtags.shape # 첫 번째 배치의 시퀀스 길이 * 배치 크기

torch.Size([46, 64])

In [80]:
batch.udtags.view(-1).shape

torch.Size([2944])

## 훈련 및 평가 

In [89]:
def categorical_accuracy(preds, y, tag_pad_idx) :
    '''
    미니 배치에 대한 정확도를 출력
    '''
    max_preds = preds.argmax(dim=1, keepdim=True)
    non_pad_elements = (y != tag_pad_idx).nonzero()
    corr = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return corr.sum() / torch.FloatTensor([y[non_pad_elements].shape[0]])

def train(model, iters, opt, criterion, tag_pad_idx) :
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iters :
        
        text = batch.text
        tags = batch.udtags
        
        optimizer.zero_grad()
        
        pred = model(text)
        
        pred = pred.view(-1, pred.shape[-1])
        tags = tags.view(-1)
        
        loss = criterion(pred, tags)
        
        acc = categorical_accuracy(pred, tags, tag_pad_idx)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()

        epoch_acc += acc.item()
        
    return epoch_loss / len(iters), epoch_acc / len(iters)
        
def _eval(model, iters, criterion, tag_pad_idx) :
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()

    with torch.no_grad() :
        
        for batch in iters :
            
            text = batch.text
            tags = batch.udtags
            
            pred = model(text)
            
            pred = pred.view(-1, pred.shape[-1])
            tags = tags.view(-1)
            
            loss = criterion(pred, tags)
            
            acc = categorical_accuracy(pred, tags, tag_pad_idx)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()   
            
    return epoch_loss / len(iters), epoch_acc / len(iters)

In [94]:
n_epochs = 10

best_valid_loss = float('inf')

for epoch in range(n_epochs) :
    
    train_loss, train_acc = train(model, train_iter, optimizer, criterion, tag_pad_idx)
    valid_loss, valid_acc = train(model, valid_iter, optimizer, criterion, tag_pad_idx)
    
    if valid_loss < best_valid_loss :
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
        
    print(f'epoch : {epoch+1:02}')
    print(f'\t train loss : {train_loss:.3f} | train accuracy : {train_acc * 100:.2f}%')
    print(f'\t validation loss : {valid_loss:.3f} | validation accuracy : {valid_acc * 100:.2f}%')          

epoch : 01
	 train loss : 1.084 | train accuracy : 66.24%
	 validation loss : 0.700 | validation accuracy : 77.90%
epoch : 02
	 train loss : 0.374 | train accuracy : 88.15%
	 validation loss : 0.503 | validation accuracy : 85.36%
epoch : 03
	 train loss : 0.298 | train accuracy : 90.31%
	 validation loss : 0.431 | validation accuracy : 86.69%
epoch : 04
	 train loss : 0.258 | train accuracy : 91.51%
	 validation loss : 0.368 | validation accuracy : 88.38%
epoch : 05
	 train loss : 0.235 | train accuracy : 92.28%
	 validation loss : 0.334 | validation accuracy : 89.25%
epoch : 06
	 train loss : 0.217 | train accuracy : 92.87%
	 validation loss : 0.315 | validation accuracy : 89.50%
epoch : 07
	 train loss : 0.202 | train accuracy : 93.27%
	 validation loss : 0.288 | validation accuracy : 90.15%
epoch : 08
	 train loss : 0.190 | train accuracy : 93.73%
	 validation loss : 0.270 | validation accuracy : 90.91%
epoch : 09
	 train loss : 0.179 | train accuracy : 94.03%
	 validation loss : 0.