In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
sentences = open("europarl-10_sample.en", "rt", encoding="utf8").readlines()
print("{:,}".format(len(sentences)))

100,000


In [5]:
import nltk
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')

[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\Chankoo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping misc\perluniprops.zip.


True

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from nltk.tokenize.moses import MosesTokenizer
from torch.utils.data import Dataset, DataLoader
from collections import Counter

from time import time

## Vocabulary 생성
### Parameters
- **corpus**: vocabulary 만들 때 사용할 문장
- **max_size**: 최대 vocabulary 크기
- **lang**: 사용할 언어

### Special tokens
- **PAD**: 문장길이를 맞춰주기 위한 토큰
- **EOS**: 문장의 마지막을 의미하는 토큰
- **UNK**: vocabulary에 존재하지 않는 단어를 의미하는 토큰

In [9]:
class Vocabulary(object):
    def __init__(self, corpus: list, max_size=10000, lang='en'):
        self.special_tokens = {
            0: '<PAD>',
            1: '<EOS>',
            2: '<UNK>'
        }
        
        self.dictionary = {}
        self.dictionary.update(self.special_tokens)
        self.reversed_dictionary = {}
        
        self.max_size = max_size
        self.tokenizer = MosesTokenizer(lang)
        
        self._build(corpus)
        
    def _build(self, sentences):
        counter = Counter()
        
        for sentence in sentences:
            sentence = sentence.strip()
            words = self.tokenizer.tokenize(sentence)
            counter.update(words)
        
        len_special_tokens = len(self.special_tokens)
        for i, (word, counter) in enumerate(counter.most_common(self.max_size - len_special_tokens)):
            self.dictionary[i+len_special_tokens] = word
        
        self.reversed_dictionary = dict(zip(self.dictionary.values(), self.dictionary.keys()))
    
    def to_vector(self, words: list):
        vector = []
        for word in words:
            # if the word doesn't exist, it returns index of '<UNK>'
            index = self.reversed_dictionary.get(word, 2)
            vector.append(index)
        return  vector
    
    def to_string(self, vector: list):
        words = []
        for index in vector:
            word = self.dictionary[index]
            words.append(word)
            
        return words
    
vocab = Vocabulary(sentences)

In [10]:
print(list(vocab.dictionary.items())[:10])
print(list(vocab.reversed_dictionary.items())[:10])

[(0, '<PAD>'), (1, '<EOS>'), (2, '<UNK>'), (3, '.'), (4, 'is'), (5, 'the'), (6, 'to'), (7, 'The'), (8, ','), (9, 'I')]
[('<PAD>', 0), ('<EOS>', 1), ('<UNK>', 2), ('.', 3), ('is', 4), ('the', 5), ('to', 6), ('The', 7), (',', 8), ('I', 9)]


## 데이터 로딩을 위한 dataset class 생성
- torch.utils.data 모듈 안에 있는 **Dataset** 클래스를 상속
- 모든 문장 뒤에는 문장의 끝을 알리는 **EOS(End of Sentence)** 토큰을 추가
- 문장 내의 단어들을 해당하는 **index**로 변환

### Parameters
- **corpus**: train 시 사용할 문장
- **vocab**: 만들어 놓은 vocabulary
- **lang**: 사용할 언어

In [11]:
class SequenceDataset(Dataset):
    def __init__(self, corpus: list, vocab: Vocabulary, lang='en'):
        super(SequenceDataset, self).__init__()
        self.vocab = vocab
        self.corpus = corpus
        self.tokenizer = MosesTokenizer(lang)
    
    def __getitem__(self, item):
        source_sent = self.corpus[item].strip()
        source_sent = self.tokenizer.tokenize(source_sent)
        
        source_sent.append('<EOS>')
        source_sent = self.vocab.to_vector(source_sent)
        
        return source_sent
    
    def __len__(self):
        return len(self.corpus)
    


## collate_fn
- **dataloader**에 사용되는 함수
- 서로 다른 길이를 가진 문장들이 같은 길이를 갖도록 PAD 토큰 추가

In [12]:
def collate_fn(source):
    max_src = max([len(s) for s in source])
    
    for s in source:
        s += [vocab.reversed_dictionary['<PAD>']] * (max_src - len(s))
        
    return source
    

## Language Model 클래스
- *'나는 학교에 간다'* 문장이 존재
- **'나는'** 이 입력으로 주어지면 **'학교에'**를 예측
- **'학교에'**가 주어지면 **'간다'**를 예측

### Paramters
- **vocab_size**: 최대 vocabulary 크기를 의미
- **embed_size**: 하나의 단어를 vector 형태로 변환할 때, vector의 크기를 의미
- **hidden_size**: RNN(Recurrent Neural Network)를 계산할 시, 사용하는 matrix의 크기를 의미
- **num_layers**: RNN의 layer 수

In [13]:
class LM(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(LM, self).__init__()
        self.hidden_size = hidden_size
        
        self.embed = nn.Embedding(vocab_size, embed_size, padding_idx=vocab.reversed_dictionary['<PAD>'])
        self.rnn = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
      
    def forward(self, x, h):
        #x.shape = batchsize x sequence length x vocab size
        x = self.embed(x)
        #x.shape = batchsize x sequence length x embeded size
        
        out, h = self.rnn(x, h)
        out = out.reshape(-1, self.hidden_size)
        out = self.linear(out)
        return out, h
    
    def predict(self, x, h):
        seq_len = x.shape[1]
        x = self.embed(x)

        out, h = self.rnn(x, h)
        out = out[:, :-1].reshape(-1, self.hidden_size)
        out = self.linear(out)
        out = F.softmax(out, dim=1).argmax(dim=1)
        return out

In [14]:
vocab_size = 10000
embed_size = 128
hidden_size = 512
num_layers = 1

LM_model = LM(vocab_size, embed_size, hidden_size, num_layers)

In [15]:
batch_size = 32
dataset = SequenceDataset(sentences, vocab)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn)

criterion = nn.CrossEntropyLoss(ignore_index=vocab.reversed_dictionary['<PAD>'], reduction='sum')
optimizer = torch.optim.Adam(LM_model.parameters(), lr=0.002)

start = time()
for step, s in enumerate(dataloader):
    s = torch.tensor(s, dtype=torch.long, requires_grad=False)
    h = torch.zeros(num_layers, batch_size, hidden_size, requires_grad=False)
    outputs, _ = LM_model(s[:, :-1], h)
    targets = s[:, :-1].reshape(-1, )

    loss = criterion(outputs, targets) / batch_size
    LM_model.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(LM_model.parameters(), max_norm=0.5) # gradient vanishing 방지
    optimizer.step()

    if step % 10 == 0:
        print("Step: {:7} Loss: {:.6f}".format(step, loss))
    
    if loss <= 3.0:
        break
print("Time: {}".format(time() - start))

Step:       0 Loss: 64.468842
Step:      10 Loss: 39.141621
Step:      20 Loss: 26.681957
Step:      30 Loss: 24.786127
Step:      40 Loss: 18.379395
Step:      50 Loss: 15.402044
Step:      60 Loss: 15.963209
Step:      70 Loss: 12.279917
Step:      80 Loss: 17.347952
Step:      90 Loss: 11.721835
Step:     100 Loss: 8.945443
Step:     110 Loss: 8.908700
Step:     120 Loss: 8.429061
Step:     130 Loss: 5.869609
Step:     140 Loss: 5.861592
Step:     150 Loss: 6.861679
Step:     160 Loss: 5.978749
Step:     170 Loss: 4.370531
Step:     180 Loss: 5.703103
Step:     190 Loss: 6.271022
Step:     200 Loss: 3.485357
Step:     210 Loss: 3.499555
Step:     220 Loss: 6.591218
Step:     230 Loss: 5.113854
Time: 69.27584052085876


In [16]:
results = LM_model.predict(s, h)
results = results.chunk(batch_size)

for ori, pred in zip(s[:, :-1], results):
    ori = ori.cpu().numpy()
    pred = pred.cpu().numpy()
    
    ori = vocab.to_string(ori)
    pred = vocab.to_string(pred)
    
    print("Original: {}".format(" ".join(ori)))
    print("Predicted: {}".format(" ".join(pred)))
    print()

Original: But they were betrayed . <EOS> <PAD> <PAD> <PAD> <PAD>
Predicted: But they were May . <EOS> basic . <EOS> .

Original: The next item is the vote . <EOS> <PAD> <PAD>
Predicted: The next item is the vote . <EOS> basic we

Original: We must support innovation . <EOS> <PAD> <PAD> <PAD> <PAD>
Predicted: We must support innovation . <EOS> basic <EOS> <EOS> <EOS>

Original: Then it is one for Parliament to deal with .
Predicted: Then it is one for Parliament to deal with .

Original: This can be put down to one simple reason .
Predicted: This can be put down to one simple reason .

Original: This is unfair , as Mr Haarder also said .
Predicted: This is unfair , as Mr Busquin also said .

Original: A divided Europe can have no part in this .
Predicted: A divided Europe can have no part in this .

Original: . <EOS> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
Predicted: . <EOS> basic <EOS> <EOS> . <EOS> <EOS> <EOS> <EOS>

Original: We have therefore voted against the report . <EOS>