# PyTorchチュートリアル　〜torchtextのデータを使って翻訳モデルを作る〜

In [3]:
from pathlib import Path
import pandas as pd
import numpy as np
import random
from typing import Tuple
import math
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch import Tensor
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

## 使用するデータの取得

今回使用するデータはtorchtextで用意されているものである。  
そこで、torchtextに関して理解する。torchtextの解説は[このブログ](https://qiita.com/itok_msi/items/1f3746f7e89a19dafac5)を参考にしている。  

Fieldクラスは読み込んだデータに施す前処理とその結果を管理するクラス  
テキストをディープラーニングのモデルに取り込めるようにするためには、モデルが理解できるようにテキストを数値ベクトルに置き換えなければいけない。  
そのためのプロセスは、定型的ではあるものの、各プロセスでやり方が色々あるため一見複雑に見える。そのため、その工程を管理する事ができれば楽に  
必要なデータに変換できる。そのためのクラスである。  
テキストを数値ベクトルに変換する工程は以下の通りである。（今回は文章を各単語に分割して単語を１単位とする学習データを想定している）  
1. テキストを各単語に分割  
1. 各単語に前処理を行う
1. 各単語をインデックスに変換
1. インデックスを数値ベクトルに変換

　今回は文章を単語に分割するために「[spaCy](https://spacy.io/)」を使っている。これは自然言語処理を行うためのオープンソース・ソフトウェア・ライブラリである。  
このライブラリはPyTorch以外にも、自然言語処理を扱う他のモジュール（TensorFlow, scikit-learn, Gensimなど）でも使うことができる。

In [5]:
# Both fields share the same preprocessing options: spaCy tokenisation,
# lower-casing, and explicit <sos>/<eos> markers around every sentence.
# Only the tokenizer language differs between source and target.
_COMMON_FIELD_KWARGS = dict(
    tokenize="spacy",
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
)

SRC = Field(tokenizer_language="de", **_COMMON_FIELD_KWARGS)  # source side ("de")
TRG = Field(tokenizer_language="en", **_COMMON_FIELD_KWARGS)  # target side ("en")

In [6]:
# Load the three Multi30k splits. `exts` and `fields` are given in the same
# (source, target) order so that SRC handles ".de" and TRG handles ".en".
_LANGUAGE_EXTS = ('.de', '.en')
train_data, valid_data, test_data = Multi30k.splits(exts=_LANGUAGE_EXTS, fields=(SRC, TRG))

Multi30kのデータは翻訳機械学習用のデータセットであり、指定した言語のデータを取得することができる。  
上記では、ドイツ語と英語を指定し、fields引数に上記で定義したFieldオブジェクト（SRC, TRG）を渡している。  
このデータには主要なものとして「Fieldクラス」と「文章」が入っている

In [7]:
# Display the dataset's mapping from field name ('src' / 'trg') to Field object.
train_data.fields

{'src': <torchtext.data.field.Field at 0x11bdab588>,
 'trg': <torchtext.data.field.Field at 0x11defe668>}

In [8]:
# Peek at the first training example: the tokenised source and target sentences.
examples = train_data.examples
example = examples[0]
src, trg = example.src, example.trg
print(src, trg, sep="\n")

['zwei', 'junge', 'weiße', 'männer', 'sind', 'im', 'freien', 'in', 'der', 'nähe', 'vieler', 'büsche', '.']
['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']


In [9]:
# Build both vocabularies from the training split only, passing the same
# minimum-frequency threshold to each field.
MIN_FREQ = 2
for _field in (SRC, TRG):
    _field.build_vocab(train_data, min_freq=MIN_FREQ)

In [10]:
# Show the first ten (token, index) pairs of each vocabulary's
# string-to-index table.
src_tmp, trg_tmp = (
    list(train_data.fields[name].vocab.stoi.items())[:10]
    for name in ('src', 'trg')
)
print(src_tmp, trg_tmp, sep="\n")

[('<unk>', 0), ('<pad>', 1), ('<sos>', 2), ('<eos>', 3), ('.', 4), ('ein', 5), ('einem', 6), ('in', 7), ('eine', 8), (',', 9)]
[('<unk>', 0), ('<pad>', 1), ('<sos>', 2), ('<eos>', 3), ('a', 4), ('.', 5), ('in', 6), ('the', 7), ('on', 8), ('man', 9)]


単語をインデックス化し、バッチ内で系列長が揃うように`<pad>`で埋めたうえで、指定したバッチサイズでミニバッチ化する

In [11]:
BATCH_SIZE = 128

# One iterator per split, all sharing the same batch size.
_datasets = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    _datasets, batch_size=BATCH_SIZE
)

In [12]:
# Pull a single mini-batch and print its source/target index tensors together
# with the shape of the source tensor.
batch = next(iter(train_iterator))
print(batch.src, batch.trg, batch.src.shape, sep="\n")

tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  5,  18,   8,  ...,  54,   8,   5],
        [ 13, 330, 113,  ...,  74,  36,  49],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
tensor([[  2,   2,   2,  ...,   2,   2,   2],
        [  4,  16,   4,  ...,  19,   4,   4],
        [  9, 326,  87,  ...,  17,  38, 348],
        ...,
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1],
        [  1,   1,   1,  ...,   1,   1,   1]])
torch.Size([33, 128])


## 翻訳モデルの構築

GRUの定義はこのURLに記載してある。  
https://pytorch.org/docs/stable/nn.html#gru

In [13]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds the source token indices, runs them through a single-layer
    bidirectional GRU and projects the concatenated final hidden states of both
    directions to the decoder's hidden size.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Embedding dimension.
        enc_hid_dim: Hidden size of each GRU direction.
        dec_hid_dim: Size of the hidden vector returned for the decoder.
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout):
        super().__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        # `self.dropout` is the nn.Dropout module created below. The raw float
        # is deliberately not stored under the same name first: that value was
        # overwritten immediately and only obscured the attribute's real type.

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, enc_hid_dim, bidirectional=True)
        # Both directions are concatenated, hence the factor of two.
        self.fc = nn.Linear(enc_hid_dim * 2, dec_hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Encode a batch of source sequences.

        Args:
            src: Token indices; the GRU is sequence-first, so the expected
                layout is (src_len, batch).

        Returns:
            Tuple ``(outputs, hidden)`` where ``outputs`` is the GRU output for
            every time step (last dimension ``2 * enc_hid_dim``) and ``hidden``
            is a (batch, dec_hid_dim) tensor squashed through ``tanh``.
        """
        embedded = self.dropout(self.embedding(src))

        outputs, hidden = self.rnn(embedded)

        # The last two entries of the hidden stack are the final states of the
        # two GRU directions; join them along the feature axis.
        final_states = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        hidden = torch.tanh(self.fc(final_states))

        return outputs, hidden

In [14]:
class Attention(nn.Module):
    """Additive-style attention over the encoder outputs.

    A single linear layer followed by ``tanh`` scores the concatenation of the
    decoder hidden state with each encoder output; the scores are summed over
    the attention dimension and normalised with a softmax over source positions.

    Args:
        enc_hid_dim: Hidden size of one encoder direction.
        dec_hid_dim: Hidden size of the decoder.
        attn_dim: Output size of the scoring layer.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim, attn_dim):
        super().__init__()

        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        # Width of [decoder hidden ; encoder output]; also read by Decoder.
        self.attn_in = (enc_hid_dim * 2) + dec_hid_dim
        self.attn = nn.Linear(self.attn_in, attn_dim)

    def forward(self, decoder_hidden, encoder_outputs):
        """Return attention weights of shape (batch, src_len).

        Args:
            decoder_hidden: Current decoder state, (batch, dec_hid_dim).
            encoder_outputs: Encoder outputs, (src_len, batch, 2 * enc_hid_dim).
        """
        n_steps = encoder_outputs.shape[0]

        # Copy the decoder state once per source position: (batch, src_len, dec_hid_dim).
        hidden_per_step = decoder_hidden.unsqueeze(1).repeat(1, n_steps, 1)

        # Switch the encoder outputs to batch-first so both tensors line up.
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        features = torch.cat((hidden_per_step, batch_first_outputs), dim=2)
        energy = torch.tanh(self.attn(features))

        # Collapse the attention dimension into one score per source position.
        scores = energy.sum(dim=2)

        return F.softmax(scores, dim=1)

In [15]:
class Decoder(nn.Module):
    """Single-step GRU decoder with attention.

    Each call consumes one target token per batch element, attends over the
    encoder outputs, advances the GRU by one step and produces scores over the
    target vocabulary.

    Args:
        output_dim: Target vocabulary size (embedding rows and output width).
        emb_dim: Embedding dimension.
        enc_hid_dim: Hidden size of one encoder direction.
        dec_hid_dim: Hidden size of the decoder GRU.
        dropout: Dropout probability applied to the embeddings.
        attention: Callable module returning (batch, src_len) weights and
            exposing an ``attn_in`` attribute.
    """

    def __init__(self, output_dim, emb_dim, enc_hid_dim, dec_hid_dim, dropout, attention):
        super().__init__()

        self.emb_dim = emb_dim
        self.enc_hid_dim = enc_hid_dim
        self.dec_hid_dim = dec_hid_dim
        self.output_dim = output_dim
        self.attention = attention
        # `self.dropout` is the nn.Dropout module created below. The raw float
        # is deliberately not stored under the same name first: that value was
        # overwritten immediately and only obscured the attribute's real type.

        self.embedding = nn.Embedding(output_dim, emb_dim)

        # GRU input = [embedded token ; attention-weighted encoder outputs].
        self.rnn = nn.GRU((enc_hid_dim * 2) + emb_dim, dec_hid_dim)

        # Output layer input = [GRU output ; weighted encoder outputs ; embedding],
        # i.e. attn_in (= 2 * enc_hid_dim + dec_hid_dim) plus emb_dim.
        self.out = nn.Linear(self.attention.attn_in + emb_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def _weighted_encoder_rep(self, decoder_hidden, encoder_outputs):
        """Return the attention-weighted sum of encoder outputs, (1, batch, 2 * enc_hid_dim)."""
        # (batch, src_len) -> (batch, 1, src_len) so it can be used with bmm.
        weights = self.attention(decoder_hidden, encoder_outputs).unsqueeze(1)

        # (src_len, batch, feat) -> (batch, src_len, feat)
        batch_first_outputs = encoder_outputs.permute(1, 0, 2)

        # (batch, 1, src_len) @ (batch, src_len, feat) -> (batch, 1, feat)
        weighted = torch.bmm(weights, batch_first_outputs)

        # Back to sequence-first with a length-1 time axis.
        return weighted.permute(1, 0, 2)

    def forward(self, input, decoder_hidden, encoder_outputs):
        """Advance the decoder by one time step.

        Args:
            input: Current target token indices, shape (batch,).
            decoder_hidden: Previous decoder state, (batch, dec_hid_dim).
            encoder_outputs: Encoder outputs, (src_len, batch, 2 * enc_hid_dim).

        Returns:
            Tuple ``(output, decoder_hidden)``: vocabulary scores of shape
            (batch, output_dim) and the new decoder state (batch, dec_hid_dim).
        """
        # Add the length-1 time axis expected by the sequence-first GRU.
        input = input.unsqueeze(0)

        embedded = self.dropout(self.embedding(input))

        weighted_encoder_rep = self._weighted_encoder_rep(decoder_hidden, encoder_outputs)

        rnn_input = torch.cat((embedded, weighted_encoder_rep), dim=2)

        output, decoder_hidden = self.rnn(rnn_input, decoder_hidden.unsqueeze(0))

        # Drop the time axis again before the final projection.
        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted_encoder_rep = weighted_encoder_rep.squeeze(0)

        output = self.out(torch.cat((output, weighted_encoder_rep, embedded), dim=1))

        return output, decoder_hidden.squeeze(0)

In [16]:
class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes a target sequence step by step.

    Args:
        encoder: Module called as ``encoder(src)`` returning
            ``(encoder_outputs, hidden)``.
        decoder: Module called as ``decoder(token, hidden, encoder_outputs)``
            returning ``(scores, hidden)`` and exposing ``output_dim``.
    """

    def __init__(self, encoder, decoder):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Produce vocabulary scores for every target position.

        Args:
            src: Source token indices, (src_len, batch).
            trg: Target token indices, (trg_len, batch); row 0 is fed to the
                decoder as the first input.
            teacher_forcing_ratio: Probability, drawn independently per time
                step, of feeding the ground-truth token instead of the model's
                own prediction as the next decoder input.

        Returns:
            Tensor of shape (trg_len, batch, output_dim). Row 0 is never
            written and stays zero.
        """
        batch_size = src.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # Allocate the result buffer on the same device as the inputs; a
        # default-device buffer would make `outputs[t] = ...` fail as soon as
        # the model and batch live on a GPU.
        outputs = torch.zeros(
            max_len,
            batch_size,
            trg_vocab_size,
            device=src.device,
        )

        encoder_outputs, hidden = self.encoder(src)

        # first input to the decoder is the <sos> token
        decoder_input = trg[0, :]

        for t in range(1, max_len):
            step_scores, hidden = self.decoder(decoder_input, hidden, encoder_outputs)
            outputs[t] = step_scores

            teacher_force = random.random() < teacher_forcing_ratio
            top1 = step_scores.max(1)[1]  # index of the highest-scoring token
            decoder_input = trg[t] if teacher_force else top1

        return outputs

In [17]:
def init_weights(m):
    """Re-initialise every parameter of ``m`` in place.

    Parameters whose name contains ``'weight'`` are drawn from a normal
    distribution (mean 0, std 0.01); every other parameter is set to zero.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)

In [18]:
def count_parameters(model):
    """Return the total number of elements across the trainable parameters of ``model``."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

### モデルのパラメータの設定

In [19]:
INPUT_DIM = len(SRC.vocab)     # number of tokens in the source vocabulary
OUTPUT_DIM = len(TRG.vocab)    # number of tokens in the target vocabulary

ENC_EMB_DIM = 32               # embedding dimension used by the Encoder
DEC_EMB_DIM = 32               # embedding dimension used by the Decoder
ENC_HID_DIM = 64               # hidden-state dimension of the Encoder
DEC_HID_DIM = 64               # hidden-state dimension of the Decoder
ATTN_DIM = 8                   # hidden dimension of the Attention layer
ENC_DROPOUT = 0.5              # dropout probability for the Encoder
DEC_DROPOUT = 0.5              # dropout probability for the Decoder

In [20]:
# Assemble the network bottom-up: encoder, attention, decoder, then the
# Seq2Seq wrapper that ties them together.
enc = Encoder(input_dim=INPUT_DIM, emb_dim=ENC_EMB_DIM, enc_hid_dim=ENC_HID_DIM,
              dec_hid_dim=DEC_HID_DIM, dropout=ENC_DROPOUT)

attn = Attention(enc_hid_dim=ENC_HID_DIM, dec_hid_dim=DEC_HID_DIM, attn_dim=ATTN_DIM)

dec = Decoder(output_dim=OUTPUT_DIM, emb_dim=DEC_EMB_DIM, enc_hid_dim=ENC_HID_DIM,
              dec_hid_dim=DEC_HID_DIM, dropout=DEC_DROPOUT, attention=attn)

model = Seq2Seq(encoder=enc, decoder=dec)

# Re-initialise all parameters with init_weights before handing them to Adam.
model.apply(init_weights)

optimizer = optim.Adam(model.parameters())

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,856,685 trainable parameters


In [21]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Called as ``model(src, trg)``; must return scores whose first
            axis is the target position and whose last axis is the vocabulary.
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: Optimizer stepping the parameters of ``model``.
        criterion: Loss taking (flattened scores, flattened targets).
        clip: Maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    # Put the model into training mode.
    model.train()

    running_loss = 0

    for batch in iterator:
        src, trg = batch.src, batch.trg

        optimizer.zero_grad()

        output = model(src, trg)

        # Skip target position 0 and flatten the remaining positions so that
        # scores and targets line up row by row for the criterion.
        vocab_size = output.shape[-1]
        flat_scores = output[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()

        # Limit the gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [25]:
def evaluate(model, iterator, criterion):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: Called as ``model(src, trg, 0)``, i.e. with the third
            (teacher-forcing) argument set to zero.
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: Loss taking (flattened scores, flattened targets).

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    # Switch to evaluation mode (e.g. disables Dropout).
    model.eval()

    running_loss = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            output = model(src, trg, 0)  # turn off teacher forcing

            # Skip target position 0 and flatten the rest for the criterion.
            vocab_size = output.shape[-1]
            flat_scores = output[1:].view(-1, vocab_size)
            flat_targets = trg[1:].view(-1)

            running_loss += criterion(flat_scores, flat_targets).item()

    return running_loss / len(iterator)

In [26]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into whole minutes and seconds.

    Args:
        start_time: Start timestamp in seconds.
        end_time: End timestamp in seconds.

    Returns:
        Tuple ``(minutes, seconds)``, both truncated to ``int``.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    leftover_seconds = int(elapsed - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [28]:
N_EPOCHS = 30
CLIP = 1
PAD_IDX = TRG.vocab.stoi['<pad>']

# Target positions holding the padding index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

# Track the best validation loss seen so far together with a copy of the
# weights that produced it. Previously `best_valid_loss` was initialised but
# never updated, so the final test evaluation always used last-epoch weights.
best_valid_loss = float('inf')
best_model_state = None

train_loss_list = []
valid_loss_list = []
for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

    train_loss_list.append(train_loss)
    valid_loss_list.append(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        # state_dict() tensors share storage with the live parameters, so
        # clone them to get a snapshot that later epochs cannot modify.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# Evaluate on the test split with the best-validation weights. The snapshot is
# None only if no epoch ever improved on infinity (e.g. every loss was NaN).
if best_model_state is not None:
    model.load_state_dict(best_model_state)

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

Epoch: 01 | Time: 9m 8s
	Train Loss: 4.668 | Train PPL: 106.505
	 Val. Loss: 4.876 |  Val. PPL: 131.090
Epoch: 02 | Time: 9m 41s
	Train Loss: 4.476 | Train PPL:  87.914
	 Val. Loss: 4.836 |  Val. PPL: 125.923
Epoch: 03 | Time: 9m 25s
	Train Loss: 4.371 | Train PPL:  79.153
	 Val. Loss: 4.758 |  Val. PPL: 116.523
Epoch: 04 | Time: 8m 51s
	Train Loss: 4.267 | Train PPL:  71.339
	 Val. Loss: 4.696 |  Val. PPL: 109.538
Epoch: 05 | Time: 8m 58s
	Train Loss: 4.160 | Train PPL:  64.052
	 Val. Loss: 4.672 |  Val. PPL: 106.939
Epoch: 06 | Time: 8m 56s
	Train Loss: 4.060 | Train PPL:  57.958
	 Val. Loss: 4.554 |  Val. PPL:  95.030
Epoch: 07 | Time: 9m 0s
	Train Loss: 3.966 | Train PPL:  52.797
	 Val. Loss: 4.491 |  Val. PPL:  89.252
Epoch: 08 | Time: 8m 57s
	Train Loss: 3.893 | Train PPL:  49.052
	 Val. Loss: 4.426 |  Val. PPL:  83.572
Epoch: 09 | Time: 8m 59s
	Train Loss: 3.806 | Train PPL:  44.966
	 Val. Loss: 4.298 |  Val. PPL:  73.527
Epoch: 10 | Time: 8m 59s
	Train Loss: 3.723 | Train PPL: 

In [29]:
# Collect the per-epoch losses into a table (epochs numbered from 1), save it
# as CSV without the index column, and display it.
_epoch_numbers = np.arange(1, len(train_loss_list) + 1)
result = pd.DataFrame({
    'epoch': _epoch_numbers,
    'train_loss': train_loss_list,
    'valid_loss': valid_loss_list,
})
result.to_csv('model_result.csv', index=False)
result

Unnamed: 0,epoch,train_loss,valid_loss
0,1,4.668191,4.875884
1,2,4.476356,4.83567
2,3,4.371384,4.75809
3,4,4.267446,4.696274
4,5,4.159692,4.672259
5,6,4.059719,4.554195
6,7,3.96646,4.491463
7,8,3.892877,4.425706
8,9,3.805901,4.297657
9,10,3.722802,4.19862
