In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn, optim

import numpy as np

import time
import math

import matplotlib.pyplot as plt

import tqdm

import easydict

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

In [2]:

class MultiHeadAttention(nn.Module):
    def __init__(self, 
                 emb_dim: int, 
                 num_heads: int, 
                 drop_out: float = 0.0,
                 bias: bool = False, 
                 encoder_decoder_attention: bool = False,
                 causal: bool = False):
        '''Initialize MultiHeadAttention class variables.
        
        Args:
            emb_dim (int): Dimension of a word * number of heads.
            num_heads (int): Number of attention heads.
            drop_out (float): Drop out rate.
            bias (bool): Boolean that indicating whether to use bias or not.
            encoder_decoder_attention (bool): Boolean that indicating whether the multi head
                                              attention is encoder-decoder attention or not.
            causal (bool): Boolean that indicating whether to use causal mask or not.
        '''
        super().__init__()
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.head_dim = emb_dim // num_heads
        assert self.head_dim * num_heads == emb_dim, "emb_dim must be divisible by num_heads"
        
        self.drop_out = drop_out
        self.encoder_decoder_attention = encoder_decoder_attention
        self.causal = causal
        
        self.wk = nn.Linear(self.emb_dim, self.emb_dim, bias = bias)
        self.wq = nn.Linear(self.emb_dim, self.emb_dim, bias = bias)
        self.wv = nn.Linear(self.emb_dim, self.emb_dim, bias = bias)
        self.output = nn.Linear(self.emb_dim, self.emb_dim, bias = bias)
    
    def multi_head_scaled_dot_product(self,
                                      query: torch.Tensor,
                                      key: torch.Tensor,
                                      value: torch.Tensor,
                                      attention_mask: torch.BoolTensor):
        '''Perform multi-head version of scaled dot product.
        
        Args:
            query (Tensor): shape '(batch size, # attention head, seqence length, demension of head)'
            key (Tensor): shape '(batch size, # attention head, seqence length, demension of head)'
            value (Tensor): shape '(batch size, # attention head, seqence length, demension of head)'
            attention_mask: This mask can be either causal mask or padding mask.
                            shape '(batch size, source squence length)' for padding mask.
                            shape '(sequence length, target sequence length)' for causal mask.
        Returns:
            attn_output (Tensor): output of attention mechanism. shape '(batch size, seq_len, emb_dim)'
            attn_weights (Tensor): value of attention weight of each word. shape '(batch size, # attn head, seq_len, seq_len)'
        '''
        
        attn_weights = torch.matmul(query, key.transpose(-1,-2)) / math.sqrt(self.head_dim)
        '''shape of attn_weights : (batch size, # attn head, seq_len, seq_len)'''
        
        if attention_mask is not None:
            if self.causal:
                '''Masking future info for encoder-decoder attention.'''
                attn_weights = attn_weights.masked_fill(attention_mask.unsqueeze(0).unsqueeze(1), float('-inf'))
                '''
                shape of attention_mask : (trg_len, trg_len).
                shape of attention_mask.unsqueeze(0).unsqueeze(1) : (1, 1, trg_len, trg_len).
                '''
            else:
                '''Masking padding token so that it is not used for attention.'''
                attn_weights = attn_weights.masked_fill(attention_mask.unsqueeze(1).unsqueeze(2), float('-inf'))
                '''
                shape of attention_mask : (batch_size, src_len)
                shape of attention_mask.unsqueeze(1).unsqueeze(2) : (batch_size, 1, 1, src_len)
                '''
        attn_weights = F.softmax(attn_weights, dim = -1)
        attn_probs = F.dropout(attn_weights, p=self.drop_out, training=self.training)
        
        attn_output = torch.matmul(attn_probs, value)
        '''shape of attn_output : (batch size, # attn head, seq_len, head_dim)'''
        attn_output = attn_output.permute(0, 2, 1, 3).contiguous()
        '''shape of attn_output : (batch size, seq_len, # attn head, head_dim)'''
        shape = attn_output.size()[:-2] + (self.emb_dim,)
        attn_output = attn_output.view(*shape)
        '''shape of attn_output : (batch size, seq_len, emb_dim)'''
        attn_output = self.output(attn_output)
        
        return attn_output, attn_weights
    
    def transform_to_multi_head(self, 
                                x: torch.Tensor):
        ''' Reshape input
        
        Args:
            x (Tensor): shape '(batch_size, seq_len, emb_dim)'
        
        Returns:
            Tensor: shape '(batch_size, # attn head, seq_len, head_dim)'
        '''
        
        shape = x.size()[:-1] + (self.num_heads, self.head_dim,)
        x = x.view(*shape)
        
        return x.permute(0, 2, 1, 3)
    
    def forward(self,
                query: torch.Tensor,
                key: torch.Tensor,
                attention_mask: torch.Tensor = None):
        '''
        Args:
            query (Tensor): shape '(batch_size, seq_len, emb_dim)'
            key (Tensor): shape '(batch_size, seq_len, emb_dim)'
            attention_mask (Tensor): shape '(batch size, squence length)' for padding mask.
                                     shape '(sequence length, sequence length)' for causal mask.
        
        Returns:
            attn_output (Tensor): output of attention mechanism. shape '(batch size, seq_len, emb_dim)'
            attn_weights (Tensor): value of attention weight of each word. shape '(batch size, # attn head, seq_l
        '''
        
        q = self.wq(query)
        
        # encoder-decoder attention
        if self.encoder_decoder_attention:
            '''
            query is output of encoder
            key is input of decoder
            '''
            k = self.wk(key)
            v = self.wv(key)
        
        # self attention
        else:
            '''
            Both of query and key are input of encoder(query is same with key).
            '''
            k = self.wk(query)
            v = self.wv(query)
        
        q = self.transform_to_multi_head(q)
        k = self.transform_to_multi_head(k)
        v = self.transform_to_multi_head(v)
        
        attn_output, attn_weights = self.multi_head_scaled_dot_product(q,k,v,attention_mask)
            
        return attn_output, attn_weights

In [3]:
# temp_mha = MultiHeadAttention(emb_dim=512, num_heads=8)
# x = torch.rand(3, 45, 512)
# out, attn = temp_mha(query=x, key=x, attention_mask=None)
# print(out.size(), attn.size())

In [4]:
class PositionWiseFeedForward(nn.Module):
    
    def __init__(self,
                 emb_dim: int,
                 hid_dim: int,
                 drop_out: float = 0.1):
        '''Initialize position-wise feed forward network.
        
        Args:
            emb_dim (int): word embdding dimension.
            hid_dim (int): hidden dimesion.
            drop_out (float): drop out rate.
        '''
        super().__init__()
        self.linear_1 = nn.Linear(emb_dim, hid_dim)
        self.linear_2 = nn.Linear(hid_dim, emb_dim)
        self.activation = nn.ReLU()
        self.drop_out = drop_out
    
    def forward(self,
                 x: torch.Tensor):
        '''
        Args:
            x (Tensor): shape '(batch_size, seq_len, emb_dim)'
        
        Return:
            x (Tensor): shape '(batch_size, seq_len, emb_dim)'
        '''

        x = self.linear_1(x)
        x = self.activation(x)
        x = F.dropout(x, p=self.drop_out, training=self.training)
        
        x = self.linear_2(x)
        x = F.dropout(x, p=self.drop_out, training=self.training)
        
        return x

In [5]:
# temp_pwff = PositionWiseFeedForward(512, 256)
# x = torch.rand(3, 45, 512)
# out = temp_pwff(x)
# print(out.size())

In [6]:
class SinusoidalPositionalEncodedEmbedding(nn.Embedding):
    def __init__(self, 
                 max_position: int, 
                 embedding_dim: int):
        '''Initialize positional embedding.
        
        Args:
            max_position (int): maximum length of input sequence length.
                                 That is, it can encode position up to max_positions'th position.
            embedding_dim (int): embedding dimension.
        '''
        super().__init__(max_position, embedding_dim)
        self.weight = self._init_weight(self.weight)

    def _init_weight(self, initial_embedding_table: nn.Parameter):
        '''Make positional embedding table
        
        Args:
            initial_embedding_table (Parameter): initialized embedding table.
        
        Returns:
            pe (Parameter): position embedding table.
        
        '''
        max_pos, emb_dim = initial_embedding_table.shape
        pe = nn.Parameter(torch.zeros(max_pos, emb_dim))

        pos_id = torch.arange(0, max_pos).unsqueeze(1)
        freq = torch.pow(10000., -torch.arange(0, emb_dim, 2, dtype=torch.float) / emb_dim)
        pos_freq = pos_id * freq
        pe[:, 0::2] = torch.sin(pos_freq)
        pe[:, 1::2] = torch.cos(pos_freq)
        
        pe.detach_()
        
        return pe

    def forward(self, 
                input_ids: torch.Tensor):
        '''
        Args:
            input_ids (Tensor): shape '(batch_size, seq_len)'
        
        Return:
            Tensor : shape '(seq_len, emb_dim)'
        '''
        batch_size, seq_len = input_ids.shape[:2]
        positions = torch.arange(seq_len, dtype=torch.long, device=self.weight.device)
        return super().forward(positions)

In [7]:
# pos = SinusoidalPositionalEncodedEmbedding(512, 512)
# pos_encoding = pos(torch.zeros(2, 64, 512))
# print(pos.weight.shape)
# print(pos_encoding.shape)
# plt.pcolormesh(pos_encoding.numpy(), cmap='RdBu')
# plt.xlabel('Depth')
# plt.xlim((0, 512))
# plt.ylabel('Position')
# plt.colorbar()
# plt.show()

In [8]:
# config = easydict.EasyDict({
#     "emb_dim":6,
#     "ffn_dim":256,
#     "num_attention_heads":2,
#     "attention_drop_out":0.0,
#     "drop_out":0.2,
#     "max_position":512,
#     "num_encoder_layers":3,
#     "num_decoder_layers":3,
    
# })

In [9]:
class EncoderLayer(nn.Module):
    def __init__(self, 
                 config):
        '''Initialize encoder layer
        
        Args:
            config (Config): configuration parameters.
        '''
        
        super().__init__()

        self.drop_out = config.drop_out
        
        # self multi-head attention
        self.self_attn = MultiHeadAttention(emb_dim = config.emb_dim,
                                            num_heads = config.num_attention_heads,
                                            drop_out = config.attention_drop_out)                      
        self.attn_layer_norm = nn.LayerNorm(config.emb_dim)
        
        #position-wise feed forward
        self.position_wise_feed_forward = PositionWiseFeedForward(config.emb_dim,
                                                               config.ffn_dim,
                                                               config.drop_out)
        self.feed_forward_layer_norm = nn.LayerNorm(config.emb_dim)
    
    def forward(self, 
                x: torch.Tensor, 
                encoder_padding_mask: torch.Tensor):
        '''
        Args:
            x (Tensor): shape '(batch_size, src_len, emb_dim)'
            encoder_padding_mask (Tensor): binary BoolTensor. shape '(batch_size, src_len)'
            
        Returns:
            x (Tensor): encoded output. shape '(batch_size, src_len, emb_dim)'
            self_attn_weights: self attention socre
        '''
        residual = x
        x, self_attn_weights = self.self_attn(query=x, 
                                              key=x, 
                                              attention_mask=encoder_padding_mask)
        x = F.dropout(x, p=self.drop_out, training = self.training)
        x = self.attn_layer_norm(x + residual)
        
        residual = x
        x = self.position_wise_feed_forward(x)
        x = self.attn_layer_norm(x + residual)
        
#         clamping
        if x.isnan().any() or x.isinf().any():
            clamp_value = torch.finfo(x.dtype).max - 1000
            x = torch.clamp(x, min = -clamp_value, max = clamp_value)
        return x, self_attn_weights

In [10]:
# x = torch.rand(3, 12, 6)
# padding_mask = torch.zeros(3,12, dtype=torch.bool)
# padding_mask[0,6:] = True
# padding_mask[1, 3:] = True
# padding_mask[2, 10:] = True


In [11]:
# temp_el = EncoderLayer(config)
# out, attn_weights = temp_el(x, padding_mask)

# '''패딩마스크가 잘 들어가고 있는지 확인.'''
# print(attn_weights.shape)
# print(attn_weights)

In [12]:
# print(x.shape)
# print(x)

In [13]:
class Encoder(nn.Module):
    def __init__(self, 
                 config, 
                 embedding_table: nn.Embedding):
        '''Initialize stack of Encoder layers
        
        Args:
            config (Config):Configuration parameters.
            embedding_table (nn.Embedding): instance of nn.Embedding for Encoder input tokens.
                                            input tokens shape '(batch_size, src_len)'
                                            embedding table shape '(num_voca, emb_dim)'
        '''
        super().__init__()
        
        self.drop_out = config.drop_out
        
        self.embedding_table = embedding_table
        self.embed_positions = SinusoidalPositionalEncodedEmbedding(config.max_position,
                                                                    config.emb_dim)
        self.layers = nn.ModuleList([EncoderLayer(config) for _ in range(config.num_encoder_layers)])                     
        
    def forward(self, 
                input_indices: torch.Tensor, 
                padding_mask = None):
        '''
        Args:
            input_indices (Tensor): input to Encoder. shape '(batch_size, src_len)'
            padding_mask (Tensor): padding mask. shape '(batch_size, src_len)'
            
        Returns:
            x (Tensor): Encoder output. shape '(batch_size, src_len, emb_dim)'
            self_attn_scores (list): list of attention weights of each Encoder layer.
        '''
        
        inputs_embed = self.embedding_table(input_indices)
        pos_embed = self.embed_positions(input_indices)
        x = inputs_embed + pos_embed
        x = F.dropout(x, p = self.drop_out, training = self.training)
        
        self_attn_weights = []
        for encoder_layer in self.layers:
            x, attn_weights = encoder_layer(x, padding_mask)
            self_attn_weights.append(attn_weights.detach().clone())
        return x, self_attn_weights
        

In [14]:
# config = easydict.EasyDict({
#     "emb_dim":6,
#     "ffn_dim":256,
#     "num_attention_heads":2,
#     "attention_drop_out":0.0,
#     "drop_out":0.2,
#     "max_position":512,
#     "num_encoder_layers":3,
#     "num_decoder_layers":3,
    
# })

In [15]:
# x = torch.randint(0,9,(3,4))
# padding_mask = torch.zeros(3,4, dtype=torch.bool)
# padding_mask[0,1:] = True
# padding_mask[1,2:] = True
# padding_mask[2,3:] = True
# print(x)
# print(padding_mask)

In [16]:
# emb = nn.Embedding(10, 6)
# e = Encoder(config, emb)
# enc_out, attn_scores = e(x, padding_mask)
# print(enc_out.shape)
# print(enc_out)

In [17]:
# print(attn_scores[0].shape)
# print(attn_scores)

In [18]:
class DecoderLayer(nn.Module):
    def __init__(self, 
                 config):
        '''Initialize decoder layer
        
        Args:
            config (Config): configuration parameters.
        '''
        
        super().__init__()
        self.drop_out = config.drop_out
        
        # masked multi_head attention
        self.self_attn = MultiHeadAttention(emb_dim = config.emb_dim,
                                            num_heads = config.num_attention_heads,
                                            drop_out = config.attention_drop_out,
                                            causal = True)
        self.self_attn_layer_norm = nn.LayerNorm(config.emb_dim)
        
        # encoder-decoder attention
        self.enc_dec_attn = MultiHeadAttention(emb_dim = config.emb_dim,
                                                       num_heads = config.num_attention_heads,
                                                       drop_out = config.attention_drop_out,
                                                       encoder_decoder_attention = True)
        self.enc_dec_attn_layer_norm = nn.LayerNorm(config.emb_dim)
        
        #position-wise feed forward
        self.position_wise_feed_forward = PositionWiseFeedForward(config.emb_dim,
                                                               config.ffn_dim,
                                                               config.drop_out)
        self.feed_forward_layer_norm = nn.LayerNorm(config.emb_dim)
    
    def forward(self,
                x: torch.Tensor,
                encoder_output: torch.Tensor,
                enc_dec_attention_padding_mask: torch.Tensor = None,
                causal_mask: torch.Tensor = None):
        
        '''
        Args:
            x (Tensor): Input to decoder layer. shape '(batch_size, trg_len, emb_dim)'.
            encoder_output (Tensor): Output of encoder. shape '(batch_size, src_len, emb_dim)'
            enc_dec_attention_padding_mask (Tensor): Binary BoolTensor for masking padding of
                                                     encoder output.
                                                     shape '(batch_size, src_len)'.
            causal_mask (Tensor): Binary BoolTensor for masking future information in decoder.
                                  shape '(batch_size, trg_len)'
        
        Returns:
            x (Tensor): Output of decoder layer. shape '(batch_size, trg_len, emb_dim)'.
            self_attn_weights (Tensor): Masked self attention weights of decoder. 
                                        shape '(batch_size, trg_len, trg_len)'.
            enc_dec_attn_weights (Tensor): Encoder-decoder attention weights.
                                           shape '(batch_size, trg_len, src_len)'.
        '''
        
        # msked self attention
        residual = x
        x, self_attn_weights = self.self_attn(query = x,
                                              key = x,
                                              attention_mask = causal_mask)
        x = F.dropout(x, p = self.drop_out, training = self.training)
        x = self.self_attn_layer_norm(x + residual)
        
        # encoder-decoder attention
        residual = x
        x, enc_dec_attn_weights = self.enc_dec_attn(query = x,
                                                    key = encoder_output,
                                                    attention_mask = enc_dec_attention_padding_mask)
        x = F.dropout(x, p = self.drop_out, training = self.training)
        x = self.enc_dec_attn_layer_norm(x + residual)
        
        # position-wise feed forward
        residual = x
        x = self.position_wise_feed_forward(x)
        x = self.feed_forward_layer_norm(x + residual)
        
        return x, self_attn_weights, enc_dec_attn_weights

In [19]:
# config = easydict.EasyDict({
#     "emb_dim":6,
#     "ffn_dim":256,
#     "num_attention_heads":2,
#     "attention_drop_out":0.0,
#     "drop_out":0.2,
#     "max_position":512,
#     "num_encoder_layers":3,
#     "num_decoder_layers":3,
    
# })

In [20]:
# x = torch.rand(3, 5, 6) 
# '''batch = 3, trg_len = 5, emb_dim = 6'''
# padding_mask = torch.zeros(3,4, dtype=torch.bool)
# padding_mask[0,1:] = True
# padding_mask[1,1:] = True
# padding_mask[2,2:] = True
# padding_mask

In [21]:
# causal_mask = torch.zeros(5,5, dtype=torch.bool) # shape (trg_len, trg_len)
# causal_mask[0,1:] = True
# causal_mask[1,2:] = True
# causal_mask[2,3:] = True
# causal_mask[3,4:] = True

# causal_mask

In [22]:
# d = DecoderLayer(config)
# dec_out, self_attn_weigths, enc_dec_attn_weights = d(x,enc_out,padding_mask,causal_mask)

In [23]:
# print(dec_out.shape)
# dec_out

In [24]:
# print(self_attn_weigths.shape)
# self_attn_weigths

In [25]:
# print(enc_dec_attn_weights.shape) 
# '''(batch_size, num_head, trg_len, src_len)'''
# enc_dec_attn_weights

In [26]:
class Decoder(nn.Module):
    
    def __init__(self, 
                 config,
                 embedding_table: nn.Embedding):
        '''Initialize stack of Encoder layers
        
        Args:
            config (Config):Configuration parameters.
            embedding_table (nn.Embedding): instance of nn.Embedding for Decoder input tokens.
                                            input tokens shape '(batch_size, trg_len)'
                                            embedding table shape '(num_voca, emb_dim)'
        '''
        
        super().__init__()
        
        self.drop_out = config.drop_out
        
        self.embedding_table = embedding_table
        self.embed_positions = SinusoidalPositionalEncodedEmbedding(config.max_position,
                                                                    config.emb_dim)
        self.layers = nn.ModuleList([DecoderLayer(config) for _ in range(config.num_decoder_layers)])
    
    def forward(self,
                input_indices: torch.Tensor,
                encoder_output: torch.Tensor,
                enc_dec_attention_padding_mask: torch.Tensor = None,
                causal_mask: torch.Tensor = None):
        '''
        Args:
            input_indeces (Tensor): input to decoder. shape '(batch_size, trg_len)'
            encoder_output (Tensor): output of encoder. shape '(batch_size, src_len, emb_dim)'
            enc_dec_attention_padding_masl (Tensor): Binary BoolTensor for masking padding of
                                                     encoder output.
                                                     shape '(batch_size, src_len)'.
            causal_mask (Tensor): Binary BoolTensor for masking future information in decoder.
                                  shape '(batch_size, trg_len)'
        
        Returns:
            x (Tensor): output of decoder. shape '(batch_size, trg_len, emb_dim)'
            enc_dec_attn_weigths (list): list of enc-dec attention weights of each Decoder layer.
        '''
        
        inputs_embed = self.embedding_table(input_indices)
        pos_embed = self.embed_positions(input_indices)
        x = inputs_embed + pos_embed
        x = F.dropout(x, p = self.drop_out, training = self.training)
        
        enc_dec_attn_weights = []
        for decoder_layer in self.layers:
            x, _, attn_weights = decoder_layer(x, 
                                               encoder_output,
                                               enc_dec_attention_padding_mask,
                                               causal_mask)
            enc_dec_attn_weights.append(attn_weights.detach().clone())
        return x, enc_dec_attn_weights

In [27]:
# config = easydict.EasyDict({
#     "emb_dim":6,
#     "ffn_dim":256,
#     "num_attention_heads":2,
#     "attention_drop_out":0.0,
#     "drop_out":0.2,
#     "max_position":512,
#     "num_encoder_layers":3,
#     "num_decoder_layers":3,
    
# })

In [28]:
# x = torch.randint(0,9,(3,5)) # batch_size = 3  trg_len = 4
# padding_mask = torch.zeros(3,4, dtype=torch.bool)
# padding_mask[0,1:] = True
# padding_mask[1,1:] = True
# padding_mask[2,3:] = True
# padding_mask

In [29]:
# causal_mask = torch.zeros(5,5, dtype=torch.bool) # shape (trg_len, trg_len)
# causal_mask[0,1:] = True
# causal_mask[1,2:] = True
# causal_mask[2,3:] = True
# causal_mask[3,4:] = True

# causal_mask

In [30]:
# emb = nn.Embedding(10, 6)
# d = Decoder(config, emb)
# dec_out, enc_dec_attn_weights = d(x, enc_out, padding_mask, causal_mask)

In [31]:
# print(dec_out.shape)
# dec_out

In [32]:
# print(enc_dec_attn_weights[0].shape)
# enc_dec_attn_weights

In [67]:
class Transformer(nn.Module):
    def __init__(self, 
                 SRC: Field, 
                 TRG: Field, 
                 config):
        '''Initialize transformer
        
        Args:
            SRC (Field): source data class
            TRG (Field): target data class
            config (Config): configuration parameters.
        '''
        
        super().__init__()

        self.config = config
        self.SRC = SRC
        self.TRG = TRG
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        
        self.enc_embedding = nn.Embedding(len(SRC.vocab), 
                                          config.emb_dim,
                                          padding_idx = SRC.vocab.stoi['<pad>'])
        self.dec_embedding = nn.Embedding(len(TRG.vocab), 
                                          config.emb_dim,
                                          padding_idx = TRG.vocab.stoi['<pad>'])
        
        self.encoder = Encoder(config, self.enc_embedding)
        self.decoder = Decoder(config, self.dec_embedding)
        
        self.linear = nn.Linear(config.emb_dim, len(TRG.vocab))
        
#         self.init_weights()
        
    def init_weights(self):
        for name, param in self.named_parameters():
            if param.requires_grad:
                if 'weigth' in name:
                    nn.init.normal_(param.data, mean = 0, std = 0.01)
                else:
                    nn.init.constant_(param.data, 0)
    
#     def generate_mask(self, 
#                       src: torch.LongTensor, 
#                       trg: torch.LongTensor):
#         '''Generate padding mask and causal mask
        
#         Args:
#             src (LongTensor): input to encoder. shape '(batch_size, src_len)'
#             trg (LongTensor): input to decoder. shape '(batch_size, trg_len)'
        
#         Returns:
#             padding_mask (Tensor): shape '(batch_size, src_len)'
#             causal_mask (Tensor): shape '(trg_len, trg_len)'
#         '''
        
#         device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#         # padding mask
#         padding_mask = src.eq(self.SRC.vocab.stoi['<pad>']).to(device)
#         # causal mask
#         tmp = torch.ones(trg.size(1), trg.size(1), dtype = torch.bool)
# #         mask = torch.arange(tmp.size(-1))
# #         causal_mask = tmp.masked_fill_(mask < (mask + 1).view(tmp.size(-1), 1), False).to(device)
#         causal_mask = torch.tril(tmp,-1).transpose(0,1).contiguous().to(device)
#         return padding_mask, causal_mask
    
    def generate_causal_mask(self,  
                             trg: torch.LongTensor):
        '''Generate padding mask and causal mask
        
        Args:
            trg (LongTensor): input to decoder. shape '(batch_size, trg_len)'
        
        Returns:
            causal_mask (Tensor): shape '(trg_len, trg_len)'
        '''
        tmp = torch.ones(trg.size(1), trg.size(1), dtype = torch.bool)
        causal_mask = torch.tril(tmp,-1).transpose(0,1).contiguous().to(self.device)
        
        return causal_mask
    
    def generate_padding_mask(self, 
                              src: torch.LongTensor):
        '''Generate padding mask
        
        Args:
            src (LongTensor): input to encoder. shape '(batch_size, src_len)'
        
        Returns:
            padding_mask (Tensor): shape '(batch_size, src_len)'
        '''
        padding_mask = src.eq(self.SRC.vocab.stoi['<pad>']).to(self.device)
        
        return padding_mask
    
    def forward(self,
                src: torch.LongTensor,
                trg: torch.LongTensor):
        '''
        Args:
            src (LongTensor): input to encoder. shape '(batch_size, src_len)'
            trg (LongTensor): input to decoder. shape '(batch_size, trg_len)'
        
        Returns:
            output (Tensor): output of transformer. 
                             shape '(batch_size, trg_len, # trg vocab)'
            encoder_attn_weights (list): list of attention weights of each Encoder layer.
            enc_dec_attn_weights (list): list of enc-dec attention weights of each Decoder layer.
        '''
        
#         padding_mask, causal_mask = self.generate_mask(src, trg)
        
        padding_mask = self.generate_padding_mask(src)
        causal_mask = self.generate_causal_mask(trg)
        
        encoder_output, encoder_attn_weights = self.encoder(input_indices = src,
                                                            padding_mask = padding_mask)
        
        decoder_output, enc_dec_attn_weights = self.decoder(input_indices = trg,
                                                            encoder_output = encoder_output,
                                                            enc_dec_attention_padding_mask = padding_mask,
                                                            causal_mask = causal_mask)
        
        output = self.linear(decoder_output)
        
        return output, encoder_attn_weights, enc_dec_attn_weights
    
    def predict(self,
                src: torch.LongTensor):
        '''
        Args:
            src (LongTensor): input to encoder. shape '(batch_size, src_len)'
        
        Returns:
            output_tokens (LongTensor): predicted tokens. shape'(batch_size, max_position)'
        '''
        padding_mask = self.generate_padding_mask(src)
        
        encoder_output, _ = self.encoder(input_indices = src,
                                         padding_mask = padding_mask)
        output_tokens = (torch.ones((self.config.batch_size, self.config.max_position))\
                         * self.TRG.vocab.stoi['<pad>']).long().to(self.device) 
        ## (batch_size, max_position)
        output_tokens[:,0] = self.TRG.vocab.stoi['<sos>']
        for trg_index in range(1, self.config.max_position):
            trg = output_tokens[:,:trg_index] # (batch_size, trg_index)
            causal_mask = self.generate_causal_mask(trg) # (trg_index, trg_index)
            output, _ = self.decoder(input_indices = trg,
                                             encoder_output = encoder_output,
                                             enc_dec_attention_padding_mask = padding_mask,
                                             causal_mask = causal_mask) # (batch_size, trg_index, emb_dim)
            output = self.linear(output) # (batch_size, trg_index, # trg vocab)
            output = torch.argmax(output, dim = -1) # (batch_size, trg_index)
            output_tokens[:,trg_index] = output[:,-1]
        
        return output_tokens
        
        

In [53]:
config = easydict.EasyDict({
    "emb_dim":32,
    "ffn_dim":128,
    "num_attention_heads":2,
    "attention_drop_out":0.0,
    "drop_out":0.2,
    "max_position":256,
    "num_encoder_layers":3,
    "num_decoder_layers":3,
    'batch_size':64,
    'learning_rate':5e-4,
    'n_epochs':200,
    'gradient_clip':1
})

In [35]:
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

def prepare_data(batch_size):
    '''prepare data
    
    Args:
        batch_size (int): batch size.
        
    Returns:
        SRC (Field): source data Field class
        TRG (Field): target data Field class
        train_iterator (BucketIterator): training data iterator
        valid_iterator (BucketIterator): validation data iterator
        test_iterator (BucketIterator): test data iterator
    '''
    
    SRC = Field(tokenize = "spacy",
            tokenizer_language="de",
            init_token = '<sos>',
            eos_token = '<eos>',
            batch_first=True,
            lower = True)

    TRG = Field(tokenize = "spacy",
                tokenizer_language="en",
                init_token = '<sos>',
                eos_token = '<eos>',
                batch_first=True,
                lower = True)

    train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'),
                                                        fields = (SRC, TRG))

    SRC.build_vocab(train_data, min_freq = 2)
    TRG.build_vocab(train_data, min_freq = 2)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
        (train_data, valid_data, test_data),
        batch_size = batch_size,
        device = device,
        shuffle=True)
    
    data_loders = dict()
    data_loders['train'] = train_iterator
    data_loders['val'] = valid_iterator
    data_loders['test'] = test_iterator
    
    return SRC, TRG, data_loders

In [68]:
def get_network(SRC: Field,
                TRG: Field,
                config):
    '''Get network.
    
    Args:
        SRC (Field): source data Field class.
        TRG (Field): target data Field class.
        config (Config): configuration parameters.
    
    Returns:
        model (Module): transformer model.
        criterion (CrossEntropyLoss): loss function. 
        optimizer (Adam): optimizer.
    '''
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = Transformer(SRC, TRG, config).to(device)
    criterion = nn.CrossEntropyLoss(ignore_index = TRG.vocab.stoi['<pad>'])
    optimizer = optim.Adam(model.parameters(),lr = config.learning_rate)
    
    return model, criterion, optimizer

In [44]:
def train(model: nn.Module,
          data_loaders: dict,
          criterion,
          optimizer,
          config):
    '''Training model
    
    Args:
        model (nn.Module): transformer model.
        data_loaders (dict): training/validation data iterator.
        criterion : loss function. 
        optimizer : optimizer.
        config (Config): configuration parameters.
    '''
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    
    
    print_loss_every = 1
    for epoch in range(config.n_epochs): 
        for phase in ['train', 'val']:
            
            if phase == 'train':
                model.train()
            else:
                model.eval()
                
            loss_val_sum = 0
            
            for batch in data_loaders[phase]:
                
                optimizer.zero_grad()
 
                src = batch.src.to(device)
                trg = batch.trg.to(device)
                
                with torch.set_grad_enabled(phase == 'train'):
                    output, _, _ = model(src, trg)

                    output = output[:,:-1,:].reshape(-1, output.shape[-1])
                    trg = trg[:,1:].reshape(-1)

                    loss = criterion(output, trg)
                    
                    if phase == 'train':
                        loss.backward()
                    # gradient clipping
#                         torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
                        optimizer.step()

                loss_val_sum += loss

            if ((epoch % print_loss_every) == 0) or (epoch == (config.n_epochs - 1)):
                loss_val_avg = loss_val_sum / len(data_loders[phase])
                print(
                    f"epoch:[{epoch+1}/{config.n_epochs}] {phase} cost:[{loss_val_avg:.3f}]"
                )
    print('training done!!')
    
    return model

In [73]:
import time
import os
def save_model(model: nn.Module,
               optimizer,
               epoch: int):
    if not os.path.isdir('ckpt'):
        os.mkdir('ckpt')
    state = {'epoch' : epoch, 
            'model' : model.state_dict(),
            'optimizer' : optimizer.state_dict()}
    now = time.localtime()
    path = 'ckpt/' + f"{now.tm_year}-{now.tm_mon}-{now.tm_mday}_{now.tm_hour}_{now.tm_min}_{now.tm_sec}" + '.pt'

    torch.save(state, path)

In [45]:
SRC, TRG, data_loaders = prepare_data(config.batch_size)

In [71]:
model, criterion, optimizer = get_network(SRC, TRG, config)

In [72]:
model = train(model, data_loaders, criterion, optimizer, config)


epoch:[1/200] train cost:[11.415]
epoch:[1/200] val cost:[9.094]
epoch:[2/200] train cost:[8.869]
epoch:[2/200] val cost:[7.890]
epoch:[3/200] train cost:[8.138]
epoch:[3/200] val cost:[7.329]
epoch:[4/200] train cost:[7.714]
epoch:[4/200] val cost:[6.919]
epoch:[5/200] train cost:[7.420]
epoch:[5/200] val cost:[6.668]
epoch:[6/200] train cost:[7.197]
epoch:[6/200] val cost:[6.454]
epoch:[7/200] train cost:[7.002]
epoch:[7/200] val cost:[6.274]
epoch:[8/200] train cost:[6.840]
epoch:[8/200] val cost:[6.103]
epoch:[9/200] train cost:[6.689]
epoch:[9/200] val cost:[5.968]
epoch:[10/200] train cost:[6.558]
epoch:[10/200] val cost:[5.871]
epoch:[11/200] train cost:[6.435]
epoch:[11/200] val cost:[5.736]
epoch:[12/200] train cost:[6.324]
epoch:[12/200] val cost:[5.630]
epoch:[13/200] train cost:[6.227]
epoch:[13/200] val cost:[5.531]
epoch:[14/200] train cost:[6.133]
epoch:[14/200] val cost:[5.456]
epoch:[15/200] train cost:[6.042]
epoch:[15/200] val cost:[5.394]
epoch:[16/200] train cost:[

epoch:[125/200] train cost:[3.637]
epoch:[125/200] val cost:[3.673]
epoch:[126/200] train cost:[3.629]
epoch:[126/200] val cost:[3.653]
epoch:[127/200] train cost:[3.630]
epoch:[127/200] val cost:[3.651]
epoch:[128/200] train cost:[3.618]
epoch:[128/200] val cost:[3.658]
epoch:[129/200] train cost:[3.605]
epoch:[129/200] val cost:[3.676]
epoch:[130/200] train cost:[3.602]
epoch:[130/200] val cost:[3.640]
epoch:[131/200] train cost:[3.594]
epoch:[131/200] val cost:[3.641]
epoch:[132/200] train cost:[3.595]
epoch:[132/200] val cost:[3.630]
epoch:[133/200] train cost:[3.581]
epoch:[133/200] val cost:[3.619]
epoch:[134/200] train cost:[3.566]
epoch:[134/200] val cost:[3.616]
epoch:[135/200] train cost:[3.564]
epoch:[135/200] val cost:[3.639]
epoch:[136/200] train cost:[3.564]
epoch:[136/200] val cost:[3.627]
epoch:[137/200] train cost:[3.555]
epoch:[137/200] val cost:[3.641]
epoch:[138/200] train cost:[3.546]
epoch:[138/200] val cost:[3.649]
epoch:[139/200] train cost:[3.544]
epoch:[139/20

NameError: name 'os' is not defined

In [74]:
save_model(model, optimizer, config.n_epochs)

In [86]:
def index_to_sentence(indeces, vocab):
    res = []
    eos = vocab.stoi['<eos>']
    for i in indeces:
        if i == eos:
            res.append(vocab.itos[i])
            break
        else:
            res.append(vocab.itos[i])
    return ' '.join(res)
    

def test_model(model, data_loaders, SRC, TRG):
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for batch in data_loaders['test']:
        src = batch.src.to(device)
        trg = batch.trg.to(device)
        predicted = model.predict(src)
        
        for ins in range(predicted.size(0)):
            print(f'source : {index_to_sentence(src[ins],SRC.vocab)}')
            print(f'target : {index_to_sentence(trg[ins], TRG.vocab)}')
            print(f'predic : {index_to_sentence(predicted[ins], TRG.vocab)}')
            print('*********************************************')

                  
        
#             print([index_to_sentence(src[ins],SRC.vocab),\
#                    index_to_sentence(trg[ins], TRG.vocab),\
#                    index_to_sentence(predicted[ins], TRG.vocab)])
        
        break

In [87]:
test_model(model, data_loaders, SRC, TRG)

source : <sos> ein cowboy <unk> seinen arm . <eos>
target : <sos> a cowboy wrapping up his arm with a bandage . <eos>
predic : <sos> a cowboy hat his arm . <eos>
*********************************************
source : <sos> der afroamerikaner protestiert gegen <unk> <unk> . <eos>
target : <sos> the african american man <unk> against <unk> <unk> . <eos>
predic : <sos> the african american man is protesting against pollution . <eos>
*********************************************
source : <sos> ein afroamerikaner geht die straße hinunter . <eos>
target : <sos> an african american man walking down the street . <eos>
predic : <sos> an african american man walking down the street . <eos>
*********************************************
source : <sos> ein junger mann wirft einen football . <eos>
target : <sos> a young man about to throw a football . <eos>
predic : <sos> a young man throws a football ball . <eos>
*********************************************
source : <sos> eine gruppe macht tricks 

predic : <sos> firefighters are coming out of a subway . <eos>
*********************************************
source : <sos> ein mann verwendet <unk> geräte . <eos>
target : <sos> a man is using electronic equipment . <eos>
predic : <sos> a man uses cupcakes . <eos>
*********************************************
source : <sos> ein mann an seinem hochzeitstag . <eos>
target : <sos> a man on his wedding day . <eos>
predic : <sos> a man on his wedding day . <eos>
*********************************************
source : <sos> hunde laufen auf einer hunderennbahn . <eos>
target : <sos> dogs run at a dog racetrack . <eos>
predic : <sos> dogs are running on a dog . <eos>
*********************************************
source : <sos> ein am strand <unk> auto . <eos>
target : <sos> a car parked at the beach . <eos>
predic : <sos> a <unk> man on the beach . <eos>
*********************************************
source : <sos> ein kind planscht im wasser . <eos>
target : <sos> a child is splashing in the 

In [57]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model.to(device)
for batch in data_loaders['test']:
    
#     src = batch.src[0,:].unsqueeze(0).to(device)
    src = batch.src.to(device)
#     trg = batch.trg.to(device)
    
    p = model.predict(src)
    break
# batch.src[0,:].unsqueeze(0)
p

tensor([[2, 4, 4,  ..., 4, 4, 4],
        [2, 4, 4,  ..., 4, 4, 4],
        [2, 4, 4,  ..., 4, 4, 4],
        ...,
        [2, 4, 4,  ..., 4, 4, 4],
        [2, 4, 4,  ..., 4, 4, 4],
        [2, 4, 4,  ..., 4, 4, 4]], device='cuda:0')

In [58]:
p[0]

tensor([2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4], device='cuda:0')

In [75]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model.to(device)
for batch in data_loaders['test']:
    
#     src = batch.src[0,:].unsqueeze(0).to(device)
    src = batch.src.to(device)
#     trg = batch.trg.to(device)
    
    p = model.predict(src)
    break
# batch.src[0,:].unsqueeze(0)
p

tensor([[  2,   4, 429,  ...,   3,   5,   3],
        [  2,   7, 324,  ...,   5,   3,   5],
        [  2,  21, 324,  ...,   5,   3,   5],
        ...,
        [  2,   4,  14,  ...,   5,   3,   5],
        [  2,  24,  14,  ...,   3,   5,   3],
        [  2,  16,  30,  ...,   3,   5,   3]], device='cuda:0')

In [76]:
p1 = p[0]
p1

tensor([  2,   4, 429,  67,  27, 394,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,   3,
          5,   3,   5,   3,   5,   3,   5,   3,   5,   3,   5,  

In [78]:
print(TRG.vocab.itos[4])
print(TRG.vocab.itos[429])
print(TRG.vocab.itos[67])
print(TRG.vocab.itos[27])
print(TRG.vocab.itos[394])
print(TRG.vocab.itos[5])
print(TRG.vocab.itos[3])

a
cowboy
hat
his
arm
.
<eos>


In [63]:
i = 0
for param_tensor in model.state_dict():
    print(param_tensor, '\t', model.state_dict()[param_tensor])
    i+=1
    if i == 3:
        
        break

enc_embedding.weight 	 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
dec_embedding.weight 	 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')
encoder.embedding_table.weight 	 tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0')


In [69]:
model2, criterion2, optimizer2 = get_network(SRC, TRG, config)

In [79]:
i = 0
for param_tensor in model.state_dict():
    print(param_tensor, '\t', model.state_dict()[param_tensor])
    i+=1
    if i == 3:
        
        break

enc_embedding.weight 	 tensor([[ 0.2050, -0.1782, -0.3098,  ..., -1.4686, -0.3315, -1.2724],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4540, -1.2224, -0.4631,  ..., -1.2043, -1.1045, -0.4408],
        ...,
        [ 0.1277,  0.1854,  0.1556,  ..., -0.7096,  0.1593, -1.2922],
        [-0.9370, -1.2006,  0.4956,  ..., -1.7500, -0.4906, -1.7067],
        [-0.6994,  0.7447, -0.7396,  ..., -1.4942, -0.8825,  0.4950]],
       device='cuda:0')
dec_embedding.weight 	 tensor([[ 0.5049, -0.4566,  0.4133,  ...,  2.3915, -0.3424, -0.3108],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1504,  1.7340, -0.0179,  ..., -1.4123, -0.4329, -1.1875],
        ...,
        [ 1.1088, -3.1266, -0.5490,  ..., -1.1389, -0.1900,  1.8774],
        [ 0.6562, -0.1499,  1.0171,  ...,  1.3236,  0.1412,  0.5640],
        [ 1.1741,  0.7020,  0.0589,  ..., -1.0486,  0.4446, -0.2674]],
       device='cuda:0')
encoder.embedding_table.weight 	 tenso

In [58]:
!nvidia-smi

Tue Mar 02 17:17:25 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 456.71       Driver Version: 456.71       CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 166... WDDM  | 00000000:26:00.0  On |                  N/A |
| 30%   38C    P8    11W / 125W |   5367MiB /  6144MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|       

In [28]:
import math
import time
from tqdm import tqdm

best_valid_loss = float('inf')
def train(model: nn.Module,
          iterator: BucketIterator,
          optimizer: optim.Optimizer,
          criterion: nn.Module,
          clip: float):

    model.train()

    epoch_loss = 0

    for idx, batch in enumerate(iterator):

        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()

        output, enc_attention_scores, _ = model(src, trg)

        output = output[:,:-1,:].reshape(-1, output.shape[-1])
        trg = trg[:,1:].reshape(-1)

        loss = criterion(output, trg)

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)


def evaluate(model: nn.Module,
             iterator: BucketIterator,
             criterion: nn.Module):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for _, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output, attention_score, _ = model(src, trg) #turn off teacher forcing

            output = output[:,:-1,:].reshape(-1, output.shape[-1])
            trg = trg[:,1:].reshape(-1)

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)

for epoch in tqdm(range(config.n_epochs), total=config.n_epochs):
    train_loss = train(model, data_loders['train'], optimizer, criterion, config.gradient_clip)
    valid_loss = evaluate(model, data_loders['val'], criterion)
    
#     if best_valid_loss < valid_loss:
#         break
#     else:
#         best_valid_loss = valid_loss

    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

test_loss = evaluate(model, data_loders['test'], criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

  1%|▊                                                                                 | 1/100 [00:09<15:25,  9.35s/it]

	Train Loss: 8.604 | Train PPL: 5453.463
	 Val. Loss: 8.524 |  Val. PPL: 5035.597


  2%|█▋                                                                                | 2/100 [00:18<15:16,  9.35s/it]

	Train Loss: 8.451 | Train PPL: 4681.116
	 Val. Loss: 8.371 |  Val. PPL: 4321.222


  3%|██▍                                                                               | 3/100 [00:27<15:05,  9.33s/it]

	Train Loss: 8.302 | Train PPL: 4033.552
	 Val. Loss: 8.222 |  Val. PPL: 3722.213


  4%|███▎                                                                              | 4/100 [00:37<14:54,  9.32s/it]

	Train Loss: 8.157 | Train PPL: 3488.838
	 Val. Loss: 8.077 |  Val. PPL: 3218.466


  4%|███▎                                                                              | 4/100 [00:44<17:53, 11.19s/it]


KeyboardInterrupt: 

In [24]:
i = 0
for param_tensor in model.state_dict():
    i+= 1
    print(param_tensor, '\t', model.state_dict()[param_tensor].size())
print(i)

enc_embedding.weight 	 torch.Size([7854, 64])
dec_embedding.weight 	 torch.Size([5893, 64])
encoder.embedding_table.weight 	 torch.Size([7854, 64])
encoder.embed_positions.weight 	 torch.Size([512, 64])
encoder.layers.0.self_attn.wk.weight 	 torch.Size([64, 64])
encoder.layers.0.self_attn.wq.weight 	 torch.Size([64, 64])
encoder.layers.0.self_attn.wv.weight 	 torch.Size([64, 64])
encoder.layers.0.self_attn.output.weight 	 torch.Size([64, 64])
encoder.layers.0.attn_layer_norm.weight 	 torch.Size([64])
encoder.layers.0.attn_layer_norm.bias 	 torch.Size([64])
encoder.layers.0.position_wise_feed_forward.linear_1.weight 	 torch.Size([256, 64])
encoder.layers.0.position_wise_feed_forward.linear_1.bias 	 torch.Size([256])
encoder.layers.0.position_wise_feed_forward.linear_2.weight 	 torch.Size([64, 256])
encoder.layers.0.position_wise_feed_forward.linear_2.bias 	 torch.Size([64])
encoder.layers.0.feed_forward_layer_norm.weight 	 torch.Size([64])
encoder.layers.0.feed_forward_layer_norm.bias 	

In [None]:
def evaluate(model: nn.Module,
             data_loders: dict,
             criterion,
             optimizer,
             config):