<a href="https://colab.research.google.com/github/eisbetterthanpi/pytorch/blob/main/transformer_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb
# https://colab.research.google.com/github/bentrevett/pytorch-seq2seq/blob/master/6%20-%20Attention%20is%20All%20You%20Need.ipynb
# https://www.mihaileric.com/posts/transformers-attention-in-disguise/
# https://jalammar.github.io/illustrated-transformer/
# http://nlp.seas.harvard.edu/2018/04/03/attention.html

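# position embedding has a "vocabulary" size of 100, model can accept sentences up to 100 tokens long
# we use a learned positional encoding instead of a static one
# NOTE(review): the two lines above appear to describe the referenced tutorial, not this notebook —
# the PositionalEncoder classes below use a fixed sinusoidal encoding with max_seq_length=512. Confirm and update.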
# we use the standard Adam optimizer with a static learning rate instead of one with warm-up and cool-down steps
# we do not use label smoothing


In [None]:
# @title setup

# https://pytorch.org/tutorials/beginner/translation_transformer.html
# https://colab.research.google.com/github/pytorch/tutorials/blob/gh-pages/_downloads/c64c91cf87c13c0e83586b8e66e4d74e/translation_transformer.ipynb

# https://github.com/pytorch/data
%pip install portalocker
%pip install torchdata

# Create source and target language tokenizer. Make sure to install the dependencies.
!pip install -U torchdata
!pip install -U spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm


In [2]:
# @title data

from torchtext.datasets import multi30k, Multi30k
# modify the URLs for the dataset since the links to the original dataset are broken https://github.com/pytorch/text/issues/1756#issuecomment-1163664163
# (this patches torchtext's module-level URL table, so it must run before any Multi30k(...) call)
multi30k.URL["train"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/training.tar.gz"
multi30k.URL["valid"] = "https://raw.githubusercontent.com/neychev/small_DL_repo/master/datasets/Multi30k/validation.tar.gz"

# Translation direction: German source -> English target.
SRC_LANGUAGE = 'de'
TGT_LANGUAGE = 'en'

# spaCy-backed tokenizers; the language models are downloaded in the setup cell.
from torchtext.data.utils import get_tokenizer
de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')


# Indices of the special tokens; the order of `special_symbols` must match these indices
# because the vocabularies are built with specials placed first.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3 # unknown, pad, beginning, end of sentence
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

from torchtext.vocab import build_vocab_from_iterator

# Tokenise the training split once per language (two passes over the dataset) and
# build one vocabulary per language from the resulting token lists.
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
de_tokens = [de_tokenizer(de_text) for de_text, _ in train_iter]
en_tokens = [en_tokenizer(en_text) for _, en_text in train_iter]

de_vocab = build_vocab_from_iterator(
    de_tokens, min_freq=1, specials=special_symbols, special_first=True,
)
en_vocab = build_vocab_from_iterator(
    en_tokens, min_freq=1, specials=special_symbols, special_first=True,
)

# Out-of-vocabulary tokens map to <unk> instead of raising.
de_vocab.set_default_index(UNK_IDX)
en_vocab.set_default_index(UNK_IDX)

import torch

def de_transform(o):
    """Convert a raw German sentence into a 1-D LongTensor of vocabulary ids.

    The sentence is tokenised with `de_tokenizer`, mapped through `de_vocab`, and
    wrapped in BOS_IDX / EOS_IDX.

    Args:
        o: raw sentence string.

    Returns:
        torch.LongTensor of shape [len(tokens) + 2].
    """
    token_ids = de_vocab(de_tokenizer(o))
    # Build a single integer tensor: torch.tensor([]) defaults to float32, so concatenating
    # an empty token list with the BOS/EOS tensors would promote the result to float and
    # break the embedding lookup downstream.
    return torch.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=torch.long)

def en_transform(o):
    """Convert a raw English sentence into a 1-D LongTensor of vocabulary ids.

    The sentence is tokenised with `en_tokenizer`, mapped through `en_vocab`, and
    wrapped in BOS_IDX / EOS_IDX.

    Args:
        o: raw sentence string.

    Returns:
        torch.LongTensor of shape [len(tokens) + 2].
    """
    token_ids = en_vocab(en_tokenizer(o))
    # Build a single integer tensor: torch.tensor([]) defaults to float32, so concatenating
    # an empty token list with the BOS/EOS tensors would promote the result to float and
    # break the embedding lookup downstream.
    return torch.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=torch.long)


from torch.nn.utils.rnn import pad_sequence
# function to collate data samples into batch tensors
def collate_fn(batch):
    """Turn a list of (german, english) raw-string pairs into two padded id tensors.

    Trailing newlines are stripped before each sentence is passed to its transform.
    Both outputs are batch-first and right-padded with PAD_IDX to the longest
    sequence in the batch.

    Returns:
        (src_batch, tgt_batch) tuple of tensors.
    """
    src_seqs = [de_transform(src_text.rstrip("\n")) for src_text, _ in batch]
    tgt_seqs = [en_transform(tgt_text.rstrip("\n")) for _, tgt_text in batch]
    padded_src = pad_sequence(src_seqs, batch_first=True, padding_value=PAD_IDX)
    padded_tgt = pad_sequence(tgt_seqs, batch_first=True, padding_value=PAD_IDX)
    return padded_src, padded_tgt


# Fix the RNG seed so weight initialisation and dropout are repeatable.
torch.manual_seed(0)

batch_size = 128 # 128

# Fresh dataset iterators for the train and validation splits.
train_iter = Multi30k(split='train', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))
val_iter = Multi30k(split='valid', language_pair=(SRC_LANGUAGE, TGT_LANGUAGE))

# Each loader tokenises, numericalises and pads its batches through collate_fn.
train_loader = torch.utils.data.DataLoader(
    train_iter, batch_size=batch_size, collate_fn=collate_fn,
)
val_loader = torch.utils.data.DataLoader(
    val_iter, batch_size=batch_size, collate_fn=collate_fn,
)

# vocab_transform = {SRC_LANGUAGE:de_vocab, TGT_LANGUAGE:en_vocab}
# text_transform = {SRC_LANGUAGE:de_transform, TGT_LANGUAGE:en_transform}




In [None]:
# @title 6att down
import torch
import torch.nn as nn
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

import math
class PositionalEncoder(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings, then apply dropout.

    Even feature columns hold sin(pos * freq) and odd columns cos(pos * freq).
    The table is precomputed once and stored as the non-trainable buffer `pe`
    of shape [1, max_seq_length, emb_dim].

    Args:
        emb_dim: width of the embeddings (last dimension of the input); odd widths are supported.
        max_seq_length: number of positions precomputed in the table.
        dropout: dropout probability applied to the summed output.
    """
    def __init__(self, emb_dim, max_seq_length=512, dropout=0.1):
        super(PositionalEncoder, self).__init__()
        self.emb_dim = emb_dim
        self.drop = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_length, emb_dim)
        pos = torch.arange(0, max_seq_length).unsqueeze(1) # https://nlp.seas.harvard.edu/annotated-transformer/
        div_term = torch.exp(torch.arange(0, emb_dim, 2, dtype=torch.float) * -(math.log(10000.0) / emb_dim))
        pe[:, 0::2] = torch.sin(pos * div_term)
        # For an odd emb_dim there is one fewer odd column than even column, so trim the
        # frequency vector; for an even emb_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(pos * div_term[: emb_dim // 2])
        pe = pe.unsqueeze(0) # [1, seq_len, d_model] # batch_first
        self.register_buffer('pe', pe)

    def forward(self, x): # x [batch_size, seq_len, d_model]
        # Slice the table to the sequence length of x; it broadcasts over the batch dimension.
        return self.drop(x + self.pe[:, : x.size(1)]) # batch_first

class MHA(nn.Module):
    """Multi-head scaled dot-product attention (batch-first).

    Args:
        d_model: model width; must be divisible by n_heads.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
    """
    def __init__(self, d_model, n_heads, dropout):
        super().__init__()
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)
        self.drop = nn.Dropout(dropout)
        # Plain Python float rather than a tensor created on a global device: a tensor
        # attribute is neither a parameter nor a buffer, so it would not follow the module
        # through .to(...) and would tie this class to a module-level `device` variable.
        self.scale = math.sqrt(self.head_dim)

    def forward(self, query, key, value, mask = None):
        """Attend from `query` over `key`/`value`.

        Args:
            query: [batch size, query len, hid dim]
            key:   [batch size, key len, hid dim]
            value: [batch size, value len, hid dim]
            mask: optional tensor broadcastable to [batch size, n heads, query len, key len];
                positions where mask == 0 are excluded from attention.

        Returns:
            (x, attention): x is [batch size, query len, hid dim]; attention holds the
            softmax weights (before dropout), [batch size, n heads, query len, key len].
        """
        batch_size = query.shape[0] # batch_first
        Q = self.q(query) #Q = [batch size, query len, hid dim]
        K = self.k(key) #K = [batch size, key len, hid dim]
        V = self.v(value) #V = [batch size, value len, hid dim]
        # Split the feature dimension into heads and move the head axis in front of the sequence axis.
        Q = Q.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2) #Q = [batch size, n heads, query len, head dim]
        K = K.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2) #K = [batch size, n heads, key len, head dim]
        V = V.view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2) #V = [batch size, n heads, value len, head dim]
        # scaled dot-product attention
        energy = torch.matmul(Q, K.transpose(2,3)) / self.scale #energy = [batch size, n heads, query len, key len]
        if mask is not None:
            # A large negative score drives the softmax weight of masked positions to ~0.
            energy = energy.masked_fill(mask == 0, -1e10)
        attention = torch.softmax(energy, dim = -1) #attention = [batch size, n heads, query len, key len]
        x = torch.matmul(self.drop(attention), V) #x = [batch size, n heads, query len, head dim]
        # Merge the heads back into a single feature dimension.
        x = x.transpose(1, 2).contiguous() #x = [batch size, query len, n heads, head dim]
        x = x.view(batch_size, -1, self.d_model) #x = [batch size, query len, hid dim]
        x = self.out(x) #x = [batch size, query len, hid dim]
        return x, attention

class EncoderLayer(nn.Module):
    """One post-norm encoder block: self-attention, then a position-wise feed-forward net.

    Each sub-layer output goes through dropout, is added to its input (residual),
    and the sum is layer-normalised.
    """
    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.self_attn = MHA(d_model, n_heads, dropout)
        feed_forward = [
            nn.Linear(d_model, ff_dim),
            nn.ReLU(), # ReLU GELU
            nn.Dropout(dropout),
            nn.Linear(ff_dim, d_model),
        ]
        self.ff = nn.Sequential(*feed_forward)
        self.drop = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """src: [batch size, src len, hid dim]; src_mask: [batch size, 1, 1, src len]."""
        # Sub-layer 1: self-attention (the attention weights are discarded).
        attended, _ = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.drop(attended))
        # Sub-layer 2: feed-forward.
        transformed = self.ff(src)
        src = self.norm2(src + self.drop(transformed))
        return src

class Encoder(nn.Module):
    """Stack of `n_layers` EncoderLayer blocks followed by a final LayerNorm."""
    def __init__(self, d_model, n_layers, n_heads, ff_dim, dropout):
        super().__init__()
        blocks = []
        for _ in range(n_layers):
            blocks.append(EncoderLayer(d_model, n_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, src, src_mask):
        """Run `src` ([batch size, src len, hid dim]) through every block, then normalise."""
        hidden = src
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return self.norm(hidden)

class DecoderLayer(nn.Module):
    """One post-norm decoder block: masked self-attention, encoder attention, feed-forward.

    Each sub-layer output goes through dropout, is added to its input (residual),
    and the sum is layer-normalised.
    """
    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.self_attn = MHA(d_model, n_heads, dropout)
        self.enc_attn = MHA(d_model, n_heads, dropout)
        feed_forward = [
            nn.Linear(d_model, ff_dim),
            nn.ReLU(), # ReLU GELU
            nn.Dropout(dropout),
            nn.Linear(ff_dim, d_model),
        ]
        self.ff = nn.Sequential(*feed_forward)
        self.drop = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """
        trg:      [batch size, trg len, hid dim]
        enc_src:  [batch size, src len, hid dim]
        trg_mask: [batch size, 1, trg len, trg len]
        src_mask: [batch size, 1, 1, src len]
        Returns the updated trg, [batch size, trg len, hid dim].
        """
        # Sub-layer 1: self-attention over the target sequence.
        self_out, _ = self.self_attn(trg, trg, trg, trg_mask)
        trg = self.norm1(trg + self.drop(self_out))
        # Sub-layer 2: attention over the encoder output (queries come from the target).
        cross_out, _ = self.enc_attn(trg, enc_src, enc_src, src_mask)
        trg = self.norm2(trg + self.drop(cross_out))
        # Sub-layer 3: feed-forward.
        ff_out = self.ff(trg)
        trg = self.norm3(trg + self.drop(ff_out))
        return trg

class Decoder(nn.Module):
    """Stack of `n_layers` DecoderLayer blocks followed by a final LayerNorm."""
    def __init__(self, d_model, n_layers, n_heads, ff_dim, dropout):
        super().__init__()
        blocks = []
        for _ in range(n_layers):
            blocks.append(DecoderLayer(d_model, n_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)
        self.norm = nn.LayerNorm(d_model)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Run `trg` through every block (each attending to `enc_src`), then normalise."""
        hidden = trg
        for block in self.layers:
            hidden = block(hidden, enc_src, trg_mask, src_mask)
        return self.norm(hidden)

class Seq2Seq(nn.Module):
    """Encoder-decoder transformer for token-id sequences (batch-first).

    Args:
        in_dim: source vocabulary size.
        out_dim: target vocabulary size (also the width of the output logits).
        d_model: embedding / hidden width.
        nhead: number of attention heads.
        enc_layers, dec_layers: number of encoder / decoder blocks.
        ff_dim: hidden width of the feed-forward sub-layers.
        dropout: dropout probability used throughout.
    """
    def __init__(self, in_dim, out_dim, d_model = 512, nhead = 8, enc_layers = 3, dec_layers = 3, ff_dim = 512, dropout = 0.1):
        super().__init__()
        self.encoder = Encoder(d_model, enc_layers, nhead, ff_dim, dropout)
        self.decoder = Decoder(d_model, dec_layers, nhead, ff_dim, dropout)
        self.pos_enc = PositionalEncoder(d_model, dropout=dropout)
        self.src_tok_emb = nn.Embedding(in_dim, d_model)
        self.trg_tok_emb = nn.Embedding(out_dim, d_model)
        self.d_model = d_model
        self.fc_out = nn.Linear(d_model, out_dim)

        for p in self.parameters(): # must be at the end of __init__
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def make_src_mask(self, src): #src = [batch size, src len]
        """Padding mask for the source: True where the token is not <pad>.

        The mask is created on the same device as `src`, so it does not depend on a
        module-level `device` variable.
        Returns a bool tensor of shape [batch size, 1, 1, src len].
        """
        return (src != PAD_IDX).unsqueeze(1).unsqueeze(2)

    def make_trg_mask(self, trg): #trg = [batch size, trg len]
        """Target mask: padding mask AND lower-triangular (subsequent) mask.

        Position n may only attend to positions <= n, and never to <pad> tokens.
        Returns a bool tensor of shape [batch size, 1, trg len, trg len] on trg's device.
        """
        trg_pad_mask = (trg != PAD_IDX).unsqueeze(1).unsqueeze(2) #trg_pad_mask = [batch size, 1, 1, trg len]
        trg_len = trg.shape[1] # batch_first
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool() #trg_sub_mask = [trg len, trg len]
        return trg_pad_mask & trg_sub_mask

    def forward(self, src, trg, src_mask=None, trg_mask=None): #src = [batch size, src len] #trg = [batch size, trg len]
        """Return logits of shape [batch size, trg len, output dim].

        `src_mask` / `trg_mask` are optional: when omitted they are derived from the
        token ids as before. Accepting them keeps this class call-compatible with code
        that builds the masks externally and calls model(src, trg, src_mask, trg_mask).
        """
        if src_mask is None:
            src_mask = self.make_src_mask(src) #src_mask = [batch size, 1, 1, src len]
        if trg_mask is None:
            trg_mask = self.make_trg_mask(trg) #trg_mask = [batch size, 1, trg len, trg len]

        # Embeddings are scaled by sqrt(d_model) before the positional encoding is added.
        src = self.pos_enc(self.src_tok_emb(src.long()) * math.sqrt(self.d_model))
        trg = self.pos_enc(self.trg_tok_emb(trg.long()) * math.sqrt(self.d_model))

        enc_src = self.encoder(src, src_mask) #enc_src = [batch size, src len, hid dim]
        trg = self.decoder(trg, enc_src, trg_mask, src_mask) #output = [batch size, trg len, hid dim]
        trg = self.fc_out(trg) #output = [batch size, trg len, output dim]
        return trg

    def encode(self, src, src_mask):
        """Embed and encode source token ids; returns the encoder output."""
        return self.encoder(self.pos_enc(self.src_tok_emb(src.long()) * math.sqrt(self.d_model)), src_mask)

    def decode(self, trg, memory, trg_mask, src_mask):
        """Embed target token ids, decode against `memory`, and project to vocabulary logits."""
        trg = self.decoder(self.pos_enc(self.trg_tok_emb(trg.long()) * math.sqrt(self.d_model)), memory, trg_mask, src_mask)
        return self.fc_out(trg)

# Vocabulary sizes determine the embedding table sizes and the output logit width.
in_dim = len(de_vocab)
out_dim = len(en_vocab)
# Hyperparameters: defined once here and passed to the constructor below, so that
# editing a value actually changes the model (previously the literals were repeated).
d_model=512 # hid_dim
nhead=8
num_encoder_layers=3
num_decoder_layers=3
dim_feedforward=512 # pf_dim
dropout=0.1

model = Seq2Seq(in_dim, out_dim, d_model = d_model, nhead = nhead, enc_layers = num_encoder_layers, dec_layers = num_decoder_layers, ff_dim = dim_feedforward, dropout = dropout).to(device)
# torch.nn.Transformer(d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, activation=<function relu>, custom_encoder=None, custom_decoder=None, layer_norm_eps=1e-05, batch_first=False, norm_first=False, bias=True, device=None, dtype=None)
# torch.nn.Transformer(d_model=512, nhead=8, num_encoder_layers=6, num_decoder_layers=6, dim_feedforward=2048, dropout=0.1)


In [24]:
# @title gpt
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoder(nn.Module):
    """Add fixed sinusoidal position encodings to batch-first embeddings, then apply dropout.

    Even feature columns hold sin(pos * freq) and odd columns cos(pos * freq).
    The table is precomputed once and stored as the non-trainable buffer `pe`
    of shape [1, max_seq_length, emb_dim].

    Args:
        emb_dim: width of the embeddings (last dimension of the input); odd widths are supported.
        max_seq_length: number of positions precomputed in the table.
        dropout: dropout probability applied to the summed output.
    """
    def __init__(self, emb_dim, max_seq_length=512, dropout=0.1):
        super(PositionalEncoder, self).__init__()
        self.emb_dim = emb_dim
        self.drop = nn.Dropout(dropout)
        pe = torch.zeros(max_seq_length, emb_dim)
        pos = torch.arange(0, max_seq_length).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, emb_dim, 2) * -(math.log(10000.0) / emb_dim))
        pe[:, 0::2] = torch.sin(pos * div_term)
        # For an odd emb_dim there is one fewer odd column than even column, so trim the
        # frequency vector; for an even emb_dim the slice is a no-op.
        pe[:, 1::2] = torch.cos(pos * div_term[: emb_dim // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """x: [batch_size, seq_len, emb_dim]; the table is sliced to seq_len and broadcast over the batch."""
        return self.drop(x + self.pe[:, : x.size(1)])

class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention (batch-first) with bias-free q/k/v projections.

    Args:
        d_model: model width; must be divisible by n_heads.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights (default 0).

    Raises:
        ValueError: if d_model is not divisible by n_heads.
    """
    def __init__(self, d_model, n_heads, dropout=0):
        super(MultiHeadAttention, self).__init__()
        # Without this check a non-divisible d_model can still pass through .view(...)
        # for some sequence lengths and silently produce mis-shaped heads.
        if d_model % n_heads != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.q = nn.Linear(d_model, d_model,bias=False)
        self.k = nn.Linear(d_model, d_model,bias=False)
        self.v = nn.Linear(d_model, d_model,bias=False)
        self.fc_out = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        # Plain Python float rather than a tensor created on a global device: a tensor
        # attribute is neither a parameter nor a buffer, so it would not follow the module
        # through .to(...) and would tie this class to a module-level `device` variable.
        self.scale = math.sqrt(self.head_dim)

    def forward(self, query, key, value, mask=None):
        """Attend from `query` over `key`/`value` (each [batch, len, d_model]).

        Args:
            mask: optional tensor broadcastable to [batch, n_heads, query len, key len];
                positions where mask == 0 are excluded from attention.

        Returns:
            (x, attention): x is [batch, query len, d_model]; attention holds the softmax
            weights (before dropout), [batch, n_heads, query len, key len].
        """
        batch_size = query.shape[0]

        # Project, split the feature dimension into heads, and move heads before the sequence axis.
        Q = self.q(query).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        K = self.k(key).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)
        V = self.v(value).view(batch_size, -1, self.n_heads, self.head_dim).transpose(1, 2)

        energy = torch.matmul(Q, K.transpose(2, 3)) / self.scale
        if mask is not None:
            # A large negative score drives the softmax weight of masked positions to ~0.
            energy = energy.masked_fill(mask == 0, -1e10)

        attention = torch.softmax(energy, dim=-1)
        x = torch.matmul(self.dropout(attention), V)
        # Merge the heads back into a single feature dimension.
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        x = self.fc_out(x)
        return x, attention

class Feedforward(nn.Module):
    """Position-wise feed-forward net: Linear -> ReLU -> Dropout -> Linear.

    Maps the last dimension d_model -> ff_dim -> d_model.
    """
    def __init__(self, d_model, ff_dim, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, ff_dim)
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(ff_dim, d_model)

    def forward(self, x):
        """Apply the two-layer network to the last dimension of `x`."""
        hidden = self.dropout(F.relu(self.linear1(x)))
        return self.linear2(hidden)

class EncoderLayer(nn.Module):
    """One post-norm encoder block: self-attention, then a feed-forward net.

    Each sub-layer output goes through dropout, is added to its input (residual),
    and the sum is layer-normalised. Dropout on the attention weights themselves
    is disabled (the attention module is built with dropout=0).
    """
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout=0)
        self.ff = Feedforward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask):
        """Return the transformed `src`; `src_mask` is forwarded to the attention module."""
        # Sub-layer 1: self-attention (the attention weights are discarded).
        attended, _ = self.self_attn(src, src, src, src_mask)
        src = self.norm1(src + self.dropout(attended))
        # Sub-layer 2: feed-forward.
        transformed = self.ff(src)
        return self.norm2(src + self.dropout(transformed))

class Encoder(nn.Module):
    """Stack of `n_layers` EncoderLayer blocks applied in sequence (no final norm)."""
    def __init__(self, d_model, n_layers, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        blocks = []
        for _ in range(n_layers):
            blocks.append(EncoderLayer(d_model, n_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, src, src_mask):
        """Feed `src` through every block, passing the same `src_mask` to each."""
        hidden = src
        for block in self.layers:
            hidden = block(hidden, src_mask)
        return hidden

class DecoderLayer(nn.Module):
    """One post-norm decoder block: self-attention, encoder attention, feed-forward.

    Each sub-layer output goes through dropout, is added to its input (residual),
    and the sum is layer-normalised. Both attention modules are built with dropout=0.
    """
    def __init__(self, d_model, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.self_attn = MultiHeadAttention(d_model, n_heads, dropout=0)
        self.enc_attn = MultiHeadAttention(d_model, n_heads, dropout=0)
        self.ff = Feedforward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Return the updated `trg` after attending to itself and to `enc_src`."""
        # Sub-layer 1: self-attention over the target sequence.
        self_out, _ = self.self_attn(trg, trg, trg, trg_mask)
        trg = self.norm1(trg + self.dropout(self_out))
        # Sub-layer 2: attention over the encoder output (queries come from the target).
        cross_out, _ = self.enc_attn(trg, enc_src, enc_src, src_mask)
        trg = self.norm2(trg + self.dropout(cross_out))
        # Sub-layer 3: feed-forward.
        ff_out = self.ff(trg)
        return self.norm3(trg + self.dropout(ff_out))

class Decoder(nn.Module):
    """Stack of `n_layers` DecoderLayer blocks applied in sequence (no final norm)."""
    def __init__(self, d_model, n_layers, n_heads, ff_dim, dropout=0.1):
        super().__init__()
        blocks = []
        for _ in range(n_layers):
            blocks.append(DecoderLayer(d_model, n_heads, ff_dim, dropout))
        self.layers = nn.ModuleList(blocks)

    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Feed `trg` through every block; each block attends to the same `enc_src`."""
        hidden = trg
        for block in self.layers:
            hidden = block(hidden, enc_src, trg_mask, src_mask)
        return hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder transformer over token ids; masks are supplied by the caller.

    Args:
        in_dim: source vocabulary size.
        out_dim: target vocabulary size (also the width of the output logits).
        d_model: embedding / hidden width.
        nhead: number of attention heads.
        enc_layers, dec_layers: number of encoder / decoder blocks.
        ff_dim: hidden width of the feed-forward sub-layers.
        dropout: dropout probability.
    """
    def __init__(self, in_dim, out_dim, d_model=512, nhead=8, enc_layers=3, dec_layers=3, ff_dim=512, dropout=0.1):
        super().__init__()
        self.encoder = Encoder(d_model, enc_layers, nhead, ff_dim, dropout)
        self.decoder = Decoder(d_model, dec_layers, nhead, ff_dim, dropout)
        self.pos_enc = PositionalEncoder(d_model, dropout=dropout)
        self.src_tok_emb = nn.Embedding(in_dim, d_model)
        self.trg_tok_emb = nn.Embedding(out_dim, d_model)
        self.d_model = d_model
        self.fc_out = nn.Linear(d_model, out_dim)

        # Xavier-initialise every parameter with more than one dimension (weight matrices,
        # embedding tables); 1-D parameters keep their default initialisation.
        matrices = (p for p in self.parameters() if p.dim() > 1)
        for weight in matrices:
            nn.init.xavier_uniform_(weight)

    def _embed(self, table, tokens):
        """Look up `tokens` in `table`, scale by sqrt(d_model), and add positional encoding."""
        return self.pos_enc(table(tokens) * math.sqrt(self.d_model))

    def forward(self, src, trg, src_mask=None, trg_mask=None):
        """Return vocabulary logits for `trg` given `src`; masks are passed through to attention."""
        src_emb = self._embed(self.src_tok_emb, src)
        trg_emb = self._embed(self.trg_tok_emb, trg)

        memory = self.encoder(src_emb, src_mask)
        decoded = self.decoder(trg_emb, memory, trg_mask, src_mask)
        return self.fc_out(decoded)

    def encode(self, src, src_mask):
        """Embed and encode source token ids; returns the encoder output."""
        return self.encoder(self._embed(self.src_tok_emb, src), src_mask)

    def decode(self, trg, memory, trg_mask, src_mask):
        """Embed target token ids, decode against `memory`, and project to vocabulary logits."""
        decoded = self.decoder(self._embed(self.trg_tok_emb, trg), memory, trg_mask, src_mask)
        return self.fc_out(decoded)


# Vocabulary sizes determine the embedding table sizes and the output logit width.
in_dim, out_dim = len(de_vocab), len(en_vocab)

# Build the model and move it to the selected device.
model = Seq2Seq(
    in_dim, out_dim,
    d_model=512, nhead=8, enc_layers=3, dec_layers=3, ff_dim=512, dropout=0.1,
)
model = model.to(device)


In [21]:
# @title mask translate

def make_src_mask(src):
    """Padding mask for a batch of source ids.

    True where the token is not PAD_IDX. Two singleton axes are inserted after the
    batch axis, so a [batch_size, src_len] input yields [batch_size, 1, 1, src_len].
    The result is moved to the global `device`.
    """
    not_pad = src != PAD_IDX
    return not_pad[:, None, None].to(device)

def make_trg_mask(trg):
    """Combined padding + subsequent mask for a batch of target ids.

    For a [batch_size, trg_len] input the result is a bool tensor of shape
    [batch_size, 1, trg_len, trg_len]: entry (q, k) is True when key k is not
    PAD_IDX and k <= q (lower triangle).
    """
    seq_len = trg.shape[1]
    # Lower-triangular matrix: query position q may see key positions 0..q.
    causal = torch.ones((seq_len, seq_len), device=device).tril().bool()
    not_pad = (trg != PAD_IDX)[:, None, None].to(device)
    return not_pad & causal

def translate(model, src_sentence):
    """Greedy-decode an English translation of a German sentence.

    The model is switched to eval mode. Decoding starts from BOS_IDX and repeatedly
    appends the arg-max token of the last position until EOS_IDX is produced or
    `source length + 5` tokens have been generated.

    Args:
        model: callable as model(src, trg, src_mask, trg_mask) returning logits of
            shape [1, trg_len, vocab].
        src_sentence: raw German sentence.

    Returns:
        The predicted tokens joined by single spaces, without <bos>/<eos>.
    """
    model.eval()
    src = de_transform(src_sentence).view(1,-1).to(device)
    src_mask = make_src_mask(src) # loop-invariant, so computed once
    max_len = src.shape[1]+5
    trg_indexes = [BOS_IDX]
    with torch.no_grad():
        for _ in range(max_len):
            trg_tensor = torch.tensor(trg_indexes, dtype=torch.long, device=device).unsqueeze(0)
            output = model(src, trg_tensor, src_mask, make_trg_mask(trg_tensor))
            pred_token = output.argmax(2)[:,-1].item()
            # Stop before storing <eos>. Previously the last element was always sliced
            # off, which silently dropped a real token whenever decoding hit max_len
            # without ever producing <eos>.
            if pred_token == EOS_IDX: break
            trg_indexes.append(pred_token)
    return " ".join(en_vocab.lookup_tokens(trg_indexes[1:]))

# UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3 # unknown, pad, beginning, end of sentence
print(translate(model, "Eine Gruppe von Menschen steht vor einem Iglu ."))


rowers rowers rowers rowers rowers rowers rowers rowers rowers rowers rowers Wheels Wheels Wheels Wheels


In [22]:
# @title train test

def train(model, dataloader, optimizer, loss_fn):
    """Run one training epoch with teacher forcing and return the mean batch loss.

    For each batch the decoder input is the target without its last token and the
    loss target is the target without its first token.

    Args:
        model: callable as model(src, trg_input, src_mask, trg_mask).
        dataloader: iterable of (src, trg) id tensors, batch-first.
        optimizer: optimizer over the model's parameters.
        loss_fn: loss taking ([N, vocab] logits, [N] targets).

    Returns:
        Mean loss over the batches seen, or NaN if the dataloader yielded no batches.
    """
    model.train()
    total_loss = 0.0
    num_batches = 0
    for src, trg in dataloader:
        src, trg = src.to(device), trg.to(device) #trg = [batch size, trg len]
        trg_input = trg[:,:-1]
        src_mask, trg_mask = make_src_mask(src), make_trg_mask(trg_input)
        output = model(src, trg_input, src_mask, trg_mask) #output = [batch size, trg len - 1, output dim]
        optimizer.zero_grad()
        loss = loss_fn(output.reshape(-1, output.shape[-1]), trg[:,1:].reshape(-1))
        loss.backward()
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 1) # from og 6attsalluneed
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # Count batches while iterating: len(list(dataloader)) would run the whole dataset
    # through tokenisation and collation a second time just to obtain a length.
    return total_loss / num_batches if num_batches else float("nan")

def test(model, dataloader, loss_fn):
    """Evaluate the model on a dataloader (no gradients) and return the mean batch loss.

    Uses the same teacher-forcing setup as training: the decoder input is the target
    without its last token and the loss target is the target without its first token.

    Args:
        model: callable as model(src, trg_input, src_mask, trg_mask).
        dataloader: iterable of (src, trg) id tensors, batch-first.
        loss_fn: loss taking ([N, vocab] logits, [N] targets).

    Returns:
        Mean loss over the batches seen, or NaN if the dataloader yielded no batches.
    """
    model.eval()
    epoch_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for src, trg in dataloader:
            src, trg = src.to(device), trg.to(device) #trg = [batch size, trg len]
            trg_input = trg[:,:-1]
            src_mask, trg_mask = make_src_mask(src), make_trg_mask(trg_input)
            output = model(src, trg_input, src_mask, trg_mask) #output = [batch size, trg len - 1, output dim]
            # Flatten batch and time so every target position is one classification example.
            loss = loss_fn(output.reshape(-1, output.shape[-1]), trg[:,1:].reshape(-1))
            epoch_loss += loss.item()
            num_batches += 1
    # Count batches while iterating: len(list(dataloader)) would run the whole dataset
    # through tokenisation and collation a second time just to obtain a length.
    return epoch_loss / num_batches if num_batches else float("nan")


In [23]:
# @title run
# import math
import time
# Padding positions in the target do not contribute to the loss.
loss_fn = nn.CrossEntropyLoss(ignore_index = PAD_IDX)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9) # lr=0.0001

for epoch in range(20):
    start_time = time.time()
    train_loss = train(model, train_loader, optimizer, loss_fn)
    val_loss = test(model, val_loader, loss_fn)
    end_time = time.time()
    elapsed = end_time - start_time
    print(f"Epoch: {epoch+1}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, Epoch time = {elapsed:.3f}s")
    # Translate a fixed sample sentence after every epoch as a qualitative progress check.
    print(translate(model, "Eine Gruppe von Menschen steht vor einem Iglu ."))
#  A group of people standing in front of an igloo

# sine pos enc, scale after pos
# Epoch: 20, Train loss: 3.238, Val loss: 3.504, Epoch time = 44.329s
# A person is playing a trick in a race .

# sine pos enc, token,scale,pos
# Epoch: 20, Train loss: 1.506, Val loss: 2.149, Epoch time = 43.637s
# A crowd of people of a crowd .

# scale, gelu, sine pos enc
# Epoch: 20, Train loss: 1.547, Val loss: 2.114, Epoch time = 41.686s
# A group of people are standing in a doorway .

# relu posenc,sqrt
# Epoch: 20, Train loss: 1.334, Val loss: 2.157, Epoch time = 43.022s
# A group of people are standing in front of a crowd .

# relu posenc nosqurt
# Epoch: 20, Train loss: 0.990, Val loss: 1.932, Epoch time = 42.096s
# A group of people are standing in front of a house .

# no nograd
# Epoch: 20, Train loss: 0.991, Val loss: 1.908, Epoch time = 42.257s
# A group of people are standing in front of a performance .

# norm end of enc/dec

# gpt
# Epoch: 20, Train loss: 0.731, Val loss: 2.005, Epoch time = 43.226s
# A crowd of people in front of them in a classroom .

# very down
# Epoch: 20, Train loss: 0.991, Val loss: 1.940, Epoch time = 42.046s
# A crowd of people in front of a crowd .

# mha no drop; from 19
# Epoch: 20, Train loss: 0.897, Val loss: 1.941, Epoch time = 41.742s
# A group of people stand in front of an igloo

# kqv no bias




Epoch: 1, Train loss: 5.386, Val loss: 4.193, Epoch time = 41.734s
A woman in a blue shirt is shirt is playing a blue .
Epoch: 2, Train loss: 3.886, Val loss: 3.502, Epoch time = 41.581s
A group of people are standing in front of a crowd .
Epoch: 3, Train loss: 3.323, Val loss: 3.073, Epoch time = 41.833s
A group of people standing in front of a crowd .
Epoch: 4, Train loss: 2.919, Val loss: 2.772, Epoch time = 41.500s
A group of people standing in front of a building .
Epoch: 5, Train loss: 2.612, Val loss: 2.570, Epoch time = 42.001s
A group of people standing in front of a tree .
Epoch: 6, Train loss: 2.361, Val loss: 2.421, Epoch time = 41.616s
A group of people standing in front of a pool .
Epoch: 7, Train loss: 2.147, Val loss: 2.282, Epoch time = 42.051s
A group of people stand in front of a pile .
Epoch: 8, Train loss: 1.972, Val loss: 2.186, Epoch time = 41.474s
A group of people stand in front of a store .
Epoch: 9, Train loss: 1.824, Val loss: 2.110, Epoch time = 42.049s
A g

In [None]:
# @title inference
# German sample sentences; the trailing comment on each line is the reference translation.
sample_sentences = [
    "Eine Gruppe von Menschen steht vor einem Iglu .", # A group of people stand in front of an igloo .
    "Ein Koch in weißer Uniform bereitet Essen in einer Restaurantküche zu .", # A chef in a white uniform prepares food in a restaurant kitchen .
    "Zwei junge Mädchen spielen Fußball auf einem Feld. .", # Two young girls play soccer on a field. .
    "Eine Frau mit Hut und Sonnenbrille steht am Strand .", # A woman wearing a hat and sunglasses stands on the beach .
    "Zwei Freunde lachen und genießen ein Eis auf einer wunderschönen Wiese .", # Two friends laugh and enjoy ice cream on a beautiful meadow .
]
for sentence in sample_sentences:
    print(translate(model, sentence))


A group of people standing up .
A cooking in a cooking in a cooking .
Two
A woman is standing with her hat on .
Two hockey players on a rink and take a hockey .
