# Daten Vorbereitung

In [1]:
"""!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm"""

'!python -m spacy download de_core_news_sm\n!python -m spacy download en_core_web_sm'

In [1]:
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import vocab
from torchtext.utils import download_from_url, extract_archive
import spacy
import io
import utils
from tqdm import tqdm
import numpy as np

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Get English and German Language Data

In [4]:
"""url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def build_vocab(filepath, tokenizer):
    unk_token = '<unk>'
    counter = Counter()
    with io.open(filepath, encoding="UTF-8") as f:
        for string_ in f:
            counter.update(tokenizer(string_))
    v = vocab(counter, specials=["<unk>", "<pad>", "<bos>", "<eos>"])
    # v.set_default_index(default_index)
    v.set_default_index(v[unk_token])
    return v

de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

def data_process(filepaths):
  raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
  raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
  data = []
  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de.rstrip("\n"))],
                            dtype=torch.long)
    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en.rstrip("\n"))],
                            dtype=torch.long)
    data.append((de_tensor_, en_tensor_))
  return data

train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)"""

url_base = 'https://raw.githubusercontent.com/multi30k/dataset/master/data/task1/raw/'
train_urls = ('train.de.gz', 'train.en.gz')
val_urls = ('val.de.gz', 'val.en.gz')
test_urls = ('test_2016_flickr.de.gz', 'test_2016_flickr.en.gz')

train_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in train_urls]
val_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in val_urls]
test_filepaths = [extract_archive(download_from_url(url_base + url))[0] for url in test_urls]

de_tokenizer = get_tokenizer('spacy', language='de_core_news_sm')
en_tokenizer = get_tokenizer('spacy', language='en_core_web_sm')

def build_vocab(filepath, tokenizer):
  unk_token = '<unk>'
  counter = Counter()
  with io.open(filepath, encoding="utf8") as f:
    for string_ in f:
      counter.update(tokenizer(string_))
  v = vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
  v.set_default_index(v[unk_token])
  return v

de_vocab = build_vocab(train_filepaths[0], de_tokenizer)
en_vocab = build_vocab(train_filepaths[1], en_tokenizer)

def data_process(filepaths):
  raw_de_iter = iter(io.open(filepaths[0], encoding="utf8"))
  raw_en_iter = iter(io.open(filepaths[1], encoding="utf8"))
  data = []
  for (raw_de, raw_en) in zip(raw_de_iter, raw_en_iter):
    de_tensor_ = torch.tensor([de_vocab[token] for token in de_tokenizer(raw_de.rstrip("\n"))],
                            dtype=torch.long)
    en_tensor_ = torch.tensor([en_vocab[token] for token in en_tokenizer(raw_en.rstrip("\n"))],
                            dtype=torch.long)
    data.append((de_tensor_, en_tensor_))
  return data


train_data = data_process(train_filepaths)
val_data = data_process(val_filepaths)
test_data = data_process(test_filepaths)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


BATCH_SIZE = 128
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']

In [5]:
max(de_vocab.get_stoi().values()), max(en_vocab.get_stoi().values())

(19214, 10837)

### DataLoader

In [6]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [7]:
batch_size = 128
max_seq_len = 100 
PAD_IDX = de_vocab['<pad>']
BOS_IDX = de_vocab['<bos>']
EOS_IDX = de_vocab['<eos>']
print(PAD_IDX, BOS_IDX, EOS_IDX)

def generate_batch(data_batch):
    de_batch, en_batch = [], []
    for (de_item, en_item) in data_batch:
        
        pad_de = torch.cat([torch.tensor([BOS_IDX]), de_item, torch.tensor([EOS_IDX], dtype=int)], dim=0) #, torch.ones(max_seq_len - (de_item.size()[0]+2))
        de_batch.append(pad_de)
        
        pad_en = torch.cat([torch.tensor([BOS_IDX]), en_item, torch.tensor([EOS_IDX], dtype=int)], dim=0) #, torch.ones(max_seq_len - (en_item.size()[0]+2))
        en_batch.append(pad_en)
    de_batch = pad_sequence(de_batch, padding_value=PAD_IDX, batch_first=True)  #, batch_first=True
    en_batch = pad_sequence(en_batch, padding_value=PAD_IDX, batch_first=True)  #, batch_first=True
    # print(de_batch.size(), en_batch.size())
    return de_batch, en_batch

train_iter = DataLoader(train_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch)

valid_iter = DataLoader(val_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch)

test_iter = DataLoader(test_data, batch_size=batch_size,
                        shuffle=True, collate_fn=generate_batch)

1 2 3


# Transformer Encoder Decoder

In [8]:
# Benötigte Pakete der Pytroch Bibliothek und Numpy
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Multihead Attention:

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, mask=None):
        super().__init__()

        self.mask = mask
        self.num_heads = num_heads # Anzahl der Attention Heads
        self.emb_size = d_model # Dimension der Einbettungen

        d_xq = d_xk = d_xv = self.emb_size # Dimension der Einbetung Keys, Queries und Values zuweisen 

        self.d_head = d_model//num_heads # reduzierte Dimension der einzelnen Attention-Heads

        # Zuerst sind die Matrizen noch in der normalen Dimension der Einbettungen und werden noch gesplittet
        self.W_Q = nn.Linear(d_xq, d_xq, bias=False) # Gewichtsmatrix Queries
        self.W_K = nn.Linear(d_xk, d_xk, bias=False) # Gewichtsmatrix Keys
        self.W_V = nn.Linear(d_xv, d_xv, bias=False) # Gewichtsmatrix Values

        # Die Ausgabe aller Subschichten müssen der Dimension d_model=emb_size=seq_len entsprechen
        self.W_H = nn.Linear(self.emb_size, self.emb_size) #Ursprünglich: self.W_H = nn.Linear(self.emb_size, self.seq_len)'

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        seq_len = Q.size(2)
        if mask is None:
            scores = torch.matmul(Q, K.transpose(2, 3)) / np.sqrt(self.d_head)
        else:
            scores = torch.matmul(Q, K.transpose(2, 3)) / np.sqrt(self.d_head)
            mask = make_mask(seq_len)
            scores = scores + mask

        # Reihenweise Anweundung der Softmax-Funktion in der Attention-Matrix
        A = nn.Softmax(dim=-1)(scores)
        # Gewichten der Values mit der Attention-Matrix
        H = torch.matmul(A, V)

        return H
    
    def split_heads(self, x, batch_size, seq_len):
        """ 
        x_split = x.view(batch_size,self.seq_len, self.num_heads, self.d_head).transpose(1, 2)
        print(x_split)
        return x_split
        """
        
        #print("x:", x.size())
        #print("x flat", x.flatten().size())self.seq_len
        
        x_view = x.view(batch_size, seq_len, self.num_heads, self.d_head) #(batch_size x seq_len x num_heads x head_dim)
        return x_view.transpose(1,2)
    
    def group_heads(self, x, batch_size):
        return x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_head)
    
    def forward(self, X_q, X_k, X_v):
        batch_size = X_q.size()[0]
        Q = self.split_heads(self.W_Q(X_q), batch_size, X_q.size()[1]) # (bs, n_heads, q_length, dim_per_head)
        K = self.split_heads(self.W_K(X_k), batch_size, X_k.size()[1]) # (bs, n_heads, k_length, dim_per_head)
        V = self.split_heads(self.W_V(X_v), batch_size, X_v.size()[1]) # (bs, n_heads, v_length, dim_per_head)

        H_head = self.scaled_dot_product_attention(Q, K, V, mask=self.mask)
        H_cat = self.group_heads(H_head, batch_size)
        
        H = self.W_H(H_cat)

        return H

### FNN

In [10]:
# Erstellen des vollvernetzten Netzwerks
class FFN(nn.Module):
    def __init__(self, d_model, hidden_dim):
        super().__init__()
        self.Layer1 = nn.Linear(d_model, d_model*hidden_dim) 
        self.Layer2 = nn.Linear(d_model*hidden_dim, d_model) 
        self.activation = nn.ReLU()
    
    def forward(self, x):
        h1 = self.Layer1(x)
        h2 = self.Layer2(h1)
        out = self.activation(h2)
        return out

### Einbettungen

In [11]:
# Erstellen der Wort- und Positionseinbettungen 
class Embeddings(nn.Module):
    def __init__(self, d_model, vocab_size, src_pad_idx):
        super().__init__()
        self.word_embedings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model, padding_idx=src_pad_idx) 
        
        self.layernorm = nn.LayerNorm(d_model, eps=1e-12)
        self.emb_dim = d_model

    
    def positional_embedding(self, seq_len, emb_size):
        pos_array = np.array([
            [p/np.power(10000, 2 * (i//2)/emb_size) for i in range(emb_size)]
            for p in range(seq_len)])
        
        pos_array[:, 0::2] = np.sin(pos_array[:, 0::2])
        pos_array[:, 1::2] = np.cos(pos_array[:, 1::2])
        pos_emb = torch.from_numpy(pos_array)
        pos_emb.requires_grad = False
        pos_emb = pos_emb.to(device)
        #self.register_buffer("pos_emb", pos_emb)
        
        return pos_emb

    def forward(self, input_ids):
        # print(input_ids.size())
        seq_len = input_ids.size()[1]
        # Zuweisen der Worteinbettungen
        word_emb = self.word_embedings(input_ids.long()) * np.sqrt(self.emb_dim)

        # Zuweisen der Positionseinbettungen
        position_emb = self.positional_embedding(seq_len, self.emb_dim)
        # print("Dim Encoded Seq: ", word_emb.size())
        # print("Wordembedding size: ", word_emb.size(), "Wordembedding size: ",position_emb.size())
        # Einbettungen Addieren
        # print("Word Embeddings: ", word_emb.size(), "Pos Embeding: ", position_emb.size())
        
        embeddings = word_emb + position_emb
        # print("Embeddings done")
        # print("Final Embedding DIm: ", embeddings.size())
        # Normaliesieren der Einbettungen
        embeddings = self.layernorm(embeddings.float())
        

        return embeddings

import plotly.express as px
import plotly.io as pio
pio.renderers
pio.renderers.default = "notebook_connected"


seq_len = 100
emb_size = 512

pos_array = np.array([
        [p/np.power(10000, 2 * (i//2)/emb_size) for i in range(emb_size)]
        for p in range(seq_len)])
    
pos_array[:, 0::2] = np.sin(pos_array[:, 0::2])
pos_array[:, 1::2] = np.cos(pos_array[:, 1::2])
pos_emb = torch.from_numpy(pos_array)
pos_emb.requires_grad = False
pos_emb = pos_emb.unsqueeze(0)#.to(device)
pos_emb.shape

### Masking

In [12]:
def make_mask(seq_len):
    mask = torch.zeros(seq_len, seq_len)
    for i in range(seq_len):
        for j in range(seq_len):
            if j > i: mask[i][j] = 1
    return mask.masked_fill(mask==1, float('-inf')).unsqueeze(0).to(device)


### Encoder

In [13]:
# Erstellen einer Encoder-Schicht
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, ff_h_dim):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = FFN(d_model, ff_h_dim)

        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
    
    def forward(self, x):
        # print("X in: ", x.size())
        mha_out = self.mha(x, x, x)
        # print("mha out: ",mha_out.size())
        out1 = self.layernorm1(x + mha_out) # x + mha_out = Residuen Verbindung
        
        ffn_out = self.ffn(out1) 
        out2 = self.layernorm2(out1 + ffn_out)

        return out2

# Erstellen des Encoders
class Encoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, ff_h_dim, vocab_size, src_pad_idx):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = Embeddings(d_model, vocab_size, src_pad_idx)
        
        self.enc_layers = nn.ModuleList()
        for _ in range(num_layers):
            self.enc_layers.append(EncoderLayer(d_model, num_heads, ff_h_dim))
    
    def forward(self, x):
        x = self.embedding(x)
        
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        
        return x

### Decoder

In [14]:
# Erstellen der Decoder Schicht
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, dff):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads, mask=True)
        self.ffn = FFN(d_model, dff)
        self.cmha = MultiHeadAttention(d_model, num_heads)
        
        self.layernorm1 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm2 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
        self.layernorm3 = nn.LayerNorm(normalized_shape=d_model, eps=1e-6)
    
    def forward(self, x_dec, x_enc):
        mha_out = self.mha(x_dec, x_dec, x_dec)
        out1 = self.layernorm1(mha_out + x_dec)
        cmha_out = self.cmha(out1, x_enc, x_enc)
        out2 = self.layernorm2(cmha_out + out1)
        ffn_out = self.ffn(out2)
        out3 = self.layernorm3(ffn_out + out2)

        return out3 

# Erstellen des Encoders
class Decoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, ff_h_dim, trg_vocab_size, src_pad_idx):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.trg_vocab_size = trg_vocab_size

        self.trg_embedding = Embeddings(d_model, trg_vocab_size, src_pad_idx)

        self.dec_layers = nn.ModuleList()
        
        for _ in range(num_layers):
            self.dec_layers.append(DecoderLayer(d_model, num_heads, ff_h_dim))
    
    def forward(self, x_trg, x_src):
        x = self.trg_embedding(x_trg.long())

        for i in range(self.num_layers):
            x = self.dec_layers[i](x, x_src)
        
        return x

### Encoder Decoder 

In [15]:
class TransformerEncoderDecoder(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, ff_h_dim, src_vocab_size, trg_vocab_size, src_pad_idx):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, ff_h_dim, src_vocab_size, src_pad_idx)
        self.decoder = Decoder(num_layers, d_model, num_heads, ff_h_dim, trg_vocab_size, src_pad_idx)
        self.class_layer = nn.Linear(d_model, trg_vocab_size)

    def forward(self, x_src, x_trg):
        # print("Start Encoder")        
        x_enc = self.encoder(x_src)
        # print("Start Decoder")
        x_dec = self.decoder(x_trg, x_enc)
        x = self.class_layer(x_dec)
        
        return x

# Training

### Data Preprocessing

In [16]:
# Set Training Parameters
num_epochs = 10
lr = 0.001

# Set Model Parameters
src_vocab_size = len(en_vocab)+10000#max(en_vocab.get_stoi().values()) + 10000 # 30000
trg_vocab_size = len(de_vocab)+10000# max(de_vocab.get_stoi().values()) + 2000 # 35000
embedding_size = 512
num_heads = 8
num_layers = 3
max_len = max_seq_len 
expansion_factor = 4
src_pad_idx = de_vocab.get_stoi()["<pad>"]
print(src_vocab_size, trg_vocab_size)

20838 29215


In [17]:
# Erstellen eines Pseudo Modells
transformer = TransformerEncoderDecoder(num_layers=num_layers,
                                    d_model=embedding_size,
                                    num_heads=num_heads,
                                    ff_h_dim=expansion_factor,
                                    src_vocab_size=src_vocab_size,
                                    trg_vocab_size=trg_vocab_size, 
                                    src_pad_idx=src_pad_idx)
                                    
for p in transformer.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

transformer.to(device)

TransformerEncoderDecoder(
  (encoder): Encoder(
    (embedding): Embeddings(
      (word_embedings): Embedding(20838, 512, padding_idx=1)
      (layernorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
    )
    (enc_layers): ModuleList(
      (0): EncoderLayer(
        (mha): MultiHeadAttention(
          (W_Q): Linear(in_features=512, out_features=512, bias=False)
          (W_K): Linear(in_features=512, out_features=512, bias=False)
          (W_V): Linear(in_features=512, out_features=512, bias=False)
          (W_H): Linear(in_features=512, out_features=512, bias=True)
        )
        (ffn): FFN(
          (Layer1): Linear(in_features=512, out_features=2048, bias=True)
          (Layer2): Linear(in_features=2048, out_features=512, bias=True)
          (activation): ReLU()
        )
        (layernorm1): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
        (layernorm2): LayerNorm((512,), eps=1e-06, elementwise_affine=True)
      )
      (1): EncoderLayer(
     

In [18]:
print(f"Num total Parameter: ", sum(p.numel() for p in transformer.parameters()))
print(f"Num total trainable Parameter: ", sum(p.numel() for p in transformer.parameters() if p.requires_grad))

Num total Parameter:  62671903
Num total trainable Parameter:  62671903


In [19]:
load_model = False
save_model = True

optimizer = torch.optim.Adam(
    transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9
)
# optimizer = torch.optim.Adam(transformer.parameters(), lr) 

pad_idx = de_vocab.get_stoi()["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

### Training Neu

In [20]:
def greedy_search(model: nn.Module, src, start_symbol):
    seq_len = src.size(1)
    src = src.to(device)
    out = model.encoder(src)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)
    for _ in range(seq_len-1):
        enc_out = out.to(device)
        #print("Enc out:", enc_out.size())
        dec_out = model.decoder(ys, enc_out)
        #print("Dec out:", dec_out.size())
        prob = model.class_layer(dec_out[:, -1]) #

        _, next_word = torch.max(prob, dim = 1)
        next_word = next_word.item()
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
        if next_word == EOS_IDX:
            break
    return ys         

def translate(model, src, src_vocab, trg_vocab, src_tokenizer):
    model.eval()
    tokens = [BOS_IDX] + [src_vocab.get_stoi()[token] for token in src_tokenizer(src)] + [EOS_IDX]
    seq_len = len(tokens)
    src = (torch.LongTensor(tokens).reshape(1, seq_len))
    trg_tokens = greedy_search(model, src, start_symbol=BOS_IDX).flatten()
    return " ".join([trg_vocab.get_itos()[token] for token in trg_tokens])#.replace("<bos>", "").replace("<eos>", "")

In [21]:
def evaluate(valid_data, batch_size):
    num_batches = batch_size
    eval_acc = 0
    for batch in valid_data:
        src_lang = batch[0].to(device)
        trg_lan = batch[1].to(device)
        out = transformer(src_lang, trg_lan[:, 1:])
        eval_acc += (out.argmax(1)==trg_lan).cpu().numpy().mean()
    
    print(f"Validierungs Accuracy: {eval_acc/num_batches}")

In [22]:
sen = "We came a long way."

In [23]:
if load_model:
    torch.load("model.pt")

for epoch in tqdm(range(num_epochs)):
    print(f"[Epoch {epoch} / {num_epochs}]")

    if save_model:
        checkpoint = {
            "state_dict": transformer.state_dict(),
            "optimizer": optimizer.state_dict()
        }
        utils.save_checkpoint(checkpoint)
        
    transformer.train()
    losses = 0
    train_acc = 0

    for i, (src, trg) in enumerate(train_iter):
        print(src.size(), trg.size())
        src, trg = src.to(device), trg.to(device) # (bs x seq_len_en), (bs x seq_len_le)
        trg_input = trg[:, :-1]
        # print(" ".join([en_vocab.get_itos()[tok] for tok in trg[1]]).replace("<bos>", "").replace("<eos>", ""))
        # print(" ".join([de_vocab.get_itos()[tok] for tok in src[1]]).replace("<bos>", "").replace("<eos>", ""))

        output = transformer(src, trg_input) # (bs x trg_seq_len x trg_vocab_size)
        
        output = output.view(output.size()[0]*output.size()[1],-1) # (bs * trg_seq_len x trg_vocab_size)
        target = trg[:, 1:].reshape(-1) # (trg_seq_len)

        optimizer.zero_grad()
        loss = criterion(output, target)
        
        losses += loss.item()

        # Back prop
        loss.backward()
        
        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(transformer.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        train_acc += (output.argmax(1) == target).cpu().numpy().mean()/output.size()[0]
        translation = translate(transformer, sen, en_vocab, de_vocab, en_tokenizer)
        print(f"Testübersetzung: ", translation)
        print(f"Loss of Epoch {epoch} Batch {i}: ", loss.item())

    print(f"Fehler in der Epoche {epoch} ist {losses / num_epochs}")
    print(f"Training Accuracy: {train_acc / num_epochs}")
    print('Performance mit Validierungsdaten:')
    ## evaluate(valid_data)   

  0%|          | 0/10 [00:00<?, ?it/s]

[Epoch 0 / 10]
=> Saving checkpoint
torch.Size([128, 30]) torch.Size([128, 31])
Testübersetzung:  <bos> ein ein ein ein ein ein ein
Loss of Epoch 0 Batch 0:  10.313152313232422
torch.Size([128, 27]) torch.Size([128, 28])
Testübersetzung:  <bos> ein ein ein ein ein ein ein
Loss of Epoch 0 Batch 1:  9.963177680969238
torch.Size([128, 26]) torch.Size([128, 31])
Testübersetzung:  <bos> ein ein ein ein ein ein ein
Loss of Epoch 0 Batch 2:  9.781220436096191
torch.Size([128, 32]) torch.Size([128, 25])
Testübersetzung:  <bos> ein ein ein ein ein ein ein
Loss of Epoch 0 Batch 3:  9.67807388305664
torch.Size([128, 34]) torch.Size([128, 26])
Testübersetzung:  <bos> ein ein ein ein ein ein ein
Loss of Epoch 0 Batch 4:  9.58344841003418
torch.Size([128, 31]) torch.Size([128, 31])


  0%|          | 0/10 [02:03<?, ?it/s]

KeyboardInterrupt

