In [58]:
import re, json, torch, torch.nn as nn
from torch.utils.data import DataLoader

path = "./deu.txt"

lines = open(path, encoding="utf-8").read().strip().split("\n")
lines = lines[:20000]

pairs = [ln.split("\t")[:2] for ln in lines] 
src_texts, tgt_texts = zip(*pairs)

In [59]:
src_texts[:5], tgt_texts[:5]
PAD, UNK, BOS, EOS = 0, 1, 2, 3 # Spezialtokens 
# PAD = Padding, UNK = Unknown, 
# BOS = Begin of Sequence, EOS = End of Sequence
VOCAB_SIZE = 20004
def tokenize(s): return re.findall(r"\b\w+\b", s.lower())
def build_vocab(texts, max_tokens=VOCAB_SIZE):
    from collections import Counter
    freq = Counter(tok for t in texts for tok in tokenize(t))
    itos = ["<pad>", "<unk>", "<bos>", "<eos>"] + [w for w,_ in freq.most_common(max_tokens-4)]
    return {w:i for i,w in enumerate(itos)}, itos
src_texts_vocab, src_itos = build_vocab(src_texts)
tgt_texts_vocab, tgt_itos = build_vocab(tgt_texts)

src_t = "find a job"
tgt_t = "Grüß Gott, Tom"
#src_t = ("<bos>",1,2,3,4,"<pad>","<pad>","<eos>") 
#tgt_t = ("<bos>",5,3,6,2,3,4, "<eos>")


def vectorize(text, stoi, max_len, add_bos_eos=False):
    ids = [stoi.get(tok, UNK) for tok in tokenize(text)]
    if add_bos_eos: ids = [BOS] + ids + [EOS]
    ids = ids[:max_len]
    if len(ids) < max_len: ids += [PAD]*(max_len-len(ids))
    return ids




max_src, max_tgt = 30, 30  # Maximale Länge 

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src = torch.tensor([vectorize(t, src_texts_vocab, max_src) for t in src_batch])
    tgt = torch.tensor([vectorize(t, tgt_texts_vocab, max_tgt, add_bos_eos=True) for t in tgt_batch])
    tgt_in, tgt_out = tgt[:, :-1], tgt[:, 1:]
    return src, tgt_in, tgt_out

dataset = list(zip(src_texts, tgt_texts))
loader = DataLoader(dataset, batch_size= 64, shuffle=True, collate_fn=collate_fn)


In [60]:
emb_dim, hid_dim = 128, 256

class Encoder(nn.Module):
    def __init__(self, vocab, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab, emb_dim , padding_idx=PAD)
        self.rnn = nn.RNN(emb_dim, hid_dim , batch_first=True) # [B, S, E] 
# für das Mini-Batch Training [B, S, E]
# B ... Anzahl der Sätze im Batch 
# S ... Länge der Sequenz
# E ... Embedding-Dimension 
# ohne batch_first [S, B, E]
    def forward(self,x):
        x = self.embedding(x)
        _, hidden = self.rnn(x)
        return hidden # Hidden State 

class Decoder(nn.Module):
    def __init__(self, vocab, emb_dim, hid_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab, emb_dim, padding_idx=PAD)
        self.rnn = nn.RNN(emb_dim, hid_dim, num_layers=1 , batch_first=True)
        self.fc = nn.Linear(hid_dim, vocab) # 
    
    def forward(self, x, h):
        x = self.embedding(x)
        out, h = self.rnn(x,h) 
        return self.fc(out), h # logits, hidden_state
 
class Seq2Seq(nn.Module):
    def __init__(self, enc, dec):
        super().__init__()
        self.enc = enc
        self.dec = dec 
    
    def forward(self, src, tgt_in_dec):
    # src ... englische Sätze, 
    # tgt_in_dec ... aktuelle deutsche Sätze als Eingabe für den Decoder 
        hidden_enc = self.enc(src)
        logits, _ = self.dec(tgt_in_dec, hidden_enc) 
        # Hidden State aus dem Encoder
        return logits 





In [61]:
source = "I am a boy."	
target = "Ich bin ein Junge."

# ----------

# Encoder [ "I am a boy." ] -> h
# Next Token Prediction 
# 1. Schritt Decoder [ h, <bos> ] -> "ich"
# 2. Schritt Decoder [ h, (<bos>,"ich")] -> "bin" 
# 3. Schritt Decoder [ h, (<bos>,"ich", "bin")] -> "ein"
# 4. Schritt Decoder [ h, (<bos>,"ich", "bin", "ein")] -> "Junge"
# 5. Schritt Decoder [ h, (<bos>,"ich", "bin", "ein", "Junge")] -> "<eos>"

# X = (h, (<bos>,"ich", "bin", "ein", "Junge")) -> Eingabe für Decoder 
# y = ("ich", "bin", "ein", "Junge", "<eos>") -> Zielvorgaben für den Loss 


In [None]:



device = torch.device("mps")

model = Seq2Seq(
    Encoder(len(src_texts_vocab), emb_dim=emb_dim, hid_dim=hid_dim),
    Decoder(len(tgt_texts_vocab), emb_dim=emb_dim, hid_dim=hid_dim),
                ).to(device)

crit = nn.CrossEntropyLoss(ignore_index=PAD)
opt = torch.optim.Adam(model.parameters(), lr = 1e-3)
epochs = 12

@torch.no_grad()
def translate(prompt, max_len=max_tgt):
    model.eval()
    src = torch.tensor([vectorize(prompt, src_texts_vocab, max_src)], device=device)
    h = model.enc(src)
    ys = torch.tensor([[BOS]], device=device)
    out_tokens = []
    for _ in range(max_len):
        logits, h = model.dec(ys, h)
        next_id = logits[0, -1].argmax().item()
        if next_id in (EOS, PAD): break
        out_tokens.append(next_id)
        ys = torch.cat([ys, torch.tensor([[next_id]], device=device)], dim=1)
    return " ".join(tgt_itos[t] for t in out_tokens)


for epoch in range(epochs):
    model.train()
    running_loss = 0.0 
    for src, tgt_in, tgt_out in loader:
        src, tgt_in, tgt_out = src.to(device), tgt_in.to(device), tgt_out.to(device)
        logits = model(src, tgt_in) 
        loss = crit(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
        opt.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Gradient-Clipping -> falls Gradient explodiert 
        opt.step()
        running_loss += loss.item()
    print(f"epoch {epoch+1}: loss {running_loss/len(loader):.4f}")
    print(translate("I will do my best."))


epoch 1: loss 4.5191
ich bin nicht
epoch 2: loss 3.8021
ich bin nicht
epoch 3: loss 3.4937
ich bin
epoch 4: loss 3.2774
ich bin
epoch 5: loss 3.1186
ich bin nicht
epoch 6: loss 2.9962
ich bin
epoch 7: loss 2.8955
ich bin
epoch 8: loss 2.8135
ich bin
epoch 9: loss 2.7450
ich bin
epoch 10: loss 2.6856
ich bin
epoch 11: loss 2.6367
ich bin
epoch 12: loss 2.5925
ich bin


In [63]:
# h_0 -> Zufällig
#[145,
# 10,
# 170,
# 0,
# 0,
# 0,...,
# 
# ]

# h_1 = Wh * h_0 + Wx*145 
# h_2 = .... 


# RNN1: h_t = Wh*h_{t-1} + Wx*x_t 
# RNN2: h_t = Wh*h_{t-1} + Wx*x_t  



# RNN2(RNN1) 