In [2]:


# Toy translation pair used to illustrate the encoder-decoder idea.
source, target = "I am a boy.", "Ich bin ein Junge."

# ----------
# The encoder reads the whole source sentence once:
#   Encoder["I am a boy."] -> hidden state (context vector)
#
# The decoder then does next-token prediction, one token per step, always
# conditioned on the hidden state plus everything emitted so far:
#   step 1: Decoder[hidden state, ("<bos>",)]                                   -> "ich"
#   step 2: Decoder[hidden state, ("<bos>", "ich")]                             -> "bin"
#   step 3: Decoder[hidden state, ("<bos>", "ich", "bin")]                      -> "ein"
#   step 4: Decoder[hidden state, ("<bos>", "ich", "bin", "ein")]               -> "Junge"
#   step 5: Decoder[hidden state, ("<bos>", "ich", "bin", "ein", "Junge")]      -> "."
#   step 6: Decoder[hidden state, ("<bos>", "ich", "bin", "ein", "Junge", ".")] -> "<eos>"
#
# For training, each sentence yields one (input, label) pair in which the
# labels are the decoder input shifted left by one position:
#   X = (hidden state, ("<bos>", "ich", "bin", "ein", "Junge", "."))
#   y = ("ich", "bin", "ein", "Junge", ".", "<eos>")
#
# Note: this is a conceptual picture. The `tokenize` helper defined later in
# this notebook lower-cases text and keeps only word characters, so "." is
# not an actual token in the pipeline below.


In [1]:
import re, json, torch, torch.nn as nn
from torch.utils.data import DataLoader

path = "./deu.txt"

# Read the corpus through a context manager so the file handle is closed as
# soon as the text is in memory (the bare open(...).read() left it open until
# garbage collection).
with open(path, encoding="utf-8") as corpus_file:
    lines = corpus_file.read().strip().split("\n")

# Keep only the first 20000 lines to bound vocabulary size and training time.
lines = lines[:20000]

# Each line is tab-separated; only the first two fields are used. Presumably
# these are the English source and its German translation -- confirm against
# the actual layout of deu.txt.
pairs = [ln.split("\t")[:2] for ln in lines]

# Transpose the list of pairs into two parallel tuples of sentences.
src_texts, tgt_texts = zip(*pairs)

In [None]:
# Reserved token ids. They line up with the positions of the special tokens
# ("<pad>", "<unk>", "<bos>", "<eos>") at the front of the `itos` list that
# `build_vocab` produces.
PAD, UNK, BOS, EOS = range(4)

# Default cap on vocabulary size, the 4 reserved tokens included
# (i.e. at most 20000 regular words).
VOCAB_SIZE = 20_004

def tokenize(s):
    """Lower-case `s` and return its word tokens as a list of strings.

    A token is a maximal run of word characters (letters, digits,
    underscore). Punctuation and whitespace only act as separators and are
    dropped, so "You're" becomes ["you", "re"].
    """
    lowered = s.lower()
    return re.findall(r"\b\w+\b", lowered)
def build_vocab(texts, max_tokens=VOCAB_SIZE):
    """Build token<->id lookup tables from an iterable of sentences.

    Args:
        texts: Iterable of raw sentences; each is split with `tokenize`.
        max_tokens: Upper bound on the vocabulary size, counting the four
            reserved tokens.

    Returns:
        A `(stoi, itos)` pair: `itos` is the list of tokens indexed by id
        (reserved tokens first, then words by descending frequency) and
        `stoi` is the inverse mapping from token to id.
    """
    from collections import Counter

    counts = Counter()
    for text in texts:
        counts.update(tokenize(text))

    # The reserved tokens occupy ids 0..3, ahead of any corpus word.
    specials = ["<pad>", "<unk>", "<bos>", "<eos>"]
    frequent_words = [word for word, _ in counts.most_common(max_tokens - len(specials))]
    itos = specials + frequent_words

    stoi = {word: idx for idx, word in enumerate(itos)}
    return stoi, itos
# Separate vocabularies for the source and the target side. The `*_vocab`
# dicts map token -> id; the `*_itos` lists map id -> token.
src_texts_vocab, src_itos = build_vocab(texts=src_texts)
tgt_texts_vocab, tgt_itos = build_vocab(texts=tgt_texts)


def vectorize(text, stoi, max_len, add_bos_eos=False):
    """Convert a sentence into a fixed-length list of token ids.

    Args:
        text: Raw sentence; it is split into tokens with `tokenize`.
        stoi: Mapping from token string to integer id. Tokens missing from
            the mapping are encoded as UNK.
        max_len: Length of the returned list.
        add_bos_eos: When True, the ids are wrapped as BOS ... EOS.

    Returns:
        A list of `max_len` ints: too-long input is truncated, too-short
        input is right-padded with PAD.
    """
    ids = [stoi.get(tok, UNK) for tok in tokenize(text)]
    if add_bos_eos:
        # Truncate the content *before* wrapping it. Truncating afterwards
        # chopped off EOS for any sentence longer than max_len - 2 tokens, so
        # long targets never taught the decoder to stop.
        room = max(max_len - 2, 0)
        ids = [BOS] + ids[:room] + [EOS]
    ids = ids[:max_len]
    # A non-positive multiplier yields an empty list, so this is a no-op when
    # the sequence already fills max_len.
    ids += [PAD] * (max_len - len(ids))
    return ids

# Fixed sequence lengths (in tokens) that source and target sentences are
# truncated or padded to by `vectorize`.
max_src = max_tgt = 30

def collate_fn(batch):
    """Turn a batch of (source, target) sentence pairs into id tensors.

    Args:
        batch: Sequence of `(src_sentence, tgt_sentence)` string pairs.

    Returns:
        `(src, tgt_in, tgt_out)` where `src` holds the source ids, `tgt_in`
        is the BOS/EOS-wrapped target without its last position (decoder
        input) and `tgt_out` is the same target without its first position
        (the labels, i.e. the input shifted left by one).
    """
    src_sentences, tgt_sentences = zip(*batch)

    src_rows = [vectorize(sentence, src_texts_vocab, max_src) for sentence in src_sentences]
    tgt_rows = [
        vectorize(sentence, tgt_texts_vocab, max_tgt, add_bos_eos=True)
        for sentence in tgt_sentences
    ]

    src = torch.tensor(src_rows)
    tgt = torch.tensor(tgt_rows)

    decoder_input = tgt[:, :-1]
    decoder_labels = tgt[:, 1:]
    return src, decoder_input, decoder_labels

# A plain list of (source, target) string pairs is enough for DataLoader;
# tokenisation and padding happen per batch inside `collate_fn`.
dataset = [(src_text, tgt_text) for src_text, tgt_text in zip(src_texts, tgt_texts)]
loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)

In [21]:
# Display one raw source sentence (index 13000) from the loaded corpus.
src_texts[13000]

"You're in luck."

In [None]:
# Display the same sentence as a list of token ids, padded with PAD up to
# max_src entries.
vectorize(src_texts[13000], src_texts_vocab, max_src)

In [22]:
sentence = "this is sample sentence for embedding"

# Toy vocabulary: strip commas, split on whitespace, sort the words
# alphabetically and number them from 0.
dc = {
    word: index
    for index, word in enumerate(sorted(sentence.replace(',', '').split()))
}
dc

{'embedding': 0, 'for': 1, 'is': 2, 'sample': 3, 'sentence': 4, 'this': 5}

In [25]:
# One 3-dimensional embedding vector per word of the toy vocabulary `dc`.
# The weights are randomly initialised, so the displayed values differ from
# run to run unless a seed is set.
vocab_size_tmp = len(dc)
emb = torch.nn.Embedding(num_embeddings=vocab_size_tmp, embedding_dim=3)
emb.weight.data

tensor([[ 1.2410, -0.7009, -1.0035],
        [ 0.0272, -0.7268, -0.5822],
        [ 0.6845, -0.2334,  0.2144],
        [ 1.4403,  1.0988, -1.7053],
        [ 0.9302,  1.7344, -0.6735],
        [-0.1064, -2.0549, -0.6539]])

In [30]:
# Model sizes shared by Encoder and Decoder: token-embedding width and GRU
# hidden-state width. These are toy sizes; per the original author's note,
# embedding sizes in practice start from about 768.
emb_dim, hid_dim = 128, 256

class Encoder(nn.Module):
    """GRU encoder that summarises a source token sequence as one hidden state."""

    def __init__(self, vocab_size):
        """Create the embedding table and the GRU.

        Args:
            vocab_size: Number of rows in the embedding table (source
                vocabulary size).
        """
        super().__init__()
        # Row PAD of the table is the padding embedding.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=PAD,
        )
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)

    def forward(self, x):
        """Encode token ids and return the GRU's final hidden state.

        Args:
            x: Integer token ids -- assumed (batch, seq_len); TODO confirm
                against callers.

        Returns:
            The final hidden state returned by the GRU; the per-step outputs
            are discarded.
        """
        embedded = self.embedding(x)
        # No sequence packing is applied, so the GRU steps over every
        # position of `x`, padded ones included.
        _outputs, final_hidden = self.rnn(embedded)
        return final_hidden

class Decoder(nn.Module):
    """GRU decoder that scores the next target token at every input position."""

    def __init__(self, vocab_size):
        """Create the embedding table, the GRU and the output projection.

        Args:
            vocab_size: Target vocabulary size; used both for the embedding
                table and for the width of the output logits.
        """
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=emb_dim,
            padding_idx=PAD,
        )
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)
        # Classification head: maps each GRU output to one score per token.
        self.fc = nn.Linear(in_features=hid_dim, out_features=vocab_size)

    def forward(self, x, h):
        """Return next-token logits for every position of `x`.

        Args:
            x: Integer ids of the target tokens produced so far -- assumed
                (batch, seq_len); TODO confirm against callers.
            h: Initial GRU hidden state, i.e. the encoder's output.

        Returns:
            Unnormalised scores over the target vocabulary, one vector per
            input position.
        """
        embedded = self.embedding(x)
        # Start the GRU from the encoder's hidden state; the decoder's own
        # final state is not needed.
        rnn_out, _final_hidden = self.rnn(embedded, h)
        logits = self.fc(rnn_out)
        return logits

class Seq2Seq(nn.Module):
    """Glue module: run the encoder, then feed its result to the decoder."""

    def __init__(self, enc, dec):
        """Store the two sub-modules.

        Args:
            enc: Module called as `enc(src)` to produce a hidden state.
            dec: Module called as `dec(tgt_in_dec, hidden)` to produce logits.
        """
        super().__init__()
        self.enc = enc
        self.dec = dec

    def forward(self, src, tgt_in_dec):
        """Compute decoder logits for a batch.

        Args:
            src: Source-side input, passed to the encoder (English sentences
                in this notebook).
            tgt_in_dec: Decoder input, i.e. the already translated part of
                the target (German) sentences.

        Returns:
            Whatever the decoder returns for `(tgt_in_dec, encoder_output)`.
        """
        return self.dec(tgt_in_dec, self.enc(src))

# Pick the best available backend instead of hard-coding "mps", so the
# notebook also runs on machines without Apple-silicon GPU support.
if torch.cuda.is_available():
    device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # The getattr guard keeps this working on PyTorch builds that predate
    # the MPS backend.
    device = "mps"
else:
    device = "cpu"

# Full model: one vocabulary-sized encoder/decoder per language, moved to the
# selected device.
model = Seq2Seq(
    Encoder(len(src_texts_vocab)),
    Decoder(len(tgt_texts_vocab)),
).to(device)


In [31]:
# Token-level cross entropy; positions whose label is PAD do not contribute
# to the loss.
crit = nn.CrossEntropyLoss(ignore_index=PAD)
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Number of full passes over `loader`.
epochs = 20

@torch.no_grad()
def translate(prompt, max_len=max_tgt):
    """Greedily decode a translation of `prompt` with the global `model`.

    Args:
        prompt: Source sentence as a raw string.
        max_len: Maximum number of decoding steps.

    Returns:
        The generated target tokens joined by single spaces. Decoding stops
        early when the most likely next token is EOS or PAD; that token is
        not included in the result.

    Side effects:
        Puts `model` into eval mode and leaves it there.
    """
    model.eval()
    src_ids = vectorize(prompt, src_texts_vocab, max_src)
    h = model.enc(torch.tensor([src_ids], device=device))

    # Decoder prefix, grown by one token per step. The decoder is re-run on
    # the whole prefix each time and only the last position's logits are used.
    generated = [BOS]
    for _ in range(max_len):
        ys = torch.tensor([generated], device=device)
        logits = model.dec(ys, h)
        next_id = logits[0, -1].argmax().item()
        if next_id == EOS or next_id == PAD:
            break
        generated.append(next_id)

    # Skip the leading BOS when converting ids back to text.
    return " ".join(tgt_itos[token_id] for token_id in generated[1:])

for epoch in range(epochs):
    # translate() below switches the model to eval mode, so training mode has
    # to be re-enabled at the start of every epoch.
    model.train()
    running_loss = 0.0
    for src, tgt_in, tgt_out in loader:
        src = src.to(device)
        tgt_in = tgt_in.to(device)
        tgt_out = tgt_out.to(device)

        optimizer.zero_grad()
        logits = model(src, tgt_in)
        # Collapse all leading dimensions so that every target position is
        # one classification example for the loss.
        loss = crit(
            logits.reshape(-1, logits.shape[-1]),
            tgt_out.reshape(-1),
        )
        loss.backward()
        # Gradient clipping (max norm 1.0) guards against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        running_loss += loss.item()

    # Mean batch loss for the epoch, then a fixed sample translation as a
    # quick qualitative check.
    print(f"epoch {epoch+1}: loss {running_loss/len(loader):.4f}")
    print(translate("I will do my best."))



epoch 1: loss 4.4610
ich bin ein
epoch 2: loss 3.3501
ich habe das gesagt
epoch 3: loss 2.7508
ich habe einen hund
epoch 4: loss 2.3282
ich werde es versuchen
epoch 5: loss 2.0096
ich werde mich drum kümmern
epoch 6: loss 1.7556
ich werde mein bestes holen
epoch 7: loss 1.5233
ich werde mein bestes tun


KeyboardInterrupt: 