In [None]:
# Toy translation pair used to illustrate the training scheme sketched below.
source = "I am a boy."
target = "Ich bin ein Junge."

# ----------
# Encoder("I am a boy.") -> h   (h is the encoder output)
# Next-token prediction: at every step the decoder receives h plus all target
# tokens seen so far and predicts the following token.
# Step 1: Decoder(h, [<bos>])                               -> "ich"
# Step 2: Decoder(h, [<bos>, "ich"])                        -> "bin"
# Step 3: Decoder(h, [<bos>, "ich", "bin"])                 -> "ein"
# Step 4: Decoder(h, [<bos>, "ich", "bin", "ein"])          -> "Junge"
# Step 5: Decoder(h, [<bos>, "ich", "bin", "ein", "Junge"]) -> <eos>

# X = (h, [<bos>, "ich", "bin", "ein", "Junge"]) -> decoder input
# y = ["ich", "bin", "ein", "Junge", <eos>]      -> targets/labels for the loss (X's token list shifted by one)

In [1]:
import re, json, torch, torch.nn as nn
from torch.utils.data import DataLoader

path = "./deu.txt"

# Read the file inside a context manager so the handle is closed as soon as
# the text is in memory (a bare open(...).read() leaves it to the garbage collector).
with open(path, encoding="utf-8") as f:
    lines = f.read().strip().split("\n")
lines = lines[:20000]  # keep only the first 20,000 lines

# Each line is tab-separated; the first two fields are the source sentence and
# its translation. Lines without a second field are skipped, because they
# would otherwise make the zip(*pairs) unpacking below fail.
pairs = [fields[:2] for fields in (ln.split("\t") for ln in lines) if len(fields) >= 2]
src_texts, tgt_texts = zip(*pairs)

In [None]:
# Ids reserved for the special tokens; build_vocab places them at these positions.
PAD = 0  # padding
UNK = 1  # unknown (token missing from the vocabulary)
BOS = 2  # beginning of sequence
EOS = 3  # end of sequence

# Maximum vocabulary size, counting the four special tokens above.
VOCAB_SIZE = 20004

def tokenize(s):
    """Lower-case ``s`` and return its word tokens as a list of strings.

    A token is a maximal run of word characters; punctuation and whitespace
    only act as separators and never appear in the result.
    """
    lowered = s.lower()
    return re.findall(r"\b\w+\b", lowered)
def build_vocab(texts, max_tokens=VOCAB_SIZE):
    """Build a frequency-ranked vocabulary from an iterable of sentences.

    Args:
        texts: iterable of strings; each one is split with ``tokenize``.
        max_tokens: upper bound on the vocabulary size, including the four
            special tokens that always occupy ids 0-3.

    Returns:
        ``(stoi, itos)`` where ``itos`` is the list of tokens ordered by id
        and ``stoi`` is the inverse token -> id dict.
    """
    from collections import Counter

    # Collect every token in corpus order, then count them in one go.
    all_tokens = []
    for text in texts:
        all_tokens.extend(tokenize(text))
    counts = Counter(all_tokens)

    specials = ["<pad>", "<unk>", "<bos>", "<eos>"]
    # Four slots are already taken by the special tokens.
    frequent_words = [word for word, _count in counts.most_common(max_tokens - 4)]
    itos = specials + frequent_words

    stoi = {}
    for idx, word in enumerate(itos):
        stoi[word] = idx
    return stoi, itos
# Build two independent vocabularies, one from the source sentences and one
# from the target sentences. Each call returns a token -> id dict and the
# matching id -> token list.
src_texts_vocab, src_itos = build_vocab(src_texts)
tgt_texts_vocab, tgt_itos = build_vocab(tgt_texts)



def vectorize(text, stoi, max_len, add_bos_eos=False):
    """Convert a sentence into a fixed-length list of token ids.

    Args:
        text: the raw sentence; it is split with ``tokenize``.
        stoi: token -> id dict; tokens missing from it map to ``UNK``.
        max_len: length of the returned list.
        add_bos_eos: if true, wrap the sentence in ``BOS`` ... ``EOS``.

    Returns:
        A list of ``max_len`` ints, right-padded with ``PAD`` when the
        sentence is shorter than ``max_len``.
    """
    ids = [stoi.get(tok, UNK) for tok in tokenize(text)]
    if add_bos_eos:
        # Truncate the words *before* adding the markers so that a long
        # sentence still ends in EOS; cutting afterwards would drop it.
        ids = [BOS] + ids[:max(max_len - 2, 0)] + [EOS]
    # Still needed for the plain case, and for max_len < 2 with markers.
    ids = ids[:max_len]
    # Multiplying by a non-positive count yields [], so no length guard is needed.
    ids += [PAD] * (max_len - len(ids))
    return ids

#vectorize(src_texts[60], src_texts_vocab, 30)
#src_texts[60]


# Fixed sequence lengths (in tokens) that sentences are padded/truncated to.
max_src = 30
max_tgt = 30

def collate_fn(batch):
    """Turn a list of (source, target) sentence pairs into id tensors.

    Returns:
        ``(src, tgt_in, tgt_out)``: ``src`` holds the source ids padded to
        ``max_src``; ``tgt_in`` and ``tgt_out`` are the BOS/EOS-wrapped target
        ids without their last and first column respectively, i.e. the
        decoder input and the labels shifted by one position.
    """
    src_sentences, tgt_sentences = zip(*batch)

    src_ids = [vectorize(sentence, src_texts_vocab, max_src) for sentence in src_sentences]
    tgt_ids = [
        vectorize(sentence, tgt_texts_vocab, max_tgt, add_bos_eos=True)
        for sentence in tgt_sentences
    ]

    src = torch.tensor(src_ids)
    tgt = torch.tensor(tgt_ids)

    decoder_input = tgt[:, :-1]  # everything except the last position
    labels = tgt[:, 1:]          # everything except the first position
    return src, decoder_input, labels

# Pair every source sentence with its target sentence and serve the pairs in
# shuffled mini-batches; collate_fn converts each batch of strings to tensors.
dataset = [(src_text, tgt_text) for src_text, tgt_text in zip(src_texts, tgt_texts)]
loader = DataLoader(
    dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_fn,
)


In [19]:
sentence = "this is sample sentence for embedding"
sentence2 = "this is sentence embedding"


# Toy vocabulary: every distinct word of `sentence` gets an id in alphabetical order.
# The words are de-duplicated before enumerating so the ids stay contiguous
# (0 .. len(dc) - 1) even when a word repeats; otherwise a repeated word would
# leave a gap and len(dc) would no longer bound the largest id.
dc = {s: i for i, s in enumerate(sorted(set(sentence.replace(',', '').split())))}
dc

{'embedding': 0, 'for': 1, 'is': 2, 'sample': 3, 'sentence': 4, 'this': 5}

In [24]:
# One row of the embedding table per vocabulary entry, 3 values per row.
vocab_size = len(dc)
emb = torch.nn.Embedding(vocab_size, 3)
# Display the initial weight matrix. detach() gives a view without gradient
# tracking and is the supported replacement for the legacy `.data` attribute.
emb.weight.detach()

tensor([[-1.5272,  0.4394,  0.5539],
        [ 0.1789, -1.1513, -2.5549],
        [-0.6485, -0.6550, -0.7441],
        [ 1.4792,  0.7552,  0.6131],
        [ 2.0397,  0.1831, -0.6793],
        [-0.9258, -0.2079,  1.1493]])