In [1]:
from tokenizers import Tokenizer
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm.notebook import trange, tqdm

from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import WordPieceTrainer

from pathlib import Path

# Paths

In [2]:
DATA_DIR = Path("data")
TOK_LOC = DATA_DIR / "tokenizer-goethe_schiller_raimund.json"


# Params

In [7]:
VOCAB_SIZE = 64**2
BLOCK_SIZE = 20
BATCH_SIZE = 512  # how many independent sequences will we process in parallel?
MAX_ITERS = 10000
EVAL_INTERVAL = 200
LEARNING_RATE = 3e-4
EVAL_ITERS = 10
N_EMBD = 1024
N_HEAD = 6
N_LAYER = 6
DROPOUT = 0.2
SPECIAL_TOKENS = ["[UNK]", "[SOS]", "[EOS]"]
torch.manual_seed(1337)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Tokenizer

In [4]:
tokenizer = Tokenizer(WordPiece())
tokenizer.pre_tokenizer = Whitespace()
trainer = WordPieceTrainer(vocab_size=VOCAB_SIZE, special_tokens=SPECIAL_TOKENS)

tokenizer.train([str(DATA_DIR / "train.txt")], trainer)
tokenizer.save(str(TOK_LOC))
tokenizer = Tokenizer.from_file(str(TOK_LOC))

with open(DATA_DIR / "train.txt", "r") as f:
    train_enc = tokenizer.encode(f.read())
with open(DATA_DIR / "val.txt", "r") as f:
    val_enc = tokenizer.encode(f.read())







In [6]:
len(val_enc.ids)

201021

In [5]:
def build_batch(split):
    data = train_enc.ids if split == 'train' else val_enc.ids
    ix = torch.randint(len(data) - BLOCK_SIZE, (BATCH_SIZE,))
    #torch.tensor uses ints and torch.Tensor uses float also torch.Tensor is annoying
    x = torch.stack([torch.tensor(data[i:i+BLOCK_SIZE]) for i in ix])
    y = torch.stack([torch.tensor(data[i+1:i+BLOCK_SIZE+1]) for i in ix])
    X, Y = x.to(device), y.to(device)
    return X, Y

@torch.no_grad()
def estimate_loss():
    out = {}
    m.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(EVAL_ITERS)
        for k in range(EVAL_ITERS):
            X, Y = build_batch(split)
            logits, loss = m(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out


     

class Head(nn.Module):
    
    def __init__(self, head_size):
        super().__init__()
        self.key   = nn.Linear(N_EMBD, head_size, bias=False)
        self.query = nn.Linear(N_EMBD, head_size, bias=False)
        self.value = nn.Linear(N_EMBD, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(BLOCK_SIZE, BLOCK_SIZE)))
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        out = wei @ v
        return out


class MultiHeadAttention(nn.Module):
    
    def __init__(self, num_heads, head_size, n_embd):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        
    def forward(self, x):
        out = toch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)

class FeedForward(nn.Module):
    
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Liner(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(4*n_embd, n_embd)
        )

    def forward(self, x):
        return self.net(x)
    
def Block(nn.Module):
    
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self. sa = MultiHeadAttention(n_head, head_size, n_embd)
        self.ffwd = FeedForward(n_embd)
        
    def forward(self, x):
        x = x + self.sa(x)
        x = x + self.ffwd(x)
        return x
    
class Transformer(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(VOCAB_SIZE, N_EMBD)
        self.positon_embedding_table = nn.Embedding(BLOCK_SIZE, N_EMBD)
        self.blocks = nn.Sequential(
            Block(N_EMBD, n_head = 4),
            Block(N_EMBD, n_head = 4),
            Block(N_EMBD, n_head = 4),
        )
        self.lm_head = nn.Linear(N_EMBD, VOCAB_SIZE).to(device)
        
    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.positon_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)
        
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            
            
        
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:,-BLOCK_SIZE:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx   
        


In [6]:

m = Transformer().to(device)

In [20]:
xb, yb = build_batch("train")
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)
idx = torch.zeros((1,1), dtype=torch.long).to(device)
idx = m.generate(idx, max_new_tokens=100)[0].tolist()
print(tokenizer.decode(idx))

torch.Size([10240, 4096])
tensor(4.4051, device='cuda:0', grad_fn=<NllLossBackward0>)
##ischen Dichter machen will, auf Höhen richtet sich gesellts acht, die Geschwindigkeit versäumteinett wieder und versetzt. Soviel begegnen ist, herbst einem Manne unserer erzählen. Die Welt hier nur reduf der Handlung, ausgetrieben, und vernahm der Herr, das fühllos, ich werde da auf mich darüber zu rätseln werden. Aufwanderung irgendeine Antwort mit, es geht an mir meinen Namen angekommenden, und K


In [19]:
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

t = trange(MAX_ITERS, desc='Bar desc', leave=True)

for steps in t:

    if steps % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        t.set_description(f"Losses: train = {losses['train']:.4f} | test = {losses['val']:.4f}", refresh= True)
        
    xb, yb = build_batch('train')
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    t.refresh()

Bar desc:   0%|          | 0/10000 [00:00<?, ?it/s]

In [9]:
for i in trange(10):
    print("ass"[i])

  0%|          | 0/10 [00:00<?, ?it/s]

a
s
s


IndexError: string index out of range