In [1]:
from tokenizers import Tokenizer
from paths import DATA_DIR, TOK_LOC
from hyperparameters import (BATCH_SIZE, BLOCK_SIZE, DROPOUT, EVAL_INTERVAL,
                             EVAL_ITERS, LEARNING_RATE, MAX_ITERS, N_EMBD,
                             N_HEAD, N_LAYER, VOCAB_SIZE)
import torch
import torch.nn as nn
from torch.nn import functional as F






In [2]:
# Seed PyTorch's global RNG so batch sampling and token sampling repeat
# across fresh kernel runs.
torch.manual_seed(1337)

# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [3]:
# Load the tokenizer from disk, then encode each split's full text in one call.
tokenizer = Tokenizer.from_file(str(TOK_LOC))

# NOTE(review): both files are opened with the platform default text encoding;
# confirm that matches how train.txt / val.txt were written.
train_path = DATA_DIR / "train.txt"
with open(train_path, "r") as train_file:
    train_text = train_file.read()
train_enc = tokenizer.encode(train_text)

val_path = DATA_DIR / "val.txt"
with open(val_path, "r") as val_file:
    val_text = val_file.read()
val_enc = tokenizer.encode(val_text)

In [4]:
def build_batch(split):
    """Sample a random batch of input/target token windows.

    Args:
        split: 'train' selects the training encoding; any other value selects
            the validation encoding.

    Returns:
        A pair ``(X, Y)`` of integer tensors on ``device``, each of shape
        (BATCH_SIZE, BLOCK_SIZE). ``Y`` holds the same windows as ``X``
        shifted one token to the right, i.e. the next-token targets.
    """
    if split == 'train':
        token_ids = train_enc.ids
    else:
        token_ids = val_enc.ids

    # Exclusive upper bound keeps the shifted target window inside the data.
    starts = torch.randint(len(token_ids) - BLOCK_SIZE, (BATCH_SIZE,))

    inputs, targets = [], []
    for start in starts:
        # torch.tensor (lowercase) keeps the ids as integers; torch.Tensor
        # would turn them into floats.
        inputs.append(torch.tensor(token_ids[start:start + BLOCK_SIZE]))
        targets.append(torch.tensor(token_ids[start + 1:start + BLOCK_SIZE + 1]))

    return torch.stack(inputs).to(device), torch.stack(targets).to(device)


In [5]:
class BigramLanguageModel(nn.Module):
    """Bigram model: each token id looks up a row of next-token logits.

    The only parameter is a VOCAB_SIZE x VOCAB_SIZE embedding table, so the
    prediction for a position depends solely on the token at that position.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(VOCAB_SIZE, VOCAB_SIZE)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: Integer tensor of token ids.
            targets: Optional integer tensor of target ids with the same
                number of elements as ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` keeps the
            per-position layout and ``loss`` is None. With targets, ``logits``
            is flattened to (B*T, C) and ``loss`` is the cross-entropy
            against the flattened targets.
        """
        logits = self.token_embedding_table(idx)
        if targets is None:
            return logits, None

        # cross_entropy wants one row of class scores per prediction, so the
        # batch and time axes are merged before computing the loss.
        n_batch, n_time, n_classes = logits.shape
        flat_logits = logits.view(n_batch * n_time, n_classes)
        flat_targets = targets.view(n_batch * n_time)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Each step runs the model on the sequence so far, turns the logits of
        the last position into probabilities, samples one token per row, and
        appends it along dim 1. Returns the extended tensor.
        """
        for _step in range(max_new_tokens):
            scores, _unused_loss = self(idx)
            last_probs = F.softmax(scores[:, -1, :], dim=-1)
            sampled = torch.multinomial(last_probs, num_samples=1)
            idx = torch.cat((idx, sampled), dim=1)
        return idx
        
        


In [6]:
# Smoke-test the untrained model: one forward pass with a loss, then a short
# sample decoded back to text.
xb, yb = build_batch("train")

# build_batch returns tensors on `device`, so the model must live there too;
# otherwise the embedding lookup fails with a device mismatch on CUDA machines.
m = BigramLanguageModel().to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Start generation from a single token with id 0, created on the model's device
# so torch.cat inside generate() joins tensors from the same device.
idx = torch.zeros((1, 1), dtype=torch.long, device=device)
idx = m.generate(idx, max_new_tokens=100)[0].tolist()
print(tokenizer.decode(idx))

torch.Size([320, 512])
tensor(6.7543, grad_fn=<NllLossBackward0>)
##mm ##l ##a  ##Q ## ##7 sch man ##ì ##hn ##r und ##: ##pf û ##as ##au Sch ##ei ##ort C X ##R ##der ##aß ver ##q ##' ##ill ##> ##ck ##á ##-- wer ##re ##M ##es aber ##keit ##uß ##ô A ##al ##à dies é ##in ##ber ##ind ## ##o ##und ##as mir ##E ##el ##em , | ##v ##ot im bei Ich v ##mm ##ig ##z 0 ##zu ~ 1 hat h ##b ##bst ##us m ##hn ù ##us ##rei D ##- all ##| er and ##gt ##u , ##ir C à Q ##Y kann einen


In [7]:
# AdamW over every parameter of the bigram model, with a step size of 1e-3.
# NOTE(review): LEARNING_RATE is imported from hyperparameters but this cell
# hard-codes the learning rate; confirm which value is intended.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [8]:
# Run 100 optimisation steps on random training batches, printing the loss of
# every step.
for steps in range(100):
    # Drop gradients left over from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a freshly sampled batch.
    xb, yb = build_batch('train')
    logits, loss = m(xb, yb)

    # Backward pass and parameter update.
    loss.backward()
    optimizer.step()

    print(loss.item())

6.762286186218262
6.834046840667725
6.696063041687012
6.807833194732666
6.811186790466309
6.784790992736816
6.748086452484131
6.733273506164551
6.82156229019165
6.741987705230713
6.8552656173706055
6.767934322357178
6.720228672027588
6.750064849853516
6.675497531890869
6.74094295501709
6.753795623779297
6.741440773010254
6.742257118225098
6.7121992111206055
6.854103088378906
6.766846656799316
6.79730749130249
6.7569732666015625
6.8162946701049805
6.874232292175293
6.691197872161865
6.768557548522949
6.698671817779541
6.85129451751709
6.677887916564941
6.760354518890381
6.714031219482422
6.755202293395996
6.784643650054932
6.720249176025391
6.711251258850098
6.615055084228516
6.733750820159912
6.762115478515625
6.772369384765625
6.661929130554199
6.7191009521484375
6.640076637268066
6.604544162750244
6.8138604164123535
6.757739067077637
6.734800815582275
6.757530212402344
6.780100345611572
6.690114498138428
6.741968631744385
6.708401679992676
6.706373691558838
6.770060062408447
6.695371

In [9]:
@torch.no_grad()
def estimate_loss(model=None, eval_iters=None):
    """Average the loss over several random batches for each data split.

    Gradient tracking is disabled by the decorator. The model is switched to
    eval mode while measuring and put back into whatever mode it was in before
    the call, even if a batch raises.

    Args:
        model: Module called as ``model(X, Y)`` that returns ``(logits, loss)``.
            Defaults to the notebook-level model ``m``.
        eval_iters: Number of batches to average per split. Defaults to the
            ``EVAL_ITERS`` hyperparameter.

    Returns:
        Dict with keys 'train' and 'val', each mapping to a 0-dim tensor
        holding the mean loss over ``eval_iters`` batches from ``build_batch``.
    """
    # Resolve defaults at call time so the notebook's own names are used
    # (`m`, `EVAL_ITERS`, `build_batch`).
    if model is None:
        model = m
    if eval_iters is None:
        eval_iters = EVAL_ITERS

    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = build_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the caller's train/eval mode rather than forcing train mode.
        model.train(was_training)
    return out