In [1]:
with open('kafka.txt', 'r', encoding='utf-8') as file:

    data = file.read().replace('\n', ' ')

In [2]:
chars = sorted(list(set(data)))
vocab_size = len(chars)

stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda x: [stoi[ch] for ch in x]
decode = lambda x: ''.join([itos[i] for i in x])

In [3]:
import torch

data = torch.tensor(encode(data))

In [4]:
train_data = data[:int(len(data)*0.9)]
val_data = data[int(len(data)*0.9):]

In [5]:
torch.manual_seed(42)
block_size = 8
batch_size = 4

def get_batch(source):
    data = train_data if source == 'train' else val_data
    idx = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in idx])
    y = torch.stack([data[i+1:i+block_size+1] for i in idx])
    return x, y

xb, yb = get_batch('train')



In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F

torch.manual_seed(42)

class bigramLanguageModel(nn.Module):

    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        logits = self.token_embedding_table(idx)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1) 
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1) 
        return idx

model = bigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(logits.shape, loss)

idx = torch.zeros((1,1), dtype=torch.long)
print(decode(model.generate(idx,max_new_tokens=100)[0].tolist()))

torch.Size([32, 81]) tensor(5.1227, grad_fn=<NllLossBackward0>)
 é̈ï'uyNrço?”Dt"'DHryxuzïïavGof—,pAf1. G‘k—Qh\hgn‘"W!vT7BA9H«*’x(̈hV*LRx.B".MO\ü;WfNT.BI?kLy\FHPQnfsT


In [10]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)


In [16]:
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    print(loss.item())

4.233445644378662
4.197903633117676
4.123136043548584
4.231251239776611
4.184401035308838
4.238352298736572
4.261487007141113
4.251667022705078
4.231711387634277
4.194826602935791
4.169344425201416
4.327320098876953
4.202890872955322
4.289733409881592
4.288973331451416
4.135246276855469
4.238687515258789
4.274935722351074
4.223235130310059
4.242545127868652
4.232923984527588
4.179357528686523
4.1916399002075195
4.2315144538879395
4.161498069763184
4.114600658416748
4.216714859008789
4.200356483459473
4.279462814331055
4.201079368591309
4.270895004272461
4.220630168914795
4.123661518096924
4.211489677429199
4.083621025085449
4.193958759307861
4.163402557373047
4.185471057891846
4.171281814575195
4.079345703125
4.228950500488281
4.1802215576171875
4.217527389526367
4.076112747192383
4.098449230194092
4.089300632476807
4.1701436042785645
4.194666862487793
4.15111780166626
4.039609909057617
4.0061211585998535
4.040195465087891
4.212000370025635
4.171072959899902
3.9821105003356934
4.306166

In [18]:
print(decode(model.generate(idx,max_new_tokens=500)[0].tolist()))

 t wanthedvinounecly It turersousout yag, chmeasoi ant h honsathid m. ced albentha athtak I So o y w, wiser, min’ K. ct ttasing herd hef alds mithenout h hid mule an’sas ins p beickichis ictrern herllly aneeld wedevee y t fo he t as y abeto-out t Mait. war tlay ined Mï’s(a, p surent g a be on rwhont one ge, d lstillecunelomplintod " iü̈Zveood ad alwecoougnd jur—(Uut bl o an’s toxxt. cqure“whed; bach " hith, Thalpasendaitwhins able Ellw'skat ts oro " he t pa mppmenga sen f s s bathesucto puzindve-


In [21]:
torch.manual_seed(42)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)

k = key(x) 
q = query(x)
wei = q @ k.transpose(-2,-1)

tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril==0, float('-inf'))
wei = F.softmax(wei, dim=-1)   
out = wei @ x



tensor([[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.9052e-01, 8.0948e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [3.7418e-01, 5.6820e-02, 5.6900e-01, 0.0000e+00, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [1.2878e-01, 3.3800e-01, 1.3765e-01, 3.9557e-01, 0.0000e+00,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [4.3106e-01, 8.4134e-02, 5.8193e-02, 3.0487e-01, 1.2175e-01,
          0.0000e+00, 0.0000e+00, 0.0000e+00],
         [5.3737e-02, 3.2051e-01, 6.9361e-02, 2.4035e-01, 2.5680e-01,
          5.9238e-02, 0.0000e+00, 0.0000e+00],
         [3.3957e-01, 1.4859e-02, 5.1650e-01, 1.7974e-02, 6.5784e-02,
          8.0182e-03, 3.7289e-02, 0.0000e+00],
         [1.6459e-02, 3.7486e-02, 1.4404e-02, 1.1200e-01, 3.3159e-02,
          4.0686e-01, 3.1364e-01, 6.5997e-02]],

        [[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.00

In [1]:
import torch
print(torch.cuda.is_available())

True


In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

batch_size = 64 
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

torch.manual_seed(1337)

with open('kafka.txt', 'r', encoding='utf-8') as f:
    text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] 
decode = lambda l: ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)  
        q = self.query(x) 
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x) 
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x) 
        x = self.ln_f(x)
        logits = self.lm_head(x) 

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1) 
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = GPTLanguageModel()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))

10.802002 M parameters
step 0: train loss 4.4960, val loss 4.4923
step 500: train loss 1.6015, val loss 1.7801
step 1000: train loss 1.1972, val loss 1.4341
step 1500: train loss 1.0646, val loss 1.3502
step 2000: train loss 0.9731, val loss 1.3015
step 2500: train loss 0.9020, val loss 1.3075
step 3000: train loss 0.8308, val loss 1.3189
step 3500: train loss 0.7615, val loss 1.3585
step 4000: train loss 0.6953, val loss 1.3956
step 4500: train loss 0.6282, val loss 1.4502
step 4999: train loss 0.5594, val loss 1.5049

The captain shouting again. "Well, it really?" he asked. He looked at K. sitting, right up for behind the business. He had already waited to gentle the pair to hir peneray, but, since K. did not want. Then he looked up at Mrs. Grubach as she grasped an under and, accompassive and both him, she passed then the planks came up when, she came in andusive even likely and she decimen to mention that was so clear, as it also, narry wad packed. "A called Lanz, with a colleague 