In [81]:
import torch
import torch.nn as nn
from torch.nn import functional as F

batch_size = 32
block_size = 8
max_iters = 3500
learning_rate = 1e-2
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

torch.manual_seed(1337)

def encode(word):
    stoi = {sorted(list(all_characters))[i]:i for i in range(len(all_characters))}
    return [stoi[letter] for letter in word]

def decode(list_of_numbers):
    itos = {i:sorted(list(all_characters))[i] for i in range(len(all_characters))}
    return ''.join([itos[i] for i in list_of_numbers])

def get_batch(batch_size, block_size, data):
    idxs = torch.randint(high=len(data)-block_size, size=(batch_size,))
    xs = torch.stack([train[idx:idx+block_size] for idx in idxs])
    ys = torch.stack([train[idx+1:idx+block_size+1] for idx in idxs])
    xs, ys = xs.to(device), ys.to(device)
    return xs, ys


class BigramModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.guess_next_letter_matrix = torch.nn.Embedding(number_of_characters,number_of_characters)

    def forward(self, x,y):
        logits = self.guess_next_letter_matrix(x)
        logits_reshaped = logits.view(logits.shape[0] * logits.shape[1], logits.shape[2])
        target = y.view(logits.shape[0] * logits.shape[1])
        loss = F.cross_entropy(logits_reshaped,target)
        return logits_reshaped, loss
    
    def generate(self, x, how_many_tokens):
        for _ in range(how_many_tokens):
            last_letter = x[:,-1].view(x.shape[0])
            logits = self.guess_next_letter_matrix(last_letter)
            next_letter_probability_dist = F.softmax(logits, dim = 1)
            next_letter_guess = torch.multinomial(next_letter_probability_dist,1)
            x = torch.cat((x,next_letter_guess), dim=1)
        return x

with open('input.txt', 'r', encoding='utf-8') as text:
    data = text.read()
all_characters = set(data)
number_of_characters = len(all_characters)
train = torch.tensor(encode(data)[:int(0.9*len(data))], dtype = torch.long)
test = torch.tensor(encode(data)[int(0.9 * len(data)):], dtype = torch.long)

model = BigramModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)
for step in range(max_iters):
    xb, yb = get_batch(batch_size, block_size, train)
    logits, loss = model.forward(xb,yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        print('Loss:',loss)

print('Loss:', loss)

context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(model.generate(context, how_many_tokens=500)[0].tolist()))

Loss: tensor(4.6485, grad_fn=<NllLossBackward0>)
Loss: tensor(4.1626, grad_fn=<NllLossBackward0>)
Loss: tensor(3.7027, grad_fn=<NllLossBackward0>)
Loss: tensor(3.4229, grad_fn=<NllLossBackward0>)
Loss: tensor(3.1231, grad_fn=<NllLossBackward0>)
Loss: tensor(3.0263, grad_fn=<NllLossBackward0>)
Loss: tensor(2.7293, grad_fn=<NllLossBackward0>)
Loss: tensor(2.7074, grad_fn=<NllLossBackward0>)

vet IMr'bV$Ollackze hit aind thr, hith$!.
QX$Gl CHorivinsimalS.
-ine sis?imLithinR?USoDU,RSpiA-inYzeinit f fln:, rw t$NI'JFo te I! ghe nkUMgellYpa!
nof s,MZYP gea!
Wot hjBuFRy Jaman ve.
TotrgHowes$F-pe sheaI'dd, nYKaNCIiorvil.
IV; icNo'o aumed s; wlrerr non;brJifrus ?he noco'dco h s golf sthiszKFir:D$LIfixr:
A:EYo.
T-
IINDYef itTheyO:Comyway bod s:
CErp : avealiavot,&VMCKang!BOl d
T' genVJates, :zR:CEY ff?fulvNuris CIndO: wzo a -vPve, cofanbPos st
WeaSTheaves.:
Akenremeallasu wnoitRat HAKStn cai


In [99]:
n_emb = 32

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_emb, head_size, bias = False)
        self.query = nn.Linear(n_emb, head_size, bias = False)
        self.value = nn.Linear(n_emb, head_size, bias = False)

    def forward(self, x):
        B, T, C = x.shape[0], x.shape[1], x.shape[2]
        q = self.query(x) #B,T,head
        k = self.key(x)   #B,T,head
        v = self.value(x) #B,T,head
        wei = q @ k.transpose(-2,-1) * C ** -0.5#B,T,T
        tril = torch.tril(torch.ones(T,T)) #T,T
        wei = wei.masked_fill(tril == 0, float('-inf')) #B,T,T
        wei = F.softmax(wei,dim=-1)#B,T,T
        output = wei @ v #B,T,head
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    
    def forward(self,x):
        return torch.cat([h(x) for h in self.heads], dim = -1) 

class BigramModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = torch.nn.Embedding(number_of_characters,n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.sa_heads = MultiHeadAttention(4,n_emb//4)
        self.lm_head = nn.Linear(n_emb, number_of_characters)

    def forward(self, x, y):
        token_embedding = self.embedding(x) # B, T, n_emb
        position_encoding = self.position_embedding_table(torch.arange(x.shape[1], device = device)) # T,C
        x = token_embedding + position_encoding # new dimension of 1 is added, and it's broadcasted up the batches
        x = self.sa_heads(x)
        logits = self.lm_head(x)
        logits_reshaped = logits.view(logits.shape[0] * logits.shape[1],-1)
        target = y.view(logits.shape[0] * logits.shape[1])
        loss = F.cross_entropy(logits_reshaped,target)
        return logits_reshaped, loss
    
    def forward_pass_only(self, x):
        token_embedding = self.embedding(x) # B, T, n_emb
        position_encoding = self.position_embedding_table(torch.arange(x.shape[1], device = device)) # T,C
        x = token_embedding + position_encoding # new dimension of 1 is added, and it's broadcasted up the batches
        x = self.sa_heads(x)
        logits = self.lm_head(x)
        return logits


    def generate(self, x, how_many_tokens):
        for _ in range(how_many_tokens):
            context = x[:, -block_size:]
            logits = self.forward_pass_only(context)
            next_letter_probability_dist = F.softmax(logits[:, -1, :], dim=-1)
            next_letter_guess = torch.multinomial(next_letter_probability_dist, 1)
            x = torch.cat((x, next_letter_guess), dim=1)
        return x

    
xb, yb = get_batch(batch_size, block_size, train)

In [98]:
model = BigramModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)
for step in range(max_iters):
    xb, yb = get_batch(batch_size, block_size, train)
    logits, loss = model.forward(xb,yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        print('Loss:',loss)

print('Loss:', loss)

context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(model.generate(context, how_many_tokens=500)[0].tolist()))

Loss: tensor(4.2357, grad_fn=<NllLossBackward0>)
Loss: tensor(2.6528, grad_fn=<NllLossBackward0>)
Loss: tensor(2.4865, grad_fn=<NllLossBackward0>)
Loss: tensor(2.5098, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3091, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3195, grad_fn=<NllLossBackward0>)
Loss: tensor(2.4876, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3926, grad_fn=<NllLossBackward0>)

Whis as hath!
I fo
anit.

Whofrem do.

IGWo, ha, ime aks Aivowe a, Cgthasat his,
Bot hearke, heied Ird fo.

DBour to bree. sy wor sok hetucthachirich sitoth. wherve ay shilld thoul ore,
DI men gabnd ckencthine thithon'd herss irto eitleld hou yo mmeeve-dee me, man, masce-Bu othe em Has wan.
UESRKEIR:
Fat axthimy my lant cod theme hig yno oudre?

CAnerit athe,'d, yefiad hy hreadabr se himealn.

ARYeivilllorbrur:
He ppluead nd O:
Fof thilderedint loour ief, Aadnou pith home, thatwat nre fo ngouce,


# MultiHeaded Attention

In [100]:
n_emb = 32

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_emb, head_size, bias = False)
        self.query = nn.Linear(n_emb, head_size, bias = False)
        self.value = nn.Linear(n_emb, head_size, bias = False)

    def forward(self, x):
        B, T, C = x.shape[0], x.shape[1], x.shape[2]
        q = self.query(x) #B,T,head
        k = self.key(x)   #B,T,head
        v = self.value(x) #B,T,head
        wei = q @ k.transpose(-2,-1) * C ** -0.5 #B,T,T
        tril = torch.tril(torch.ones(T,T)) #T,T
        wei = wei.masked_fill(tril == 0, float('-inf')) #B,T,T
        wei = F.softmax(wei,dim=-1)#B,T,T
        output = wei @ v #B,T,head
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    
    def forward(self,x):
        return torch.cat([h(x) for h in self.heads], dim = -1) 

class BigramModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = torch.nn.Embedding(number_of_characters,n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.sa_heads = MultiHeadAttention(4,n_emb//4)
        self.lm_head = nn.Linear(n_emb, number_of_characters)

    def forward(self, x, y):
        token_embedding = self.embedding(x) # B, T, n_emb
        position_encoding = self.position_embedding_table(torch.arange(x.shape[1], device = device)) # T,C
        x = token_embedding + position_encoding # new dimension of 1 is added, and it's broadcasted up the batches
        x = self.sa_heads(x)
        logits = self.lm_head(x)
        logits_reshaped = logits.view(logits.shape[0] * logits.shape[1],-1)
        target = y.view(logits.shape[0] * logits.shape[1])
        loss = F.cross_entropy(logits_reshaped,target)
        return logits_reshaped, loss
    
    def forward_pass_only(self, x):
        token_embedding = self.embedding(x) # B, T, n_emb
        position_encoding = self.position_embedding_table(torch.arange(x.shape[1], device = device)) # T,C
        x = token_embedding + position_encoding # new dimension of 1 is added, and it's broadcasted up the batches
        x = self.sa_heads(x)
        logits = self.lm_head(x)
        return logits


    def generate(self, x, how_many_tokens):
        for _ in range(how_many_tokens):
            context = x[:, -block_size:]
            logits = self.forward_pass_only(context)
            next_letter_probability_dist = F.softmax(logits[:, -1, :], dim=-1)
            next_letter_guess = torch.multinomial(next_letter_probability_dist, 1)
            x = torch.cat((x, next_letter_guess), dim=1)
        return x

    
xb, yb = get_batch(batch_size, block_size, train)

In [101]:
model = BigramModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)
for step in range(max_iters):
    xb, yb = get_batch(batch_size, block_size, train)
    logits, loss = model.forward(xb,yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        print('Loss:',loss)

print('Loss:', loss)

context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(model.generate(context, how_many_tokens=500)[0].tolist()))

Loss: tensor(4.2477, grad_fn=<NllLossBackward0>)
Loss: tensor(2.6790, grad_fn=<NllLossBackward0>)
Loss: tensor(2.7075, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2938, grad_fn=<NllLossBackward0>)
Loss: tensor(2.4916, grad_fn=<NllLossBackward0>)
Loss: tensor(2.4201, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2491, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3200, grad_fn=<NllLossBackward0>)

WO:
Thereely aik you what!
Yo:
Thallisest tou forg yusith wesnd to yo wot ppmy utlit nier the hing his the tercor dis?- why nom.

Whayoun sur, nowh yo athe auty,
Whes hin! sevendpe muse:
It.

Grithis.
SLARCILAS:
VThamiver thathe swe noth wing ecechelabe hay pom I difermape hele QESIUSUETEULIENVIII by sen; an com acof all:
Thim lave, figaimn.

Lathspad obursple allancerdth empilt to ite dep me ay thiche, fer it mom toe of athithste outh he his taes pre.
BUEARIDULCAR:
Bus so tould:
If trinkn thich


# Add Feed Forward

In [108]:
n_emb = 32

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_emb, head_size, bias = False)
        self.query = nn.Linear(n_emb, head_size, bias = False)
        self.value = nn.Linear(n_emb, head_size, bias = False)

    def forward(self, x):
        B, T, C = x.shape[0], x.shape[1], x.shape[2]
        q = self.query(x) #B,T,head
        k = self.key(x)   #B,T,head
        v = self.value(x) #B,T,head
        wei = q @ k.transpose(-2,-1) * C ** -0.5#B,T,T
        tril = torch.tril(torch.ones(T,T)) #T,T
        wei = wei.masked_fill(tril == 0, float('-inf')) #B,T,T
        wei = F.softmax(wei,dim=-1)#B,T,T
        output = wei @ v #B,T,head
        return output

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    
    def forward(self,x):
        return torch.cat([h(x) for h in self.heads], dim = -1) 

class FeedForward(nn.Module):
    def __init__(self, n_emb):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_emb,n_emb),
            nn.ReLU()
        )
    def forward(self,x):
        return self.net(x)

class BigramModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = torch.nn.Embedding(number_of_characters,n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.sa_heads = MultiHeadAttention(4,n_emb//4)
        self.ffwd = FeedForward(n_emb)
        self.lm_head = nn.Linear(n_emb, number_of_characters)

    def forward(self, x, y):
        token_embedding = self.embedding(x) # B, T, n_emb
        position_encoding = self.position_embedding_table(torch.arange(x.shape[1], device = device)) # T,C
        x = token_embedding + position_encoding # new dimension of 1 is added, and it's broadcasted up the batches
        x = self.sa_heads(x)
        x = self.ffwd(x)
        logits = self.lm_head(x)
        logits_reshaped = logits.view(logits.shape[0] * logits.shape[1],-1)
        target = y.view(logits.shape[0] * logits.shape[1])
        loss = F.cross_entropy(logits_reshaped,target)
        return logits_reshaped, loss
    
    def forward_pass_only(self, x):
        token_embedding = self.embedding(x) # B, T, n_emb
        position_encoding = self.position_embedding_table(torch.arange(x.shape[1], device = device)) # T,C
        x = token_embedding + position_encoding # new dimension of 1 is added, and it's broadcasted up the batches
        x = self.sa_heads(x)
        x = self.ffwd(x)
        logits = self.lm_head(x)
        return logits


    def generate(self, x, how_many_tokens):
        for _ in range(how_many_tokens):
            context = x[:, -block_size:]
            logits = self.forward_pass_only(context)
            next_letter_probability_dist = F.softmax(logits[:, -1, :], dim=-1)
            next_letter_guess = torch.multinomial(next_letter_probability_dist, 1)
            x = torch.cat((x, next_letter_guess), dim=1)
        return x

    
xb, yb = get_batch(batch_size, block_size, train)

In [109]:
model = BigramModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)
for step in range(max_iters):
    xb, yb = get_batch(batch_size, block_size, train)
    logits, loss = model.forward(xb,yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        print('Loss:',loss)

print('Loss:', loss)

context = torch.zeros((1,1), dtype = torch.long, device = device)
print(decode(model.generate(context, how_many_tokens=500)[0].tolist()))

Loss: tensor(4.1412, grad_fn=<NllLossBackward0>)
Loss: tensor(2.5886, grad_fn=<NllLossBackward0>)
Loss: tensor(2.5640, grad_fn=<NllLossBackward0>)
Loss: tensor(2.4765, grad_fn=<NllLossBackward0>)
Loss: tensor(2.2576, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3373, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3536, grad_fn=<NllLossBackward0>)
Loss: tensor(2.3237, grad_fn=<NllLossBackward0>)

BAY:
Wa wher fir
Thy achas toust, for,
igh's VINIGLINI: you hisneramt.

MYC:
Ky is thisth stherem yould. GwpY:
O the:
ford I lich gre thavis hal at ine, mis, thegiv for''tast thout
Kond shis! to-tepst suld ye went ofredeard wom:
Jrung bror wendce
Ist younthee titnest sthing dir sace acovestlet wit? my,, anf wkokis youlve fors of:
Theml:
Mor athe cin therit fave it to!
Whaloomard thal tevee.

NETIE:
An, proys theov ongisthy shat so to pins,
tholw nodere wantill wand morgh' itnou pysmes pot thim t
