In [2]:
import torch 
import numpy as np 
import importlib 
import torch.nn as nn
from torch.nn import functional as F


In [37]:
class ToyModel(nn.Module):
    """Minimal multi-head causal self-attention layer used to trace tensor shapes.

    Args:
        d_model: Width of the input (and output) feature dimension. Must be
            divisible by ``num_heads``.
        num_heads: Number of attention heads the features are split across.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``num_heads``.
    """

    def __init__(self, d_model, num_heads=2):
        # Being an nn.Module registers the projections as parameters, so
        # .to(device), .parameters() and state_dict() all see them.
        super().__init__()

        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)

        self.num_heads = num_heads

        self.d_head = d_model // num_heads

    def forward(self, x):
        """Apply causal multi-head self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, d_model)``.

        Returns:
            Tensor of shape ``(batch, seq_len, d_model)`` holding the
            concatenated per-head attention outputs.
        """
        batch_size, seq_len, _ = x.shape

        # Project, then split the feature axis into heads:
        # (batch, seq, d_model) -> (batch, heads, seq, d_head).
        q = self.wq(x).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
        k = self.wk(x).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
        v = self.wv(x).view(batch_size, -1, self.num_heads, self.d_head).transpose(1, 2)
        print("QKV shape: ", q.shape)

        # Scaled dot-product scores: (batch, heads, seq, seq).
        scores = (q @ k.transpose(-2, -1)) / (self.d_head ** 0.5)

        # Lower-triangular causal mask, created on the input's device so the
        # layer also works when x lives on an accelerator.
        mask = torch.tril(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device)
        ).view(1, 1, seq_len, seq_len)
        print("Mask shape: ", mask.shape)

        # Future positions get -inf so softmax assigns them zero weight.
        scores = scores.masked_fill(~mask, float('-inf'))

        attention = F.softmax(scores, dim=-1)

        attention = attention @ v

        print(attention.shape)

        # Merge the heads back: (batch, heads, seq, d_head) -> (batch, seq, d_model).
        attention = attention.transpose(1, 2).contiguous().view(
            -1, seq_len, self.d_head * self.num_heads
        )
        print(attention.shape)

        return attention
        

# Smoke-test ToyModel on a single all-ones sequence and trace the shapes it prints.
seq_len, d_model = 10, 16

model = ToyModel(d_model=d_model)

x = torch.ones(1, seq_len, d_model)
print(x.shape)
model.forward(x)

torch.Size([1, 10, 16])
QKV shape:  torch.Size([1, 2, 10, 8])
Mask shape:  torch.Size([1, 1, 10, 10])
torch.Size([1, 2, 10, 8])


In [20]:
# Dummy all-zero batch; only its sizes are used below.
x = torch.zeros(30, 8)

# Lower-triangular mask with two leading singleton axes.
attn_mask = torch.ones(8, 8).tril().view(1, 1, 8, 8)
print(attn_mask.shape)

# Every sequence keeps its first four tokens and pads the last four.
pad_mask = torch.tensor([1, 1, 1, 1, 0, 0, 0, 0]).repeat(x.size(0), 1)
# Broadcast against a row of ones to get a square matrix per sequence, then
# swap the last two axes so the padding pattern runs along the final axis.
pad_mask = (pad_mask.unsqueeze(-1) * torch.ones(1, 1, x.size(1))).transpose(-2, -1)
print(pad_mask.shape)

torch.Size([1, 1, 8, 8])
torch.Size([3, 8, 8])


In [None]:
d_model = 128
x = torch.zeros(32, 64, d_model)
n = 10000

seq_len = x.shape[1]

# Sinusoidal positional encoding, computed for all positions at once:
#   pe[k, 2i]     = sin(k / n**(2i / d_model))
#   pe[k, 2i + 1] = cos(k / n**(2i / d_model))
# Angles are evaluated in float64 and cast down when written into `pe`.
num_pairs = d_model // 2
positions = torch.arange(seq_len, dtype=torch.float64).unsqueeze(1)  # (seq_len, 1)
pair_idx = torch.arange(num_pairs, dtype=torch.float64)              # (num_pairs,)
theta = positions / torch.pow(n, 2 * pair_idx / d_model)             # (seq_len, num_pairs)

pe = torch.zeros(seq_len, d_model)
# Slice bounds stop at 2 * num_pairs so an odd d_model leaves its last column at 0.
pe[:, 0:2 * num_pairs:2] = torch.sin(theta)
pe[:, 1:2 * num_pairs:2] = torch.cos(theta)

# NOTE: inside an nn.Module, `pe` would be registered with `register_buffer`:
# a buffer is saved to state_dict but is not a parameter, so the optimizer
# never updates it.

print("PE shape: ", pe.shape)
# TODO: actually start using the positional encodings

In [None]:
# Adding pe to x broadcasts it over the leading axis of x; verify that every
# slice of the sum along that axis is element-wise close to pe.
out = x + pe
for sample in out:
    assert torch.allclose(sample, pe)

In [49]:
# End-to-end shape check: push an all-zero integer batch through BYOGPT.
batch_size, d_model, seq_len = 32, 128, 64
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Integer (int64) tensor of zeros, created directly on the target device.
batch = torch.zeros(batch_size, seq_len, dtype=torch.long, device=device)

# Reload so edits to the module are picked up without restarting the kernel.
import models.byo_gpt as byogpt
importlib.reload(byogpt)
model = byogpt.BYOGPT(50257).to(device)
out = model(batch)
print(out.shape)


Initializing BYOGPT with vocab size: 50257
Embed shape:  torch.Size([32, 64, 128])
PE shape:  torch.Size([64, 128])
After pos_embed shape:  torch.Size([32, 64, 128])
Attention shape:  torch.Size([32, 64, 128])
torch.Size([32, 64, 50257])
