In [212]:
import math
import random

In [1]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("zhyncs/sonnet")

Downloading readme:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 93.6k/93.6k [00:00<00:00, 376kB/s]


Generating train split: 0 examples [00:00, ? examples/s]

## Andrej uses a names dataset, but I will explore sonnet generation instead

The task will be to generate (possibly Shakespearean-style) sonnets.

## Tokenization

There are a few approahces for tokenization. BPE is the real algorithm an LLm uses. For efficiency (and simplicity) in MicroGPT, I will use character-based tokenization.

In [13]:
train_ds = ds['train']

In [19]:
train_ds[-1]

{'text': "Love's fire heats water, water cools not love."}

In [20]:
unique_chars = set()

for row in train_ds:
  chars_list = list(row['text'])
  for c in chars_list:
    unique_chars.add(c)


In [30]:
# Vocabulary size
print("Vocab size:", len(unique_chars))

## Now convert this list of chars to a proper vocabulary
## char_id -> char
vocab = {index: char for index, char in enumerate(unique_chars)}

## The tokenizer does the opposite: go from text to integers
tokenizer = {char: index for index, char in enumerate(unique_chars)}

Vocab size: 59


In [29]:
vocab

{0: 'X',
 1: 'M',
 2: 'B',
 3: 'b',
 4: 'a',
 5: 'C',
 6: 'F',
 7: 'u',
 8: 'T',
 9: 'A',
 10: '!',
 11: 'r',
 12: 'i',
 13: ',',
 14: 'E',
 15: 'w',
 16: 'z',
 17: 'P',
 18: 'm',
 19: ':',
 20: 's',
 21: 'R',
 22: 'N',
 23: 'L',
 24: 'G',
 25: 'V',
 26: 'K',
 27: 'j',
 28: 'h',
 29: 'y',
 30: ' ',
 31: 'J',
 32: 'q',
 33: 'D',
 34: ';',
 35: 'n',
 36: "'",
 37: 'f',
 38: '-',
 39: 'O',
 40: 'x',
 41: 'o',
 42: 'U',
 43: 'p',
 44: 't',
 45: 'v',
 46: 'I',
 47: 'W',
 48: 'Y',
 49: 'l',
 50: 'S',
 51: 'g',
 52: 'c',
 53: 'd',
 54: '.',
 55: 'k',
 56: 'e',
 57: '?',
 58: 'H'}

## Build Autograd Now

We want all basic operations in the architecture.

In [216]:
class Value:
  def __init__(self, val : float, name="", op="", children=[]):
    self.val = val
    self.name = name
    self.children = children # should be ordered, not sets!
    self.local_grads = [] # local gradient of current node wrt children
    self.grad = 0 # total gradient (add to this)

  def __repr__(self):
    return (f"{self.name}: {self.val}")

  # Overrie original add function for Value object
  def __add__(self, other, op="+", other_name=None):
    if not other_name:
      other_name = other.name

    new_val = Value(self.val + other.val, name=f"({self.name} {op} {other_name})", children=set([self, other]))
    new_val.local_grads = [1,1]
    return new_val
  
  def __neg__(self):
    new_val = Value(-self.val, name=f"(-{self.name})", children=[self])
    new_val.local_grads = [-1]
    return new_val
  
  def __sub__(self, other):
    return self.__add__((-other), op="-", other_name= other.name)
  
  def __mul__(self, other, op="*", other_name=None):
    if not other_name:
      other_name = other.name
    
    new_val = Value(self.val * other.val, name=f"({self.name} {op} {other_name})", children=set([self, other]))
    new_val.local_grads = [other.val, self.val]
    return new_val
  
  def __pow__(self, exp : float, op="^"):
    new_val = Value(self.val ** exp, children=[self])
    new_val.local_grads = [exp * self.val ** (exp - 1)]
    return new_val
  
  def __truediv__(self, other, op="/"):
    denom = other ** (-1)
    new_val = Value(self.val * denom.val, children=[self, other])
    new_val.local_grads = [-1 / denom.val, self.val / (denom.val * denom.val)]
    return new_val
  
  def exp(self):
    new_val = Value(math.exp(self.val), name=f"exp({self.name})", children=[self])
    new_val.local_grads = [math.exp(self.val)]
    return new_val
  
  def log(self):
    new_val = Value(math.log(self.val), name=f"log({self.name})", children=[self])
    new_val.local_grads = [1 / self.val]
    return new_val
  
  def relu(self):
    new_val = Value(max(0, self.val), name=f"ReLU({self.name})", children=[self])
    new_val.local_grads = [float(self.val > 0)]
    return new_val
  
  def backward(self):
    # compute the gradient from the root
    topo = []
    visited = set()
    # first construct a topsort of the computational graph
    def construct_toposort(cur):
      if cur not in visited:
        visited.add(cur)
        for n in cur.children:
          construct_toposort(n)
        topo.append(cur)
    
    construct_toposort(self)
    self.grad = 1
    for v in reversed(topo):
      for child, local_grads in zip(v.children, v.local_grads):
        child.grad += local_grads * v.grad

In [219]:
inf = Value(float('-inf'))
inf

: -inf

In [203]:
a = Value(5, "a")
b = Value(3, "b")

In [204]:
c = a + b
d = c**3

In [205]:
d.backward()

In [206]:
d.grad

1

In [207]:
c.grad

192

In [208]:
a.grad

192

In [209]:
b.grad

192

In [210]:
a = 5
b = 3
c = a + b # 8
d = c^3

so grad(d) = 1
grad (c) = 3 c^2
grad(b) = 

SyntaxError: invalid syntax (4045663001.py, line 6)

In [211]:
a = Value(2.0)
b = Value(3.0)
c = a * b       # c = 6.0
L = c + a       # L = 8.0
L.backward()
print(a.grad)   # 4.0 (dL/da = b + 1 = 3 + 1, via both paths)
print(b.grad)   # 2.0 (dL/db = a = 2)
print(c.grad)   
print(L.grad)   

4.0
2.0
1
1


## Now that AutoGrad is working, can actually define our model!

This won't be too big of a model (just overfit to sonnets), so will have:
- embedding dimension: 16
- attention heads: 4
- layers: 1
- block size: 16 # max sequence length

In [221]:
embed_dim = 16
atten_heads = 4
layers = 1
block_size = 16
hidden_dim = int(embed_dim / atten_heads) # dimension of each head
# define some matrix
vocab_size = len(vocab)
def create_matrix(in_dim, out_dim, std=0.08):
  # creates a matrix of size in_dim x out_dim
  return [[Value(random.gauss(0, std)) for _ in range(out_dim)] for _ in range(in_dim)]

state_dict = {}
for i in range(layers):
  # Projects embedding -> Query vectors
  state_dict[f'layer_{i}_attn_wq'] = create_matrix(embed_dim, embed_dim)
  # Projects embedding -> Key vectors
  state_dict[f'layer_{i}_attn_wk'] = create_matrix(embed_dim, embed_dim)
  # Projects embedding -> Value vectors
  state_dict[f'layer_{i}_attn_wv'] = create_matrix(embed_dim, embed_dim)
  # Final projection after concatenating all heads.
  state_dict[f'layer_{i}_attn_wo'] = create_matrix(embed_dim, embed_dim)
  # Standard MLP: Linear(embed_dim -> 4*embed_dim); GELU; Linear(4*embed_dim -> embed_dim)
  state_dict[f'layer_{i}_fc1'] = create_matrix(embed_dim, 4 * embed_dim)
  state_dict[f'layer_{i}_fc2'] = create_matrix(4 * embed_dim, embed_dim)

# Used to map final hidden state -> logits over vocabulary:
state_dict[f'output_head'] = create_matrix(embed_dim, vocab_size)
# This is positional embedding
state_dict[f'pos_embedding'] = create_matrix(block_size, embed_dim)
# this is token embedding
state_dict[f'token_embedding'] = create_matrix(vocab_size, embed_dim)

In [214]:
class MHA:
  def __init__(self, wq, wk, wv, wo, config):
    self.wq = wq
    self.wk = wk
    self.wv = wv
    self.wo = wo
    self.config = config
  
  def forward(self, input, attn_mask):
    attn_weights = (self.wq @ self.wk.T) / math.sqrt(self.config.hidden_dim)

    

    masked_attn_weights = attn_weights +  attn_mask



In [223]:
def linear(input):
  pass

def softmax(input):
  pass

In [222]:
attn_mask = [[Value(float('-inf')) for _ in range(hidden_dim)] for _ in range(hidden_dim)]
for i in range(hidden_dim):
  for j in range(i):
    attn_mask[i][j] = 0
