### makemore: lect03


### day14

- 202403120630--202403120800
- 202403122000--202403122100

### day 15
- 202403130845--202403131200
  
### day 16
- 202403140630--202403140800

### day 17
- 202403152100--202403152130


### day 18
- 202403161300--202403161500

  
### day 19
- 202403170900--202403171215
- 

In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

from typing import List, Any

### Constants

In [2]:
RANDOM_SEED = 10
NUM_BLOCKS = 3
NUM_DIM_CHARS = 2
NUM_HIDDEN = 20

### Random Generator

In [3]:
import random
import numpy as np

rand_gen = torch.Generator().manual_seed(RANDOM_SEED)
RAND_GEN = rand_gen

torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

### Read File

In [4]:
words = open("names.txt").read().splitlines()
len(words), words[:5]

(32033, ['emma', 'olivia', 'ava', 'isabella', 'sophia'])

### Make Mappings: `chr -> int` and `int -> chr`

In [5]:
ord("a"), ord("z"), ord("z") - ord("a") + 1, chr(ord("a")), chr(ord("z"))

(97, 122, 26, 'a', 'z')

In [6]:
itos = {i - ord("a") + 1: chr(i) for i in range(ord("a"), ord("z") + 1)}
itos[0] = "."

stoi = {s: i for i, s in itos.items()}

# stoi
# itos

In [7]:
chars = sorted(list(set("".join(words))))
chars[:10]

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [8]:
stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i: s for s, i in stoi.items()}

### Get Data

In [9]:
def get_data(words: List[str], num_blocks: int = 3, stoi=stoi) -> (List[torch.tensor], List[torch.tensor]):
    """get data"""

    X, Y = [], []


    # ## tese lines are for testing:
    # for word in words[:5]:
    #     print("-" * 80)

    for word in words:
        context = [0] * num_blocks
        for ch in word + ".":
            i = stoi[ch]
            # append the current char ch, i.e. the index i into Y;
            # at the same time append the current context into X;
            # (the context up to now) contains letters that preeced the char ch
            # print(" ".join((itos[ix] for ix in context)), "--->", ch)
            X.append(context)
            Y.append(i)
            context = context[1: ] + [i]
           

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    return X, Y

In [10]:
get_data(words)

(tensor([[ 0,  0,  0],
         [ 0,  0,  5],
         [ 0,  5, 13],
         ...,
         [26, 26, 25],
         [26, 25, 26],
         [25, 26, 24]]),
 tensor([ 5, 13, 13,  ..., 26, 24,  0]))

In [11]:
num_blocks = NUM_BLOCKS
X, Y = get_data(words, num_blocks=num_blocks)

In [12]:
X[:10]

tensor([[ 0,  0,  0],
        [ 0,  0,  5],
        [ 0,  5, 13],
        [ 5, 13, 13],
        [13, 13,  1],
        [ 0,  0,  0],
        [ 0,  0, 15],
        [ 0, 15, 12],
        [15, 12,  9],
        [12,  9, 22]])

In [13]:
Y[:10]

tensor([ 5, 13, 13,  1,  0, 15, 12,  9, 22,  9])

### Split Data: `train`, `val/dev`, and `test`

In [14]:
X.shape, X.shape[0]

(torch.Size([228146, 3]), 228146)

In [15]:
Y.shape, Y.shape[0]

(torch.Size([228146]), 228146)

In [16]:
indexes = torch.randperm(X.shape[0], generator=rand_gen)
indexes[:10]

tensor([117017,  81926, 152774,   8097, 220660,  38770, 226753, 147220, 224337,
        175417])

In [17]:
# this list changes over time?!
torch.randperm(X.shape[0], generator=rand_gen)[:10]

tensor([184694,  31158,  21725,  15842,   3894,  75184,  50599, 156241, 116075,
         47106])

In [18]:
def split_data(X, y, split=0.8):
    
    """split data into train, val, and test sets"""
    
    assert X.shape[0] == y.shape[0], "input data X and Y sizes do not match."
    assert (0 < split and split < 1), "the split for training data should be within (0, 1)."
    
    len_data = X.shape[0]
    shuffled_indices = torch.randperm(X.shape[0], generator=rand_gen)

    len_train = int(split * len_data)
    len_val = (len_data - len_train) // 2
    len_test = len_data - len_train - len_val
    
    
    X_train = X[shuffled_indices][:len_train]
    X_val = X[shuffled_indices][len_train: len_train + len_val]
    X_test = X[shuffled_indices][- len_test:]
    
    y_train = Y[shuffled_indices][:len_train]
    y_val = Y[shuffled_indices][len_train: len_train + len_val]
    y_test = Y[shuffled_indices][- len_test:]

    return X_train, X_val, X_test, y_train, y_val, y_test

In [19]:
X_train, X_val, X_test, Y_train, Y_val, Y_test = split_data(X, Y)

In [20]:
X_train.shape, X_val.shape, X_test.shape, Y_train.shape, Y_val.shape, Y_test.shape

(torch.Size([182516, 3]),
 torch.Size([22815, 3]),
 torch.Size([22815, 3]),
 torch.Size([182516]),
 torch.Size([22815]),
 torch.Size([22815]))

### MLP revisted

### Develop Model

- `C` is a mapping that translates `chars` into a `dim`-dimensional vector space;

In [21]:
### given the context X; 
### we predict the following char given in Y; 
### and the prediction belongs in one out of 27 classes; 
### i.e. there are 27 chars as a possibility for it. 

vocab_size = 27 

In [22]:
# MLP revisted
n_embed = 10 # the dimensionality of the char. embedding 
n_hidden = 200 # the number of neurons in the hidden layers

g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size, n_embed), generator=g)
W1 = torch.randn((n_embed * num_blocks, n_hidden), generator=g) * (5 /3) / ((n_embed * num_blocks) ** 0.5)
b1 = torch.randn(n_hidden, generator=g) * 0.01

W2 = torch.randn((n_hidden, vocab_size), generator=g) * 0.2
b2 = torch.randn(vocab_size, generator=g) * 0.01

bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))

bnmean_running = torch.zeros((1, n_hidden))
bnstd_running = torch.ones((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]
print(sum(p.nelement() for p in parameters)) # number of parameters total
for p in parameters:
    p.requires_grad = True

12297


### Pay attention about shapes

In [24]:
torch.randn((5, ), generator=rand_gen)

tensor([-0.4777,  0.5704,  0.1981,  0.3437,  0.7030])

In [25]:
torch.randn(5, generator=rand_gen)

tensor([-1.0422, -0.6502, -0.3110,  1.1155, -0.2288])

In [26]:
torch.randn((5, 1), generator=rand_gen)

tensor([[-1.1359],
        [ 0.2450],
        [-0.9370],
        [ 0.2569],
        [ 0.0738]])

### Optimization

In [27]:
X_train.shape[0]

182516

In [28]:
# mini-batch construct
batch_size = 32
ix = torch.randperm(X_train.shape[0], generator=rand_gen)[:batch_size]
# print(i, ix)
X_batch, Y_batch = X_train[ix], Y_train[ix]
X_batch.shape, C.shape

(torch.Size([32, 3]), torch.Size([27, 10]))

In [29]:
# C: 27 x 10 
# X_batch: 32 x 3 

emb = C[X_batch]
emb.shape

torch.Size([32, 3, 10])

In [30]:
emb.view(emb.shape[0], -1).shape

torch.Size([32, 30])

In [31]:
X_batch

tensor([[ 8,  5, 14],
        [ 0, 26, 21],
        [ 0,  0, 11],
        [12,  1, 25],
        [ 0,  1, 13],
        [ 3,  9,  1],
        [ 9, 15, 12],
        [ 0,  0,  0],
        [19, 19,  1],
        [10,  5,  4],
        [ 8,  5, 12],
        [18,  1, 14],
        [15, 14,  4],
        [14, 14,  1],
        [12,  5, 24],
        [ 5, 11,  5],
        [ 0, 18,  5],
        [20,  8, 15],
        [ 0,  0, 20],
        [ 1, 18,  9],
        [15, 18,  5],
        [12,  5,  5],
        [ 0,  0,  0],
        [25, 12,  1],
        [ 9, 13,  5],
        [13, 13, 25],
        [19,  1,  3],
        [ 1, 22,  9],
        [11,  1, 13],
        [19,  4, 18],
        [13, 21, 14],
        [ 0,  0,  4]])

In [32]:
C.shape

torch.Size([27, 10])

In [33]:
C[0].shape

torch.Size([10])

In [34]:
C[X_batch[2]].shape

torch.Size([3, 10])

In [35]:
X_batch[2]

tensor([ 0,  0, 11])

In [36]:
C[0]

tensor([ 1.5674, -0.2373, -0.0274, -1.1008,  0.2859, -0.0296, -1.5471,  0.6049,
         0.0791,  0.9046], grad_fn=<SelectBackward0>)

In [37]:
C[19]

tensor([-0.2129,  0.5095,  0.3271,  1.9661, -0.2409, -0.7952,  0.2720, -1.1100,
        -0.4528, -0.4958], grad_fn=<SelectBackward0>)

In [38]:
C[5]

tensor([ 0.1156,  0.8032,  0.5411, -1.1646,  0.1476, -1.0006,  0.3801,  0.4733,
        -0.9103, -0.7830], grad_fn=<SelectBackward0>)

In [39]:
C[[0, 19, 5]]

tensor([[ 1.5674, -0.2373, -0.0274, -1.1008,  0.2859, -0.0296, -1.5471,  0.6049,
          0.0791,  0.9046],
        [-0.2129,  0.5095,  0.3271,  1.9661, -0.2409, -0.7952,  0.2720, -1.1100,
         -0.4528, -0.4958],
        [ 0.1156,  0.8032,  0.5411, -1.1646,  0.1476, -1.0006,  0.3801,  0.4733,
         -0.9103, -0.7830]], grad_fn=<IndexBackward0>)

In [40]:
C[X_batch[2]]

tensor([[ 1.5674, -0.2373, -0.0274, -1.1008,  0.2859, -0.0296, -1.5471,  0.6049,
          0.0791,  0.9046],
        [ 1.5674, -0.2373, -0.0274, -1.1008,  0.2859, -0.0296, -1.5471,  0.6049,
          0.0791,  0.9046],
        [-0.5653,  0.5428,  0.1755, -2.2901, -0.7093, -0.2928, -2.1803,  0.0793,
          0.9019,  1.2028]], grad_fn=<IndexBackward0>)

In [41]:
emb = C[X_batch]
emb.shape

torch.Size([32, 3, 10])

In [42]:
emb_cat = emb.view(emb.shape[0], -1)
emb_cat.shape

torch.Size([32, 30])

In [43]:
W1.shape

torch.Size([30, 200])

In [44]:
h = emb_cat @ W1 + b1
h.shape

torch.Size([32, 200])

In [45]:
W2.shape

torch.Size([200, 27])

In [46]:
(h @ W2).shape

torch.Size([32, 27])

In [47]:
b2.shape

torch.Size([27])

In [48]:
Y_batch.shape

torch.Size([32])

In [49]:
X.shape

torch.Size([228146, 3])

In [57]:
X_batch.shape, emb.shape, emb_cat.shape, W1.shape, hpreact.shape, W2.shape, logits.shape

NameError: name 'logits' is not defined

In [51]:
hpreact.mean(0, keepdim=True).shape

In [52]:
hpreact.mean(0, keepdim=True)[0][0], hpreact[:,0].sum()/32

In [53]:
hpreact.mean(1, keepdim=True).shape

NameError: name 'hpreact' is not defined

In [None]:
hpreact.mean(0, keepdim=True).shape, hpreact.std(0, keepdim=True).shape

In [54]:
hpreact.mean(0, keepdim=True)

NameError: name 'hpreact' is not defined

In [55]:
hpreact.std(0, keepdim=True)

NameError: name 'hpreact' is not defined

In [62]:
max_steps = 5_000
batch_size = 32
lossi = []


for i in range(max_steps):

    # mini-batch construct
    ix = torch.randperm(X_train.shape[0], generator=rand_gen)[:batch_size]
    # print(i, ix)
    X_batch, Y_batch = X_train[ix], Y_train[ix]

    # forward pass    
    emb = C[X_batch] # embed chars into vectors
    emb_cat = emb.view(emb.shape[0], -1) # concatenate the vectors
    hpreact = emb_cat @ W1 + b1 # hidden layer pre-activation

    bnmeani = hpreact.mean(0, keepdim=True)
    bnstdi = hpreact.std(0, keepdim=True)
    hpreact = bngain * (hpreact - bnmeani) / bnstdi + bnbias
    
    with torch.no_grad():
        bnmean_running = 0.999 * bnmean_running + 0.001 * bnmean
        bnstd_running = 0.999 * bnstd_running + 0.001 * bnstdi
        
    h = torch.tanh(hpreact) # hidden layer
    logits = h @ W2 + b2 # output layer
    loss = F.cross_entropy(logits, Y_batch) # loss function

    # backward pass: 
    for p in parameters:
        p.grad = None
        
    loss.backward()

    # update rule:
    lr = 0.1 if (i < max_steps // 2) else 0.01
    for p in parameters:
        p.data += - lr * p.grad
        
    
    # stats
    if (i + 1) % (max_steps // 25) == 0:
        print(f"{i + 1:7d} / {max_steps:7d}; {loss.item():.4f}")
    
    lossi.append(loss.item())
    # break


NameError: name 'bnmean' is not defined

In [None]:
lossi[:10]

In [None]:
h, h.shape

In [None]:
plt.hist(h.view(-1).tolist(), 50);

In [None]:
plt.hist(hpreact.view(-1).tolist(), 50);

In [None]:
plt.figure(figsize=(10, 20))
plt.imshow(h.abs() >= 0.99, cmap="gray", interpolation="nearest")

### White = 1, True; and Black = 0 False;

In [None]:
plt.figure(figsize=(10, 20))
plt.imshow(0 * h.abs() >= 0.99, cmap="gray", interpolation="nearest")

In [None]:
h.shape, hpreact.shape

In [None]:
lossi[:10]

In [None]:
plt.plot(lossi)
plt.grid(True)

In [None]:
logits[0]

In [None]:
probs = torch.softmax(logits[0], dim=0)
probs.sum()
# plt.imshow()

In [None]:
probs = torch.softmax(logits, dim=1)

In [None]:
probs.shape

In [None]:
probs[0].shape, probs[0].sum(), probs[1].shape, probs[1].sum()

In [None]:
with torch.no_grad():
    emb = C[X_train]
    emb_cat = emb.view(emb.shape[0], -1)
    hpreact = emb_cat @ W1 + b1
    
    bnmean = hpreact.mean(0, keepdim=True)
    bnstd = hpreact.std(0, keepdim=True)
    

In [None]:
bnmean

In [None]:
bnmean_running

In [None]:
bnstd

In [None]:
bnstd_running

In [None]:
@torch.no_grad()
def evaluate(split: str):

    data_map = {
        "train": (X_train, Y_train),
        "val": (X_val, Y_val),
        "test": (X_test, Y_test),
    }
   
    x, y = data_map[split]
    emb = C[x]
    emb_cat = emb.view(emb.shape[0], -1)
    hpreact = emb_cat @ W1 + b1
    hpreact = bngain * (hpreact - hpreact.mean(0, keepdim=True)) / hpreact.std(0, keepdim=True) + bnbias

    h = torch.tanh(hpreact)
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y)
    
    print(f"{split = }, {loss.item() = }")

In [None]:
evaluate("train")    
evaluate("val")    
evaluate("test")    

In [None]:
# sample from the model 
rg = torch.Generator().manual_seed(RANDOM_SEED + 10)

for t in range(20):
    out = []
    context = [0] * NUM_BLOCKS
    while True:
        # emb = C[torch.tensor([context])]
        emb = C[torch.tensor([context])]
        h = torch.tanh(emb.view(1, -1) @ W1 + b1)
        logits = h @ W2 + b2
        probs_test = F.softmax(logits)
        probs = F.softmax(logits, dim=1)
    
        ix = torch.multinomial(probs, num_samples=1, generator=rg)
        ch = itos[ix.item()]
        # print(ch)  
        context = context[1:] + [ix]
        out.append(ix)
    
        if ix == 0:
            break
    print(t, "".join(itos[i.item()] for i in out))
    
    # print("".join()

In [None]:
emb = C[torch.tensor(context)]

In [None]:
emb.shape

In [None]:
emb.view(1, -1).shape

In [None]:
C[torch.tensor([context])].shape

In [None]:
probs_t.shape

In [None]:
probs.shape

In [None]:
probs_t.sum()

In [None]:
logits.shape

In [None]:
logits = torch.tensor([100.0, 0.0, 0.0, 0.0])
logits = torch.rand(4) * 100
probs = torch.softmax(logits, dim=0)

In [None]:
probs

In [None]:
loss = - probs[2].log()

In [None]:
logits, probs, loss

In [None]:
torch.randn(10).shape

In [None]:
for i in range(10):
    r = torch.randn(100)
    print(# r.mean().item(), 
          r.std().item())

### day 17

In [58]:
class Linear:
        
    def __init__(self, fan_in: int, fan_out: int, bias: bool = True):
        self.weight = torch.randn((fan_in, fan_out), generator=g) / fan_in ** 0.5
        self.bias = torch.randn(fan_out) if (bias is not None) else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None: 
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])


class BatchNorm1d:
    def __init__(self, dim, eps=10**(-5), momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = Trueself.dim = dim
        

        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        if self.training: 
            xmean = x.mean(dim=0, keepdim=True)
            xvar = x.var(dim=0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_std

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)

        # print("-" * 80)
        # print(f"{xhat.shape = }")
        # print(f"{self.gamma.shape = }")
        # print(f"{self.beta.shape = }")
        
        
        self.out = self.gamma * xhat + self.beta
        
        with torch.no_grad():
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * xmean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * xvar
    
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]

class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []

In [59]:
vocab_size, n_embed, num_blocks

(27, 10, 3)

In [60]:
C = torch.randn((vocab_size, n_embed), generator=g)

# this is without batch normalization
layers = [
    Linear(num_blocks * n_embed, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
]

# this is with batch normalization
layers = [
    Linear(num_blocks * n_embed, n_hidden),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, n_hidden),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(n_hidden, vocab_size),
    BatchNorm1d(vocab_size),
]


NameError: name 'Trueself' is not defined

In [61]:
# layers[-1].weight *= 0.1
layers[-1].gamma *= 0.1
for layer in layers[:-1]:
    if isinstance(layer, Linear):
        layer.weight *= 0.75 # 5/3 # 1 # 3

AttributeError: 'Linear' object has no attribute 'gamma'

In [None]:
parameters = [C] + [p for layer in layers for p in layer.parameters()]

for p in parameters:
    p.requires_grad = True

In [None]:
sum(p.nelement() for p in parameters)

### optimization

In [None]:
max_steps = 500
batch_size = 32
lossi = []
ud = [] # update to data ratio

for i in range(max_steps):

    # mini-batch construct
    ix = torch.randperm(X_train.shape[0], generator=rand_gen)[:batch_size]
    # print(i, ix)
    X_batch, Y_batch = X_train[ix], Y_train[ix]

    # forward pass    
    emb = C[X_batch] # embed chars into vectors
    emb_cat = emb.view(emb.shape[0], -1) # concatenate the vectors

    # forward pass
    x = emb_cat
    for layer in layers:
        x = layer(x)

    loss = F.cross_entropy(x, Y_batch) # loss function


    # backward pass: 
    for layer in layers:
        layer.out.retain_grad()
        
    for p in parameters:
        p.grad = None
        
    loss.backward()

    # update rule:
    lr = 0.1 if (i < max_steps // 2) else 0.01
    for p in parameters:
        p.data += - lr * p.grad

    with torch.no_grad():
        ud.append(
            [(lr * p.grad).std() / p.data.std().log10().item() for p in parameters]
            # lr * p.grad / p.data
        )
        
    # stats
    if (i + 1) % (max_steps // 25) == 0:
        print(f"{i + 1:7d} / {max_steps:7d}; {loss.item():.4f}")
    
    lossi.append(loss.item())
    break


In [None]:
loss

In [None]:
x.shape

In [None]:
print(f"{len(layers) = }")
[type(layer) for layer in layers]

### Visualize Tanh Outs

In [None]:
plt.close()
plt.grid(True)
for cnt, layer in enumerate(layers):
    # print(cnt, layer.out.shape, isinstance(layer, Tanh))
    if isinstance(layer, Tanh):
        t = layer.out
        saturated = (t.abs() > 0.90).float().mean()
        name = layer.__class__.__name__
        print(f"layer {cnt}; name: {name}; mean {t.mean():0.4f}; std: {t.std():0.4f}; saturated: {100 * saturated:0.2f}%")
        hy, hx = torch.histogram(t, density=True)
        # plt.hist(t.detach())
        plt.title("Visualize Tanh Outs")
        plt.plot(hx[:-1].detach(), hy.detach(), label=f"layer {cnt}")
        # break
plt.legend()


### Visualize Tanh Grads

In [None]:
plt.close()
plt.grid(True)
for cnt, layer in enumerate(layers):
    # print(cnt, layer.out.shape, isinstance(layer, Tanh))
    if isinstance(layer, Tanh):
        t = layer.out.grad
        saturated = (t.abs() > 0.90).float().mean()
        name = layer.__class__.__name__
        print(f"layer {cnt}; name: {name}; mean {t.mean():0.4f}; std: {t.std():0.4f}; saturated: {100 * saturated:0.2f}%")
        hy, hx = torch.histogram(t, density=True)
        # plt.hist(t.detach())
        plt.title("Visualize Tanh Grads")
        plt.plot(hx[:-1].detach(), hy.detach(), label=f"layer {cnt}")
        # break
plt.legend()

In [None]:
t.detach()

In [None]:
(t.abs() > 0.9)

In [None]:
(t.abs() > 0.9).float().mean()

In [None]:
layer.__class__?

In [None]:
layer.__class__.__dict__

In [None]:
layer.__class__.__name__

### Visualize Parameters: Grad to Data Ratio

In [None]:
# plt.close()

for cnt, p in enumerate(parameters):
    if p.dim() == 2:
        t = p.grad.mean() / p.data.mean()
        name = layer.__class__.__name__
        print(f"layer {cnt}; name: {name}; shape: {p.shape}; mean {p.mean():0.4f}; std: {p.std():0.4f}; grad:data ratio: {p.grad.mean() / p.data.mean():0.2f}%")
        # hy, hx = torch.histogram(t, density=True)
        # plt.hist(t.detach())
        # plt.title("Visualize Tanh Grads")
        # plt.plot(hx[:-1].detach(), hy.detach(), label=f"layer {cnt}")
        # break

# plt.grid(True)
# plt.legend()


In [None]:
len(parameters)

### This is does not work!

In [None]:
plt.figure(figsize=(20, 4))
legends = []
for i, p in enumerate(parameters):
    if p.dim == 2:
        # plt.plot([ud[j][i] for j in range(len(ud))])
        plt.plot(ud[i])
        
    plt.plot([0, len(ud)], [-3, 3], "k")
plt.show()

### 202404009072500--??

In [None]:
DEBUG = False
# -------------------------------------------------
class Linear:
    def __init__(self, fan_in, fan_out, bias=True):
        self.weights = torch.randn((fan_in, fan_out)) / (fan_in ** 0.5) ## kaiming init
        self.bias = torch.randn(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weights
        if (self.bias is not None):
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weights] + ([] if (self.bias is None) else [self.bias])

# -------------------------------------------------
class BatchNorm1d:
    def __init__(self, dim, eps=10**(-5), momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True

        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        if x.ndim == 2:
            dim = 0
        elif x.ndim == 3:
            dim = (0, 1)
        
        if self.training:
            xmean = x.mean(dim=dim, keepdim=True)
            xvar = x.var(dim=dim, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var

        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta

        if DEBUG:
            zz = torch.sqrt(xvar) #  + self.eps)
            
            print(f"{x.shape = }")
            print(f"{xmean.shape = }")
            print(f"{xvar.shape = }")
            print(f"{xhat.shape = }")
            print(f"{self.gamma.shape = }")
            print(f"{self.beta.shape = }")
            print(f"{self.out.shape = }")
            print(f"{zz.shape = }")
            
            print(f"{self.eps = }")
            print(f"{x = }")
            print(f"{xmean = }")
            print(f"{xvar = }")
            print(f"{xhat = }")
            print(f"{zz = }")
            print(f"{self.gamma = }")
            print(f"{self.beta = }")
            print(f"{self.out = }")

        # update buffers: 
        if self.training:
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * xmean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * xvar

        return self.out

    def parameters(self):
        return [self.beta, self.gamma]


# -------------------------------------------------
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
        
    def parameters(self):
        return []

# -------------------------------------------------
class Embedding:
    # def __init__(self, vocab_size, n_embed):
    def __init__(self, num_embeddings, embedding_dim):
        self.weight = torch.randn((num_embeddings, embedding_dim))
        
    def __call__(self, indexes):
        self.out = self.weight[indexes]
        return self.out
        
    def parameters(self):
        return [self.weight]

# -------------------------------------------------
class Flatten:
    def __call__(self, x):
        self.out = x.view(x.shape[0], -1)
        return self.out
        
    def parameters(self):
        return []


# -------------------------------------------------
class FlattenConsequtive:
    def __init__(self, n):
        self.n = n
        
    def __call__(self, x):
        B, T, C = x.shape
        x = x.view(B, T // self.n, C * self.n)
        if x.shape[1] == 1:
            x = x.squeeze(dim=1)
        self.out = x
        return self.out
        
    def parameters(self):
        return []


# -------------------------------------------------
class Sequential:
    def __init__(self, layers):
        self.layers = layers
        
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out
    
    def parameters(self):
        # return parameters
        return [p for layer in self.layers for p in layer.parameters()]
        

In [None]:
torch.manual_seed(42)

### define a network as a list of layers:

In [None]:
# MLP revisted
n_embed = 10 # the dimensionality of the char. embedding 
n_hidden = 200 # the number of neurons in the hidden layers

In [None]:
C = torch.randn((vocab_size, n_embed))

In [None]:
C.shape

In [None]:
layers = [
    Linear(fan_in=num_blocks * n_embed, fan_out=n_hidden, bias=True),
    BatchNorm1d(dim=n_hidden),
    Tanh(),
    Linear(fan_in=n_hidden, fan_out=vocab_size, bias=True),    
]

In [None]:
layers

In [None]:
# make the last layer less confident
with torch.no_grad():
    layers[-1].weights *= 0.1

In [None]:
# parameters = [C] + [p for p in layer for layer in layers]
# parameters = [C] + [p for p in layer.paparameters() for layer in layers]

parameters = [C] + [p for layer in layers for p in layer.parameters()]

for p in parameters:
    p.requires_grad = True

In [None]:
len(parameters)

In [None]:
[p.nelement() for p in parameters]

In [None]:
sum((p.nelement() for p in parameters))

In [None]:
layers

In [None]:
C.nelement

In [None]:
layers[0]

In [None]:
layers[0].weights.nelement(), layers[0].bias.nelement()

In [None]:
layers[1]

In [None]:
layers[1].parameters()

In [None]:
layers[1].gamma.nelement(), layers[1].beta.nelement()

In [None]:
layers[2]

In [None]:
layers[2].parameters()

In [None]:
layers[3]

In [None]:
layers[3].weights

In [None]:
layers[3].weights.nelement(), layers[3].bias.nelement()

### Training: 

In [None]:
X_train.size(), Y_train.size()

In [None]:
len_data = X_train.shape[0]
print(len_data)

In [None]:
max_steps = 50_000
batch_size = 32
lossi = []


for i in range(max_steps):

    # mini-batch construct
    ix = torch.randperm(len_data)[:batch_size]
    # print(f"{ix = }")
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass
    emb = C[X_batch]
    emb_cat = emb.view(emb.shape[0], -1)
    if DEBUG:
        print(f"{emb.shape = }")
        print(f"{emb = }")
        print(f"{emb_cat.shape = }")
        
    x = emb_cat
    for layer in layers:
        x = layer(x)
        if DEBUG:
            print("-" * 80)
            print(f"{layer = }")
            print(f"{x.sxhape = }")
            print(f"{x = }")
            
    loss = F.cross_entropy(x, Yb)

    
    # backward pass
    for p in parameters:
        p.grad = None
    
    loss.backward()


    # update rule: 
    lr = 0.1 if (i < max_steps * 0.5) else 0.01
    for p in parameters:
        p.data += - lr * p.grad

    # stats
    if (i + 1) % (max_steps // 50) == 0:
        print(f"{i + 1:7d} / {max_steps:7d}; {loss.item():.4f}")
        
    # store loss
    lossi.append(loss.detach().item())

In [None]:
plt.plot(lossi)
plt.grid(True)

In [None]:
plt.plot(
    torch.FloatTensor(lossi).view(-1, 1000).mean(dim=1)
)
plt.grid(True)

### A second way after adding layers `Embedding` and `Flatten`

In [None]:
layers = [
    Embedding(num_embeddings=vocab_size, embedding_dim=n_embed),
    Flatten(),
    Linear(fan_in=num_blocks * n_embed, fan_out=n_hidden, bias=True),
    BatchNorm1d(dim=n_hidden),
    Tanh(),
    Linear(fan_in=n_hidden, fan_out=vocab_size, bias=True),    
]

In [None]:
parameters = [p for layer in layers for p in layer.parameters()]
for p in parameters:
    p.requires_grad = True

In [None]:
max_steps = 50_000
batch_size = 32
lossi = [] 


for i in range(max_steps):

    # mini-batch construct
    ix = torch.randperm(len_data)[:batch_size]
    # print(f"{ix = }")
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass
    x = Xb
    for layer in layers:
        x = layer(x)    
    loss = F.cross_entropy(x, Yb)
 
    # backward pass
    for p in parameters:
        p.grad = None
    
    loss.backward()


    # update rule: 
    lr = 0.1 if (i < max_steps * 0.5) else 0.01
    for p in parameters:
        p.data += - lr * p.grad

    # stats
    if (i + 1) % (max_steps // 50) == 0:
        print(f"{i + 1:7d} / {max_steps:7d}; {loss.item():.4f}")
        
    # store loss
    lossi.append(loss.detach().item())

In [None]:
plt.plot(lossi)
plt.grid(True)

In [None]:
plt.plot(
    torch.FloatTensor(lossi).view(-1, 1000).mean(dim=1)
)
plt.grid(True)

### A third way with torch-countainers e.g. `Sequential`

In [None]:
max_steps = 50_000
batch_size = 32
lossi = []

In [None]:
model = Sequential(
        layers = [
            Embedding(num_embeddings=vocab_size, embedding_dim=n_embed),
            Flatten(),
            Linear(fan_in=num_blocks * n_embed, fan_out=n_hidden, bias=True),
            BatchNorm1d(dim=n_hidden),
            Tanh(),
            Linear(fan_in=n_hidden, fan_out=vocab_size, bias=True),    
        ]
)

parameters = model.parameters()

for p in parameters:
    p.requires_grad = True

In [None]:
# testing: 
u = [p for p in model.layers[-1].weights][0][0]
u.requires_grad

In [None]:
model.layers

In [None]:
for i in range(max_steps):

    # mini-batch construct
    ix = torch.randperm(len_data)[:batch_size]
    # print(f"{ix = }")
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass
    logits = model(Xb)    
    loss = F.cross_entropy(logits, Yb)

    # backward pass
    for p in parameters:
        p.grad = None

    loss.backward()

    # update rule: 
    lr = 0.1 if (i < max_steps * 0.5) else 0.01
    for p in parameters:
        p.data += - lr * p.grad

    # stats
    if (i + 1) % (max_steps // 50) == 0:
        print(f"{i + 1:7d} / {max_steps:7d}; {loss.item():.4f}")
        
    # store loss
    lossi.append(loss.detach().item())


In [None]:
plt.plot(lossi)
plt.grid(True)

In [None]:
plt.plot(
    torch.FloatTensor(lossi[:max_steps]).view(-1, 1000).mean(dim=1)
)
plt.grid(True)

### evaluate the model

In [None]:
for layer in model.layers:
    layer.training = False

In [None]:
@torch.no_grad()
def split_eval(split: str, model):

    _data_map = {
        "train": (X_train, Y_train),
        "val": (X_val, Y_val),
        "test": (X_test, Y_test),
    }


    x, y = _data_map[split]

    logits = model(x)
    loss = F.cross_entropy(logits, y)

    return loss

In [None]:
for split in ["train", "val", "test"]:
    print(f"{split}: {split_eval(split, model)}")

### sample from the model

In [None]:
for i in range(20):
    out = []
    
    context = [0] * num_blocks
    while True:
        # x = torch.tensor(context).reshape(1, 3)
        x = torch.tensor([context])
        logits = model(x)
        probs = F.softmax(logits, dim=1)
        
        ix = torch.multinomial(probs, num_samples=1, generator=rg)
        ch = itos[ix.item()]
        # print(ch)  
        context = context[1:] + [ix]
        out.append(ix)
    
        if ix == 0:
            break
    
    print(i, "".join(itos[i.item()] for i in out))
    

### `FlattenConsequtive` 

In [None]:
# del model 
# del model_new

In [None]:
NUM_BLOCKS = 8
num_blocks = NUM_BLOCKS
X, Y = get_data(words, num_blocks=num_blocks)
X_train, X_val, X_test, Y_train, Y_val, Y_test = split_data(X, Y)
print(X_train.shape, Y_train.shape)

In [None]:
Xb[:1].shape

In [None]:
torch.tensor(context).reshape(1, 3)

In [None]:
torch.tensor([context]).shape

In [None]:
e = torch.randn((4, 8, 10)) # >> (4, 4, 20)

In [None]:
e.view(-1, e.shape[-2] // 2, e.shape[-1] * 2).shape

In [None]:
n_embed, n_hidden

In [None]:
del model 

num_var = 2

model = Sequential(
        layers = [
            Embedding(num_embeddings=vocab_size, embedding_dim=n_embed),
            
            FlattenConsequtive(num_var),
            Linear(fan_in=num_var * n_embed, fan_out=n_hidden, bias=True),
            BatchNorm1d(dim=n_hidden),
            Tanh(),

            FlattenConsequtive(num_var),
            Linear(fan_in=num_var * n_hidden, fan_out=n_hidden, bias=True),
            BatchNorm1d(dim=n_hidden),
            Tanh(),

            
            FlattenConsequtive(num_var),
            Linear(fan_in=num_var * n_hidden, fan_out=n_hidden, bias=True),
            BatchNorm1d(dim=n_hidden),
            Tanh(),

            
            Linear(fan_in=n_hidden, fan_out=vocab_size, bias=True),    
        ]
)

parameters = model.parameters()
print("number of params:", sum([p.nelement() for p in parameters]))
for p in parameters:
    p.requires_grad = True

In [None]:
# mini-batch construct
ix = torch.randperm(len_data)[:batch_size]
# print(f"{ix = }")
Xb, Yb = X_train[ix], Y_train[ix]
print(f"{Xb.shape = }")
print(f"{Yb.shape = }")
pred  = model(Xb)
print(f"{pred.shape = }")

In [None]:
for layer in model.layers:
    print(layer.__class__.__name__, ":", tuple(layer.out.shape))

In [None]:
max_steps = 50_000
batch_size = 32
lossi = []

for i in range(max_steps):
    
    # mini-batch construct
    ix = torch.randperm(len_data)[:batch_size]
    # print(f"{ix = }")
    Xb, Yb = X_train[ix], Y_train[ix]

    # forward pass
    logits = model(Xb)    
    loss = F.cross_entropy(logits, Yb)

    # backward pass
    for p in parameters:
        p.grad = None

    loss.backward()

    # update rule: 
    lr = 0.1 if (i < max_steps * 0.5) else 0.01
    for p in parameters:
        p.data += - lr * p.grad

    # stats
    if (i + 1) % (max_steps // 50) == 0:
        print(f"{i + 1:7d} / {max_steps:7d}; {loss.item():.4f}")
        
    # store loss
    lossi.append(loss.detach().item())


In [None]:
plt.plot(lossi)
plt.grid(True)

In [None]:
plt.plot(
    torch.FloatTensor(lossi[:max_steps]).view(-1, 1000).mean(dim=1)
)
plt.grid(True)

In [None]:
for split in ["train", "val", "test"]:
    print(f"{split}: {split_eval(split, model)}")