In [1]:
import numpy as np

def softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    The per-slice maximum is subtracted before exponentiating so that large
    inputs do not overflow; the result is unchanged mathematically.

    Args:
        x: Array-like of scores.
        axis: Axis along which the outputs sum to one.

    Returns:
        Array with the same shape as ``x``.
    """
    shifted = x - np.max(x, axis=axis, keepdims=True)
    weights = np.exp(shifted)
    normaliser = np.sum(weights, axis=axis, keepdims=True)
    return weights / normaliser

def gelu(x):
    """Apply the tanh approximation of the GELU activation element-wise.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``.
    """
    scale = np.sqrt(2/np.pi)
    inner = scale * (x + 0.044715 *(x**3))
    return 0.5 * x * (1.0 + np.tanh(inner))


In [2]:
class TokenEmbedding:
    """Lookup table that maps integer token ids to ``d_model``-sized vectors."""

    def __init__(self, vocab_size, d_model, seed=42):
        """Draw a ``(vocab_size, d_model)`` table from a seeded generator.

        Entries are standard-normal samples divided by ``sqrt(d_model)``.
        """
        generator = np.random.default_rng(seed)
        table = generator.standard_normal((vocab_size, d_model))
        self.E = table / np.sqrt(d_model)

    def forward(self, x):
        """Index the table with ``x``; the result has shape ``x.shape + (d_model,)``
        when ``x`` is an integer array."""
        embedding_table = self.E
        return embedding_table[x]

In [3]:
class PositionalEncoding:
    """Table of per-position vectors, either sinusoidal or randomly initialised.

    Attributes:
        mode: ``'sinusoidal'`` or ``'learned'``.
        max_len: Number of positions stored in the table.
        d_model: Width of each position vector.
        P: ``(max_len, d_model)`` array holding the table.
    """

    def __init__(self, max_len, d_model, mode='sinusoidal', seed=123):
        """Build the position table.

        Args:
            max_len: Number of positions to precompute.
            d_model: Width of each position vector.
            mode: ``'sinusoidal'`` for the fixed sin/cos table, ``'learned'``
                for a seeded random table scaled by ``1/sqrt(d_model)``.
            seed: Seed for the random table; only read in ``'learned'`` mode.

        Raises:
            ValueError: If ``mode`` is neither ``'sinusoidal'`` nor ``'learned'``.
        """
        self.mode = mode
        self.max_len = max_len
        self.d_model = d_model
        if mode == 'sinusoidal':
            self.P = self._get_sinusoidal_encoding(max_len, d_model)
        elif mode == 'learned':
            rng = np.random.default_rng(seed)
            self.P = rng.standard_normal((max_len, d_model)) / np.sqrt(d_model)
        else:
            raise ValueError("mode must be 'sinusoidal' or 'learned'")

    def _get_sinusoidal_encoding(self, max_len, d_model):
        """Return a float32 ``(max_len, d_model)`` table of sin/cos values.

        Column ``i`` uses the rate ``1 / 10000 ** (2 * (i // 2) / d_model)``;
        even columns hold the sine and odd columns the cosine of
        ``position * rate``.
        """
        pos = np.arange(max_len)[:, None]
        i = np.arange(d_model)[None, :]
        # i // 2 makes each (even, odd) column pair share one frequency.
        rates = 1 / np.power(10000, (2*(i//2)) / d_model)
        angles = pos * rates
        P = np.zeros((max_len, d_model), dtype=np.float32)
        P[:, 0::2] = np.sin(angles[:, 0::2])
        P[:, 1::2] = np.cos(angles[:, 1::2])
        return P

    def forward(self, x):
        """Return the first ``T`` position vectors for a ``(B, T, d)`` input.

        Only the shape of ``x`` is read. The result has shape ``(1, T, d_model)``
        so it broadcasts over the batch when added to ``x``.

        Raises:
            AssertionError: If the last dimension of ``x`` is not ``d_model``.
            ValueError: If ``T`` exceeds ``max_len``.
        """
        _, T, d = x.shape
        assert d == self.d_model
        # Without this check the slice below would silently return fewer than
        # T rows (and, for max_len == 1, broadcast one row over every position).
        if T > self.max_len:
            raise ValueError(
                f"sequence length {T} exceeds max_len {self.max_len}"
            )
        return self.P[:T][None, :, :]

In [4]:
class LayerNorm:
    """Normalise the last axis to zero mean / unit variance, then scale and shift."""

    def __init__(self, d_model, eps=1e-5):
        """Create unit scale (``gamma``) and zero shift (``beta``) of width ``d_model``.

        ``eps`` is added to the variance before the square root.
        """
        self.eps = eps
        self.gamma = np.ones(d_model, dtype=np.float32)
        self.beta = np.zeros(d_model, dtype=np.float32)

    def forward(self, x):
        """Return ``gamma * (x - mean) / sqrt(var + eps) + beta`` over the last axis.

        The variance is the biased (population) variance of the last axis.
        """
        mu = x.mean(axis=-1, keepdims=True)
        centered = x - mu
        variance = np.square(centered).mean(axis=-1, keepdims=True)
        normalised = centered / np.sqrt(variance + self.eps)
        return self.gamma * normalised + self.beta

In [5]:
def make_causal_mask(T):
    """Build an additive causal mask of shape ``(1, 1, T, T)``.

    Entries strictly above the diagonal (column > row) are ``-1e9``; all
    others are ``0.0``. Adding the mask to attention scores before softmax
    pushes the weight of later positions to (numerically) zero.
    """
    future = np.triu(np.ones((T, T), dtype=bool), k=1)
    additive = np.where(future, -1e9, 0.0)
    return additive[None, None, :, :]

In [6]:
class ScaledDotProductAttention:
    """Attention weights from scaled query/key dot products, applied to values."""

    def forward(self, Q, K, V, mask=None):
        """Compute ``softmax(Q K^T / sqrt(d_head) + mask) V``.

        Args:
            Q, K, V: 4-D arrays; the last two axes of ``K`` are swapped before
                the product, and the last axis of ``Q`` gives ``d_head``.
            mask: Optional additive mask that broadcasts onto the scores.

        Returns:
            Tuple ``(out, A)`` of the attended values and the attention weights.
        """
        head_dim = Q.shape[-1]
        keys_t = np.transpose(K, (0, 1, 3, 2))
        scores = (Q @ keys_t) / np.sqrt(head_dim)
        if mask is not None:
            # In-place add: keeps the dtype and shape of `scores`.
            np.add(scores, mask, out=scores)
        weights = softmax(scores, axis=-1)
        attended = weights @ V
        return attended, weights

In [7]:
class MultiHeadSelfAttention:
    """Multi-head self-attention with square Q/K/V/output projections.

    Attributes:
        d_model: Model width.
        num_heads: Number of attention heads.
        d_head: Width of each head (``d_model // num_heads``).
        W_Q, W_K, W_V, W_O: ``(d_model, d_model)`` projection matrices.
        attention: The ``ScaledDotProductAttention`` instance applied per head.
    """

    def __init__(self, d_model, num_heads, seed=7):
        """Initialise the four projections from one seeded generator.

        Each matrix holds standard-normal samples divided by ``sqrt(d_model)``.

        Raises:
            AssertionError: If ``d_model`` is not divisible by ``num_heads``.
        """
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_head = d_model // num_heads
        rng = np.random.default_rng(seed)
        # Draw order (Q, K, V, O) determines the values produced by a seed.
        self.W_Q = rng.standard_normal((d_model, d_model)) / np.sqrt(d_model)
        self.W_K = rng.standard_normal((d_model, d_model)) / np.sqrt(d_model)
        self.W_V = rng.standard_normal((d_model, d_model)) / np.sqrt(d_model)
        self.W_O = rng.standard_normal((d_model, d_model)) / np.sqrt(d_model)
        self.attention = ScaledDotProductAttention()

    def _split(self, x):
        """Reshape ``(B, T, d_model)`` into ``(B, num_heads, T, d_head)``."""
        B, T, _ = x.shape
        return x.reshape(B, T, self.num_heads, self.d_head).transpose(0, 2, 1, 3)

    def _merge(self, x):
        """Inverse of ``_split``: ``(B, H, T, d_head)`` back to ``(B, T, H * d_head)``."""
        B, H, T, dh = x.shape
        return x.transpose(0, 2, 1, 3).reshape(B, T, H * dh)

    def forward(self, x, mask=None):
        """Run self-attention over ``x``.

        Args:
            x: ``(B, T, d_model)`` input.
            mask: Optional additive mask forwarded to the attention call.
                Defaults to ``None`` (no masking), matching the default of
                ``ScaledDotProductAttention.forward``.

        Returns:
            Tuple ``(out, A)``: the ``(B, T, d_model)`` output after the output
            projection, and the attention weights returned by the attention
            call.
        """
        Q = self._split(x @ self.W_Q)
        K = self._split(x @ self.W_K)
        V = self._split(x @ self.W_V)
        out, A = self.attention.forward(Q, K, V, mask)
        out = self._merge(out) @ self.W_O
        return out, A

In [8]:
class FeedForward:
    """Two-layer position-wise feed-forward network with GELU or ReLU.

    Attributes:
        W1, b1: First affine layer, ``(d_model, d_ff)`` and ``(d_ff,)``.
        W2, b2: Second affine layer, ``(d_ff, d_model)`` and ``(d_model,)``.
        activation: ``"gelu"`` or ``"relu"``.
    """

    def __init__(self, d_model, d_ff, activation="gelu", seed=99):
        """Initialise weights from a seeded generator and zero biases.

        ``W1`` is scaled by ``1/sqrt(d_model)`` and ``W2`` by ``1/sqrt(d_ff)``.

        Raises:
            ValueError: If ``activation`` is neither ``"gelu"`` nor ``"relu"``.
        """
        # Reject unknown names up front; previously any other string (for
        # example a mis-cased "GELU") silently selected ReLU.
        if activation not in ("gelu", "relu"):
            raise ValueError("activation must be 'gelu' or 'relu'")
        rng = np.random.default_rng(seed)
        self.W1 = rng.standard_normal((d_model, d_ff)) / np.sqrt(d_model)
        self.b1 = np.zeros((d_ff,), dtype=np.float32)
        self.W2 = rng.standard_normal((d_ff, d_model)) / np.sqrt(d_ff)
        self.b2 = np.zeros((d_model,), dtype=np.float32)
        self.activation = activation

    def forward(self, x):
        """Return ``act(x @ W1 + b1) @ W2 + b2``.

        Raises:
            ValueError: If ``self.activation`` was changed to an unsupported
                value after construction.
        """
        h = x @ self.W1 + self.b1
        if self.activation == "gelu":
            h = gelu(h)
        elif self.activation == "relu":
            h = np.maximum(0.0, h)
        else:
            raise ValueError("activation must be 'gelu' or 'relu'")
        return h @ self.W2 + self.b2

In [9]:
class DecoderBlock:
    """Pre-norm transformer block: attention and feed-forward, each with a residual."""

    def __init__(self,d_model,num_heads,d_ff, seed=100):
        """Create the two layer norms, the attention layer (seeded ``seed+1``)
        and the GELU feed-forward layer (seeded ``seed+2``)."""
        self.ln1 = LayerNorm(d_model)
        self.attention = MultiHeadSelfAttention(d_model, num_heads, seed=seed+1)
        self.ln2 = LayerNorm(d_model)
        self.ffn = FeedForward(d_model, d_ff, activation="gelu", seed=seed+2)

    def forward(self, x, mask):
        """Apply ``x + attn(ln1(x))`` followed by ``+ ffn(ln2(.))``.

        Returns:
            Tuple of the block output and the attention weights produced by
            the attention layer.
        """
        attn_out, attn_weights = self.attention.forward(self.ln1.forward(x), mask)
        after_attn = x + attn_out
        ffn_out = self.ffn.forward(self.ln2.forward(after_attn))
        return after_attn + ffn_out, attn_weights

In [10]:
class GPTDecoder:
    """Decoder-only transformer with output weights tied to the token embedding.

    Attributes:
        vocab_size: Size of the token vocabulary.
        max_len: Longest sequence length accepted by ``forward``.
        tok_emb: Token embedding table.
        pos_emb: Positional encoding table.
        blocks: List of ``DecoderBlock`` layers applied in order.
        ln_finals: Final layer norm applied before the output projection.
    """

    def __init__(self, vocab_size, max_len, d_model=128, num_heads=4,
                 d_ff=512, num_layers=2, pos_encoding="sinusoidal", seed=2024):
        """Build embeddings, ``num_layers`` decoder blocks and the final norm.

        Seeds are derived from ``seed`` so that every randomly initialised
        component draws from a distinct stream: the token table uses ``seed``,
        block ``i`` is built with ``seed + 10*i`` (its attention and
        feed-forward layers then use ``+1`` and ``+2``), and the positional
        table uses ``seed + 3``.
        """
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.tok_emb = TokenEmbedding(vocab_size, d_model, seed=seed)
        # seed+1 would coincide with block 0's attention seed, making a
        # 'learned' position table share its random stream with W_Q. seed+3
        # is unused elsewhere. Sinusoidal mode ignores the seed entirely.
        self.pos_emb = PositionalEncoding(max_len, d_model, mode=pos_encoding, seed=seed+3)
        self.blocks = [DecoderBlock(d_model, num_heads, d_ff, seed=seed+10*i) for i in range(num_layers)]
        self.ln_finals = LayerNorm(d_model)

    def forward(self, x_tokens, return_attentions=False):
        """Run the model on a batch of token ids.

        Args:
            x_tokens: Integer array of shape ``(B, T)`` with ``1 <= T <= max_len``.
            return_attentions: When true, also return each block's attention
                weights.

        Returns:
            ``(logits, next_probs)`` or ``(logits, next_probs, attn_list)``.
            ``logits`` is ``(B, T, vocab_size)``; ``next_probs`` is the softmax
            of the logits at the last position, ``(B, vocab_size)``;
            ``attn_list`` holds one attention array per block.

        Raises:
            ValueError: If the sequence is empty.
            AssertionError: If ``T`` exceeds ``max_len``.
        """
        _, T = x_tokens.shape
        if T == 0:
            # The last-position slice and the softmax reduction below are
            # undefined for an empty sequence; fail with a clear message.
            raise ValueError("x_tokens must contain at least one position")
        assert T <= self.max_len, f"sequence length {T} exceeds max_len {self.max_len}"

        x = self.tok_emb.forward(x_tokens)
        x = x + self.pos_emb.forward(x)

        mask = make_causal_mask(T)
        attn_list = []

        for block in self.blocks:
            x, A = block.forward(x, mask)
            if return_attentions:
                attn_list.append(A)

        h = self.ln_finals.forward(x)

        # Weight tying: project back to the vocabulary with the embedding table.
        E = self.tok_emb.E
        logits = h @ E.T

        next_probs = softmax(logits[:, -1, :], axis=-1)

        return (logits, next_probs, attn_list) if return_attentions else (logits, next_probs)


In [None]:
class SimpleCharTokenizer:
    """Character-level tokenizer over the distinct characters of ``vocab``.

    Ids are assigned by the sorted order of the distinct characters.
    """

    def __init__(self, vocab):
        """Build ``chars`` (sorted distinct characters) and both lookup dicts."""
        self.chars = sorted(set(vocab))
        self.stoi = dict(zip(self.chars, range(len(self.chars))))
        self.itos = dict(enumerate(self.chars))

    def encode(self, text):
        """Return the int32 id array for ``text``; unknown characters raise ``KeyError``."""
        ids = list(map(self.stoi.__getitem__, text))
        return np.array(ids, dtype=np.int32)

    def decode(self, indices):
        """Return the string formed by looking up each id in ``indices``."""
        return ''.join(map(self.itos.__getitem__, indices))

    def vocab_size(self):
        """Return the number of distinct characters."""
        return len(self.chars)
    
# =====DEMO======
# Smoke test of the full pipeline with randomly initialised (untrained) weights:
# tokenize two strings, run one forward pass, and print shape / sanity checks.
if __name__ == "__main__":
    # 26 lowercase letters plus space, period and comma.
    vocab = "abcdefghijklmnopqrstuvwxyz .,"
    tok = SimpleCharTokenizer(vocab)
    V = tok.vocab_size()  

    model = GPTDecoder(
        vocab_size=V, max_len=32,
        d_model=64, num_heads=4, d_ff=256, num_layers=2,
        pos_encoding="sinusoidal", seed=1234
    )

    def pad(ids, T):
        """Return an int32 array of length T: ids truncated to T, then zero-filled."""
        # NOTE: the fill value 0 is a real token id (the first character of the
        # tokenizer's sorted vocabulary -- the space for this vocab), not a
        # dedicated padding symbol.
        out = np.zeros((T,), dtype=np.int32)
        out[:min(T,len(ids))] = ids[:T]
        return out

    # "hello world." has 12 characters and is padded up to T; "transformers, yay"
    # has 17 characters and is truncated to T.
    texts = ["hello world.", "transformers, yay"]
    T = 16
    x_tokens = np.stack([pad(tok.encode(t), T) for t in texts])  # [B,T]

    # next_probs is the distribution at the last of the T positions, so for the
    # padded sample it follows the padding tokens rather than the text itself.
    logits, next_probs, attn_list = model.forward(x_tokens, return_attentions=True)  

    print("Logits shape:", logits.shape)         # [B,T,V]
    print("Next-token probs shape:", next_probs.shape)  # [B,V]
    print("Sum probs per sample:", next_probs.sum(axis=-1))  # ~ [1.0, 1.0]


    # Attention weights of the first block, first sample, first head: [T,T].
    # With the causal mask applied, everything above the diagonal should be ~0.
    aw = attn_list[0][0, 0]  
    print("Upper-triangle attention sum ~ 0:", float(np.triu(aw, k=1).sum()))


    # Report the topk most probable next characters for each sample.
    topk = 5
    for b in range(x_tokens.shape[0]):
        # argsort is ascending: keep the last topk indices and reverse them so
        # the most probable token comes first.
        idxs = next_probs[b].argsort()[-topk:][::-1]
        print(f"Sample {b} top-{topk} next tokens:",
              [(tok.itos[i], float(next_probs[b, i])) for i in idxs])

Logits shape: (2, 16, 29)
Next-token probs shape: (2, 29)
Sum probs per sample: [1. 1.]
Upper-triangle attention sum ~ 0: 0.0
Sample 0 top-5 next tokens: [('e', 0.10356238858061897), ('g', 0.08674164074088994), ('f', 0.08216100729846752), ('w', 0.07069346943813608), ('h', 0.06532337093262273)]
Sample 1 top-5 next tokens: [('e', 0.09836639580453611), ('a', 0.09055941651552268), ('g', 0.0889718299751409), ('f', 0.06864797769441527), ('w', 0.06552950037292793)]
