In [2]:
import random as r

from engine import Value
from modules import *
from ops import *
from gpt import *

In [3]:
# Toy hyperparameters shared by the cells below.
batch_size = 2   # outermost dimension of the demo tensors (number of sequences)
vocab_len = 10   # length of each per-token vector; token ids are drawn from 0..vocab_len-1
model_dim = 8    # not used in the visible cells — presumably the embedding width; note 8 == num_heads * head_dim
max_seq_len = 5  # not used in the visible cells — presumably the longest sequence the model accepts (TODO confirm)
seq_len = 3      # middle dimension of the demo tensors (tokens per sequence)
num_heads = 2    # not used in the visible cells — presumably the attention head count (TODO confirm)
head_dim = 4     # not used in the visible cells — presumably the per-head width (TODO confirm)
mlp_mult = 4     # not used in the visible cells — presumably the MLP hidden-size multiplier (TODO confirm)

In [64]:
class CrossEntropyLoss(Module):
    '''
    Summed cross-entropy between predicted token distributions and integer targets.

    NOTE: despite the argument name, `logits` must already be probabilities
    (e.g. the output of softmax). This module only takes their log; it does not
    normalise them itself.
    '''

    def __init__(self, vocab_len: int, pad_token: int = None):
        '''
        vocab_len - number of entries in each probability vector (length of each one-hot row)
        pad_token - optional token index whose positions contribute nothing to the loss;
                    None (the default) keeps every position
        '''
        self.vocab_len = vocab_len
        self.pad_token = pad_token

    def __call__(self, logits, targets):
        '''
        inputs: 
        logits - list of lists of lists of shape (batch_size, seq_len, vocab_len) full of Value objects;
                 each innermost vector is expected to be a probability distribution
        targets - list of lists of shape (batch_size, seq_len) full of integers representing token indices

        output: a single Value object representing loss of the model
                (negated sum of the log-probabilities selected by the targets; not averaged)
        '''
        assert isinstance(targets, list) and isinstance(targets[0], list) and isinstance(targets[0][0], int), \
            'targets must be a list of lists of ints'
        assert len(logits) == len(targets) and len(logits[0]) == len(targets[0]), \
            'logits and targets must agree on batch_size and seq_len'
        # prolly should assert that each vec in logits is a valid distribution (sums to 1), but i'm lazy

        one_hots = vector_wise_apply(self._one_hot, targets)
        log_probs = vector_wise_apply(log, logits)
        # the 0/1 mask keeps only the log-probability of the target token at each position
        individual_losses = entry_wise_mult(one_hots, log_probs)

        # sum then multiply by -1
        return -1 * vector_wise_apply(sum, vector_wise_apply(sum, vector_wise_apply(sum, individual_losses)))

    def _one_hot(self, targets_vec):
        '''
        turns list of tokens into list of one-hot vectors with 1's at the index of the given token
        positions equal to self.pad_token become all-zero rows so they are excluded from the loss
        meant to be used with vector_wise_apply
        '''
        assert all(isinstance(t, int) for t in targets_vec)
        # an out-of-range index would silently produce a row of the wrong length
        assert all(t == self.pad_token or 0 <= t < self.vocab_len for t in targets_vec), \
            'target token index out of range for vocab_len'
        # use the instance's vocab_len, not whatever global happens to be defined in the notebook
        return [
            [0] * self.vocab_len if t == self.pad_token
            else [0] * t + [1] + [0] * (self.vocab_len - t - 1)
            for t in targets_vec
        ]

In [72]:
# Random positive Values of shape (batch_size, seq_len, vocab_len), with softmax
# applied through vector_wise_apply so the loss below receives probabilities.
logits = vector_wise_apply(
    softmax,
    [
        [
            [Value(r.uniform(-1, 1)).exp() for _ in range(vocab_len)]
            for _ in range(seq_len)
        ]
        for _ in range(batch_size)
    ],
)
pretty_print_tensor(logits)

celoss = CrossEntropyLoss(vocab_len, pad_token=vocab_len - 1)

# Random integer targets of shape (batch_size, seq_len), each in 0..vocab_len-1.
targets = [
    [r.randint(0, vocab_len - 1) for _ in range(seq_len)]
    for _ in range(batch_size)
]
pretty_print_tensor(targets)

loss = celoss(logits, targets)
print(loss)

[
  [
    [Value(data=0.263, grad=0.000), Value(data=0.117, grad=0.000), Value(data=0.038, grad=0.000), Value(data=0.075, grad=0.000), Value(data=0.072, grad=0.000), Value(data=0.034, grad=0.000), Value(data=0.033, grad=0.000), Value(data=0.053, grad=0.000), Value(data=0.036, grad=0.000), Value(data=0.279, grad=0.000)]
    [Value(data=0.131, grad=0.000), Value(data=0.035, grad=0.000), Value(data=0.055, grad=0.000), Value(data=0.043, grad=0.000), Value(data=0.157, grad=0.000), Value(data=0.281, grad=0.000), Value(data=0.077, grad=0.000), Value(data=0.065, grad=0.000), Value(data=0.035, grad=0.000), Value(data=0.122, grad=0.000)]
    [Value(data=0.042, grad=0.000), Value(data=0.239, grad=0.000), Value(data=0.067, grad=0.000), Value(data=0.085, grad=0.000), Value(data=0.032, grad=0.000), Value(data=0.069, grad=0.000), Value(data=0.045, grad=0.000), Value(data=0.038, grad=0.000), Value(data=0.061, grad=0.000), Value(data=0.320, grad=0.000)]
  ]
  [
    [Value(data=0.067, grad=0.000), Value

In [48]:
# TODO: make the Embedding module also do unembedding with shared weights?
# This wouldn't be exactly faithful to the PyTorch implementation, but I'd like to use gradient accumulation and save parameters.