In [2]:
import random as r

from engine import Value
from modules import *
from ops import *
from gpt import *

In [3]:
# Toy hyperparameters shared by the cells below (kept tiny so printed tensors stay readable).
batch_size = 2  # outermost dimension of the demo tensors
vocab_len = 10  # length of each per-position vector fed to the loss and of each one-hot row
model_dim = 8  # innermost dimension of the random tensor used to sanity-check sum
max_seq_len = 5  # not used in the visible cells; presumably the model's maximum context length (TODO confirm)
seq_len = 3  # middle dimension of the demo tensors
num_heads = 2  # not used in the visible cells; presumably the attention head count (TODO confirm)
head_dim = 4  # not used in the visible cells; note model_dim == num_heads * head_dim with these values
mlp_mult = 4  # not used in the visible cells; presumably the MLP hidden-width multiplier (TODO confirm)

In [46]:
class CrossEntropyLoss(Module):
    '''
    cross-entropy loss between predicted token distributions and integer target tokens

    vocab_len - length of every per-position vector (and therefore of every one-hot row)
    pad_token - optional token index; positions whose target equals it contribute nothing to the loss
    '''
    def __init__(self, vocab_len: int, pad_token: int = None):
        self.vocab_len = vocab_len
        self.pad_token = pad_token

    def __call__(self, logits, targets):
        '''
        inputs: 
        logits - list of lists of lists of shape (batch_size, seq_len, vocab_len) full of Value objects.
                 despite the name, log() is applied to these directly, so they are expected to already be
                 probabilities (e.g. the output of softmax) rather than raw scores
        targets - list of lists of shape (batch_size, seq_len) full of integers representing token indices

        output: a single Value object representing loss of the model
                (negative log of the entry picked out by each target, summed over every non-pad position)
        '''
        assert isinstance(targets, list) and isinstance(targets[0], list) and isinstance(targets[0][0], int)
        assert len(logits) == len(targets) and len(logits[0]) == len(targets[0])
        # prolly should assert that each vec in logits is a valid distribution (sums to 1)

        # 1 at each target index, 0 everywhere else (all 0's for pad positions)
        one_hots = vector_wise_apply(self._one_hot, targets)
        log_logits = vector_wise_apply(log, logits)
        # the one-hot mask keeps only the log-entry of the target token at each position
        losses = entry_wise_mult(one_hots, log_logits)

        # sum every entry by hand so this doesn't depend on whichever `sum` happens to be
        # bound in the notebook when the loss is called
        total = None
        for batch in losses:
            for position in batch:
                for term in position:
                    total = term if total is None else total + term

        # mult by -1 (wrapped in a Value so only Value * Value is needed from the engine)
        return total * Value(-1.0)

    def _one_hot(self, targets_vec):
        '''
        turns list of tokens into list of one-hot vectors with 1's at the index of the given token
        a token equal to self.pad_token gets an all-zero vector instead, which masks that position out of the loss
        meant to be used with vector_wise_apply
        '''
        assert all(isinstance(t, int) for t in targets_vec)
        # an out-of-range token used to silently produce a row of the wrong length
        assert all(0 <= t < self.vocab_len for t in targets_vec)

        one_hots = []
        for t in targets_vec:
            # use the instance's vocab size rather than whatever global vocab_len is in the notebook
            row = [0] * self.vocab_len
            if t != self.pad_token:  # pad_token=None never matches an int, so nothing is masked
                row[t] = 1
            one_hots.append(row)
        return one_hots

In [52]:
def sum(vec):
    '''
    sums up all values in a vector
    returns a single Value object, so if it's being called by vector_wise_apply that means it removes the last dimension in the process
    an empty vector returns 0 (the additive identity) instead of crashing on vec[0]

    note: this intentionally keeps the name `sum`, which shadows the builtin for every cell run after this one
    '''
    assert isinstance(vec, list)
    if not vec:
        return 0
    # plain numbers are checked first so the Value class is only looked up when actually needed
    assert isinstance(vec[0], (float, int)) or isinstance(vec[0], Value)
    tot = vec[0]
    for x in vec[1:]:
        tot = tot + x
    return tot

In [54]:
# Sanity check for sum + vector_wise_apply: build a (batch_size, seq_len, model_dim) nested list of
# Values (each a uniform draw from [-1, 1] passed through .exp()) and reduce along the last dimension.
x = [[[Value(r.uniform(-1,1)).exp() for _ in range(model_dim)]
      for _ in range(seq_len)]
     for _ in range(batch_size)]
print(get_shape(x))
# applying sum vector-wise should drop the innermost dimension: [2, 3, 8] -> [2, 3]
y = vector_wise_apply(sum, x)
print(get_shape(y))

[2, 3, 8]
[2, 3]


In [50]:
# End-to-end check of CrossEntropyLoss on random data.
# Random Values of shape (batch_size, seq_len, vocab_len), passed through .exp() and then softmax
# vector-wise, so from here on `logits` holds softmax outputs rather than raw scores.
logits = [[[Value(r.uniform(-4,4)).exp() for _ in range(vocab_len)]
      for _ in range(seq_len)]
     for _ in range(batch_size)]
logits = vector_wise_apply(softmax, logits)
pretty_print_tensor(logits)
# the last vocab index is passed as the pad token
celoss = CrossEntropyLoss(vocab_len, pad_token = vocab_len - 1)
# r.randint is inclusive on both ends, so a target can be the pad token itself
targets = [[r.randint(0, vocab_len - 1) for _ in range(seq_len)]
           for _ in range(batch_size)]
pretty_print_tensor(targets)
loss = celoss(logits, targets)
print(loss)
# NOTE(review): the loss is documented as a single Value, not a nested list — confirm pretty_print_tensor accepts a scalar
pretty_print_tensor(loss)

[
  [
    [Value(data=1.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000)]
    [Value(data=0.000, grad=0.000), Value(data=1.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000)]
    [Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=1.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.000, grad=0.000)]
  ]
  [
    [Value(data=0.000, grad=0.000), Value

In [48]:
# TODO: make the Embedding module also do unembedding with shared (tied) weights?
# That wouldn't be exactly faithful to the PyTorch implementation, but I'd like to use gradient accumulation and save parameters.