In [2]:
import random as r

from engine import Value
from modules import *
from ops import *
from gpt import *

In [3]:
class Config:
    """Hyperparameters for the toy GPT model and its demo batch.

    Every argument is optional; ``Config()`` reproduces the values that used to
    be hard-coded, so existing callers are unaffected.

    Args:
        batch_size: number of sequences in a batch.
        vocab_len: vocabulary size.
        model_dim: width of the embeddings.
        max_seq_len: longest sequence length the model accepts.
        seq_len: stored for consumers of the config (not used by this class).
        num_heads: number of attention heads.
        mlp_mult: multiplier handed to the residual layers.
        dropout_rate: dropout rate applied while training.
        num_layers: number of residual layers.

    Raises:
        ValueError: if ``num_heads`` is not positive or does not divide
            ``model_dim`` evenly.
    """

    def __init__(self, batch_size=2, vocab_len=10, model_dim=8, max_seq_len=5,
                 seq_len=3, num_heads=2, mlp_mult=4, dropout_rate=0.1,
                 num_layers=2):
        # head_dim is derived with integer division; refuse combinations where
        # that would silently drop part of model_dim.
        if num_heads <= 0 or model_dim % num_heads != 0:
            raise ValueError(
                f"model_dim ({model_dim}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )

        self.batch_size = batch_size
        self.vocab_len = vocab_len
        self.model_dim = model_dim
        self.max_seq_len = max_seq_len
        self.seq_len = seq_len
        self.num_heads = num_heads
        self.head_dim = self.model_dim // self.num_heads  # derived, not configurable
        self.mlp_mult = mlp_mult
        self.dropout_rate = dropout_rate
        self.num_layers = num_layers

In [4]:
class GPT(Module):
    """Language model assembled from the project's ``Module`` building blocks.

    The forward pass adds token and position embeddings, scales the sum by
    ``model_dim ** -0.5``, feeds it through ``num_layers`` ``ResidualLayer``
    instances, applies ``layer_norm`` followed by the output projection, and
    turns the logits into probabilities with ``softmax``.
    """

    def __init__(self, config):
        """Copy the hyperparameters from ``config`` and build the sub-modules.

        Args:
            config: object exposing ``vocab_len``, ``model_dim``,
                ``max_seq_len``, ``seq_len``, ``num_heads``, ``head_dim``,
                ``mlp_mult``, ``dropout_rate`` and ``num_layers``.
        """
        self.vocab_len = config.vocab_len
        self.model_dim = config.model_dim
        self.max_seq_len = config.max_seq_len
        self.seq_len = config.seq_len
        self.num_heads = config.num_heads
        self.head_dim = config.head_dim
        self.mlp_mult = config.mlp_mult
        self.dropout_rate = config.dropout_rate
        self.num_layers = config.num_layers

        self.tok_embeddings = Embedding(self.vocab_len, self.model_dim)
        # Factor applied to the summed token + position embeddings in __call__.
        self.scale = self.model_dim ** -0.5
        self.pos_embeddings = Embedding(self.max_seq_len, self.model_dim)

        # One Mask instance shared by every layer.
        # NOTE(review): presumably a causal attention mask -- confirm in modules.
        self.mask = Mask(self.max_seq_len)

        self.layers = [
            ResidualLayer(self.model_dim, self.num_heads, self.head_dim,
                          self.max_seq_len, self.mlp_mult, self.mask)
            for _ in range(config.num_layers)
        ]

        self.output_proj = Linear(self.model_dim, self.vocab_len)

        # The last vocabulary id is handed to the loss as the padding token.
        self.criterion = CrossEntropyLoss(self.vocab_len, pad_token = self.vocab_len - 1)

    def __call__(self, input_token_ids, target_token_ids = None):
        """Run the model and, when targets are supplied, compute the loss.

        Args:
            input_token_ids: a batch of token-id sequences (list of lists), or a
                single sequence (flat list), which is promoted to a batch of one.
            target_token_ids: optional targets with the same shape as the
                inputs. A falsy value (``None`` or an empty list) selects
                inference: dropout is disabled and no loss is computed.

        Returns:
            ``(probabilities, loss)``; ``loss`` is ``None`` at inference. The
            output stays batched even when a single sequence was passed in.

        Raises:
            AssertionError: if input and target shapes differ, if a training
                sequence is not exactly ``max_seq_len`` long, or if an inference
                sequence is longer than ``max_seq_len``.
        """
        input_shape = get_shape(input_token_ids)
        if len(input_shape) == 1: # if only one sequence is passed in, aka batch_size==1
            input_shape = [1] + input_shape
            # Wrap the ids themselves, not only the shape, so the embedding
            # lookups below see the same batch-of-one layout as `pos`.
            input_token_ids = [input_token_ids]
            # Promote un-batched targets too, so the shape check can succeed.
            if target_token_ids and len(get_shape(target_token_ids)) == 1:
                target_token_ids = [target_token_ids]

        # Truthiness (rather than `is not None`) keeps the historical behaviour
        # of treating an empty target list as inference.
        training = bool(target_token_ids)

        if training:
            assert input_shape == get_shape(target_token_ids), \
                "input and target token ids must have the same shape"
            assert input_shape[1] == self.max_seq_len, \
                "training sequences must be exactly max_seq_len long"
            dropout_rate = self.dropout_rate
        else:
            assert input_shape[1] <= self.max_seq_len, \
                "inference sequences must not exceed max_seq_len"
            dropout_rate = 0.

        x = vector_wise_apply(self.tok_embeddings, input_token_ids)
        # Every sequence in the batch uses positions 0 .. seq_len - 1.
        positions = [list(range(input_shape[1])) for _ in range(input_shape[0])]
        pos = vector_wise_apply(self.pos_embeddings, positions)
        x = entry_wise_add(x, pos)
        x = vector_wise_apply(mult_vec_by_scalar, x, self.scale)

        for layer in self.layers:
            x = layer(x, dropout_rate)

        logits = vector_wise_apply(self.output_proj, vector_wise_apply(layer_norm, x))
        probabilities = vector_wise_apply(softmax, logits)

        loss = self.criterion(probabilities, target_token_ids) if training else None

        return probabilities, loss
            

In [32]:
# Seed the RNG so the random batch below is identical on every
# Restart & Run All, which makes a failure in this cell reproducible.
# NOTE(review): presumably parameter initialisation in `modules` also draws
# from `random`, in which case seeding here fixes the weights too -- confirm.
r.seed(0)

config = Config()
gpt = GPT(config)

# randint is inclusive on both ends, so token ids span 0 .. vocab_len - 1.
# NOTE(review): that range includes vocab_len - 1, which GPT passes to its loss
# as the pad token -- confirm random pad targets are intended.
max_token_id = config.vocab_len - 1
input_token_ids = [
    [r.randint(0, max_token_id) for _ in range(config.max_seq_len)]
    for _ in range(config.batch_size)
]
target_token_ids = [
    [r.randint(0, max_token_id) for _ in range(config.max_seq_len)]
    for _ in range(config.batch_size)
]

# Passing targets selects the training path: dropout is on and a loss is returned.
probabilities, loss = gpt(input_token_ids, target_token_ids)
pretty_tensor_print(probabilities)
print(loss)

OverflowError: math range error

In [6]:
# TODO: make the Embedding module also do unembedding with shared weights?
# That wouldn't be exactly faithful to the PyTorch implementation, but I'd like to use gradient accumulation and save parameters.