In [2]:
import random as r
import time

from engine import Value
from modules import *
from ops import *
from gpt import *

# addition
should we do modular addition instead??? problem with that is we'd have to use a huge modulo to have a reasonably sized training dataset right?

In [4]:
max_digits = 2

In [5]:
def generate_data(k: int):
    '''
    creates a string representation of an arithmetic addition operation where the digits are in reverse-order
    ex:
    123 + 45 = 168 would be '321+54=861'
    this reversal is meant to make the task easier on teh transformer
    '''
    max_num = int('9' * k)
    num1 = r.randint(0, max_num)
    num2 = r.randint(0, max_num)
    num3 = num1 + num2
    return f'b{str(num1)[::-1]}+{str(num2)[::-1]}={str(num3)[::-1]}e'

In [6]:
dataset = set()
max_possible_dataset_size = int('9' * max_digits)**2
dataset_size = 2
i = 0
while dataset_size < max_possible_dataset_size:
    i += 1
    dataset_size = 2 ** i
dataset_size = 2 ** (i - 1)
assert dataset_size < max_possible_dataset_size
while len(dataset) < dataset_size:
    dataset.add(generate_data(max_digits))

In [7]:
split_size = int(0.95 * len(dataset))
dataset = list(dataset)
train_dataset = dataset[:split_size]
val_dataset = dataset[split_size:]
print(len(train_dataset))
print(len(val_dataset))

7782
410


In [8]:
tokenizer = {
    '0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9, 
    '+':10, '=':11, 
    'b':12, # beginning of sequence token
    'e':13, # end of sequence token
    'p':14 # padding token
}
v = len(tokenizer)
max_data_len = 4 + max_digits * 3 + 1

def tokenize(equation):
    out = []
    for c in equation:
        out.append(tokenizer[c])
    while len(out) < max_data_len:
        out.append(tokenizer['p'])
    return out

print(train_dataset[0])
tokens = tokenize(train_dataset[0])
print(tokens)

input_tokens, target_tokens = tokens[:-1], tokens[1:]
print(input_tokens, target_tokens)

reverse_tokenizer = {val:tok for tok, val in zip(tokenizer, tokenizer.values())}
print(reverse_tokenizer)

def decode_tokens(tokens):
    char_list = [reverse_tokenizer[t] for t in tokens]
    eos_idx = char_list.index('e')
    chars = "".join(char_list[1:eos_idx])
    num1, temp = chars.split('+')
    num2, num3 = temp.split('=')
    return f'{num1[::-1]}+{num2[::-1]}={num3[::-1]}'
    
output = decode_tokens(tokens)
print(output)

b38+84=131e
[12, 3, 8, 10, 8, 4, 11, 1, 3, 1, 13]
[12, 3, 8, 10, 8, 4, 11, 1, 3, 1] [3, 8, 10, 8, 4, 11, 1, 3, 1, 13]
{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '+', 11: '=', 12: 'b', 13: 'e', 14: 'p'}
83+48=131


In [9]:
batch_size = 16
train_iterations = split_size // batch_size
val_iterations = (dataset_size - split_size) // batch_size
val_frequency = train_iterations // val_iterations
print(train_iterations, val_iterations, val_frequency)

config = {
    'vocab_len':v,
    'model_dim':4,
    'max_seq_len':max_data_len - 1,
    'num_heads':2,
    'head_dim':2,
    'mlp_mult':2,
    'dropout_rate':0.1,
    'num_layers':1
}
model = GPT(config)

eta = 0.01

486 25 19


In [10]:
start_time = time.time()
for i in range(train_iterations):
        
    train_batch_data = train_dataset[i:i + batch_size]
    train_input_toks, train_target_toks = [], []
    for entry in train_batch_data:
        train_toks_seq = tokenize(entry)
        train_input_toks.append(train_toks_seq[:-1])
        train_target_toks.append(train_toks_seq[1:])

    probabilities, train_loss = model(train_input_toks, train_target_toks)
        
    if i % val_frequency == 0:
        val_batch_data = val_dataset[i % val_frequency:(i % val_frequency) + batch_size]
        val_input_toks, val_target_toks = [], []
        for entry in val_batch_data:
            val_toks_seq = tokenize(entry)
            val_input_toks.append(val_toks_seq[:-1])
            val_target_toks.append(val_toks_seq[1:])
            
        probabilities, val_loss = model(val_input_toks, val_target_toks)
        print(f'step {i} train loss: {train_loss.data} val loss: {val_loss.data} time: {time.time() - start_time}')

    ## backward pass
    #set params to 0
    for p in model.parameters():
        p.grad = 0.0
    # clac gradients
    train_loss.backward()
    # performing a step of SGD
    for p in model.parameters():
        p.data -= eta * p.grad

step 0 train loss: Value(data=433.288, grad=0.000) val loss: Value(data=433.288, grad=0.000) time: 2.5095489025115967
step 19 train loss: Value(data=419.274, grad=0.000) val loss: Value(data=423.468, grad=0.000) time: 44.46553683280945
step 38 train loss: Value(data=418.610, grad=0.000) val loss: Value(data=421.179, grad=0.000) time: 89.63981699943542
step 57 train loss: Value(data=415.949, grad=0.000) val loss: Value(data=423.663, grad=0.000) time: 133.78260684013367
step 76 train loss: Value(data=416.133, grad=0.000) val loss: Value(data=420.213, grad=0.000) time: 180.54873394966125
step 95 train loss: Value(data=412.627, grad=0.000) val loss: Value(data=420.394, grad=0.000) time: 226.94694471359253
step 114 train loss: Value(data=419.604, grad=0.000) val loss: Value(data=420.669, grad=0.000) time: 272.7566969394684
step 133 train loss: Value(data=413.515, grad=0.000) val loss: Value(data=418.737, grad=0.000) time: 318.67963886260986
step 152 train loss: Value(data=415.829, grad=0.00

ValueError: math domain error