In [2]:
import random as r

from engine import Value
from modules import *
from ops import *
from gpt import *

# addition
should we do modular addition instead??? problem with that is we'd have to use a huge modulo to have a reasonably sized training dataset right?

In [4]:
max_digits = 2

In [5]:
def generate_data(k: int):
    '''
    creates a string representation of an arithmetic addition operation where the digits are in reverse-order
    ex:
    123 + 45 = 168 would be '321+54=861'
    this reversal is meant to make the task easier on teh transformer
    '''
    max_num = int('9' * k)
    num1 = r.randint(0, max_num)
    num2 = r.randint(0, max_num)
    num3 = num1 + num2
    return f'b{str(num1)[::-1]}+{str(num2)[::-1]}={str(num3)[::-1]}e'

In [6]:
dataset = set()
max_possible_dataset_size = int('9' * max_digits)**2
dataset_size = 2
i = 0
while dataset_size < max_possible_dataset_size:
    i += 1
    dataset_size = 2 ** i
dataset_size = 2 ** (i - 1)
assert dataset_size < max_possible_dataset_size
while len(dataset) < dataset_size:
    dataset.add(generate_data(max_digits))
#print(dataset)

In [7]:
split_size = int(0.95 * len(dataset))
dataset = list(dataset)
train_dataset = dataset[:split_size]
val_dataset = dataset[split_size:]
print(len(train_dataset))
print(len(val_dataset))

7782
410


In [8]:
tokenizer = {
    '0':0, '1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9, 
    '+':10, '=':11, 
    'b':12, # beginning of sequence token
    'e':13, # end of sequence token
    'p':14 # padding token
}
v = len(tokenizer)
max_data_len = 4 + max_digits * 3 + 1

def tokenize(equation):
    out = []
    for c in equation:
        out.append(tokenizer[c])
    while len(out) < max_data_len:
        out.append(tokenizer['p'])
    return out

print(train_dataset[0])
tokens = tokenize(train_dataset[0])
print(tokens)

input_tokens, target_tokens = tokens[:-1], tokens[1:]
print(input_tokens, target_tokens)

reverse_tokenizer = {val:tok for tok, val in zip(tokenizer, tokenizer.values())}
print(reverse_tokenizer)

def decode_tokens(tokens):
    char_list = [reverse_tokenizer[t] for t in tokens]
    eos_idx = char_list.index('e')
    chars = "".join(char_list[1:eos_idx])
    num1, temp = chars.split('+')
    num2, num3 = temp.split('=')
    return f'{num1[::-1]}+{num2[::-1]}={num3[::-1]}'
    
output = decode_tokens(tokens)
print(output)

b89+86=661e
[12, 8, 9, 10, 8, 6, 11, 6, 6, 1, 13]
[12, 8, 9, 10, 8, 6, 11, 6, 6, 1] [8, 9, 10, 8, 6, 11, 6, 6, 1, 13]
{0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '+', 11: '=', 12: 'b', 13: 'e', 14: 'p'}
98+68=166


In [19]:
batch_size = 16
train_iterations = split_size // batch_size
val_iterations = (dataset_size - split_size) // batch_size
val_frequency = train_iterations // val_iterations
print(train_iterations, val_iterations, val_frequency)

config = {
    'vocab_len':v,
    'model_dim':4,
    'max_seq_len':max_data_len - 1,
    'num_heads':2,
    'head_dim':2,
    'mlp_mult':2,
    'dropout_rate':0.1,
    'num_layers':1
}
model = GPT(config)

eta = -0.01

486 25 19


In [25]:
for i in range(val_frequency + 1):#train_iterations):
        
    if i % val_frequency == 0:
        val_batch_data = val_dataset[i % val_frequency:(i % val_frequency) + batch_size]
        val_input_toks, val_target_toks = [], []
        for entry in val_batch_data:
            val_toks_seq = tokenize(entry)
            val_input_toks.append(val_toks_seq[:-1])
            val_target_toks.append(val_toks_seq[1:])
            
        probabilities, loss = model(val_input_toks, val_target_toks)
        print(f'step {i} loss: {loss}')
        
    train_batch_data = train_dataset[i:i + batch_size]
    train_input_toks, train_target_toks = [], []
    for entry in train_batch_data:
        train_toks_seq = tokenize(entry)
        train_input_toks.append(train_toks_seq[:-1])
        train_target_toks.append(train_toks_seq[1:])

    probabilities, loss = model(train_input_toks, train_target_toks)

    ## backward pass
    #set params to 0
    for p in model.parameters():
        p.grad = 0.0
    # clac gradients
    loss.backward()
    # performing a step of SGD
    for p in model.parameters():
        p.data += -eta * p.grad

step 0 loss: Value(data=433.288, grad=0.000)
step 19 loss: Value(data=433.288, grad=0.000)


In [4]:
e = Embedding(3, 4)
pretty_tensor_print(e.weight)
print(e.parameters())

[
  [Value(data=-0.028, grad=0.000), Value(data=0.016, grad=0.000), Value(data=0.034, grad=0.000), Value(data=-0.006, grad=0.000)]
  [Value(data=0.025, grad=0.000), Value(data=-0.002, grad=0.000), Value(data=0.010, grad=0.000), Value(data=0.001, grad=0.000)]
  [Value(data=-0.016, grad=0.000), Value(data=-0.026, grad=0.000), Value(data=-0.017, grad=0.000), Value(data=0.027, grad=0.000)]
]
[Value(data=-0.028, grad=0.000), Value(data=0.016, grad=0.000), Value(data=0.034, grad=0.000), Value(data=-0.006, grad=0.000), Value(data=0.025, grad=0.000), Value(data=-0.002, grad=0.000), Value(data=0.010, grad=0.000), Value(data=0.001, grad=0.000), Value(data=-0.016, grad=0.000), Value(data=-0.026, grad=0.000), Value(data=-0.017, grad=0.000), Value(data=0.027, grad=0.000)]


In [16]:
mask = Mask(5)
mhsa = MultiHeadSelfAttention(4, 2, 2, 5, mask)
print(mhsa.parameters())

[Value(data=-0.017, grad=0.000), Value(data=0.013, grad=0.000), Value(data=-0.032, grad=0.000), Value(data=-0.007, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.004, grad=0.000), Value(data=0.010, grad=0.000), Value(data=0.013, grad=0.000), Value(data=-0.003, grad=0.000), Value(data=0.000, grad=0.000), Value(data=-0.045, grad=0.000), Value(data=0.025, grad=0.000), Value(data=0.008, grad=0.000), Value(data=-0.043, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.022, grad=0.000), Value(data=-0.016, grad=0.000), Value(data=-0.006, grad=0.000), Value(data=-0.019, grad=0.000), Value(data=0.000, grad=0.000), Value(data=-0.012, grad=0.000), Value(data=-0.029, grad=0.000), Value(data=-0.002, grad=0.000), Value(data=-0.004, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.006, grad=0.000), Value(data=-0.025, grad=0.000), Value(data=0.003, grad=0.000), Value(data=-0.007, grad=0.000), Value(data=0.000, grad=0.000), Value(data=-0.044, grad=0.000), Value(data=-0.014, gra

In [18]:
mlp = MultiLayerPerceptron(2,4,2)
print(mlp.parameters())

[Value(data=-0.003, grad=0.000), Value(data=-0.018, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.010, grad=0.000), Value(data=-0.029, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.006, grad=0.000), Value(data=-0.007, grad=0.000), Value(data=0.000, grad=0.000), Value(data=-0.024, grad=0.000), Value(data=-0.025, grad=0.000), Value(data=0.000, grad=0.000), Value(data=-0.024, grad=0.000), Value(data=-0.030, grad=0.000), Value(data=-0.007, grad=0.000), Value(data=0.038, grad=0.000), Value(data=0.000, grad=0.000), Value(data=0.022, grad=0.000), Value(data=0.006, grad=0.000), Value(data=0.004, grad=0.000), Value(data=0.008, grad=0.000), Value(data=0.000, grad=0.000)]
