In [1]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

plt.style.use(['seaborn-v0_8-colorblind', 'seaborn-v0_8-darkgrid'])
plt.rcParams.update({'font.size': 20})

np.set_printoptions(suppress=True, precision=4)

# Automatically reload your external source code
%load_ext autoreload
%autoreload 2

In [2]:
from gpts import GPT, GPTPico1
from addition_dataset import *

In [24]:
# doing 5k samps as said in email
x_train, y_train, x_val, y_val, char2ind_map_digit = get_reduction_dataset_multi(N_per=2000, max_operand_digits=5,val_prop=0.1)
vocab_sz = len(char2ind_map_digit)
seq_len_digit = x_train.shape[1]
tf.keras.backend.clear_session()
tf.random.set_seed(0)

# Your code here
picogpt_digits = GPTPico1(vocab_sz=vocab_sz, seq_len=seq_len_digit, padding_char_enc=vocab_sz-1, num_heads=8, embed_dim=64)
picogpt_digits.compile(loss='temporal_cross_entropy')

train_loss_hist, val_loss_hist, val_acc_hist, n_epochs= picogpt_digits.fit(
    x=x_train,
    y=y_train,
    x_val=x_val,
    y_val=y_val,
    batch_size=128,
    max_epochs=100,
    patience=15,
    lr_patience=5,
    lr_decay_factor=0.5,
    lr_max_decays=4,
    val_every=1,
    verbose=True
)
N_show = 30
x_splits = [x_train, x_val]
split_labels = ['train', 'val']
for i in range(2):
    x = x_splits[i]
    split = split_labels[i]
    print(50*'=')
    print(split)
    print(50*'=')
    N = len(x)
    ind2char_map_digit = make_ind2char_mapping(char2ind_map_digit)
    x_str = convert_int2str(x_int=x.numpy(), ind2char_map=ind2char_map_digit)
    prompts, correct_answers = split_sum_and_answer(x_str)
    print(f'x_str: {prompts}')
    for i in range(N_show):
        curr_prompt = prompts[i]
        curr_ans = correct_answers[i]
        answer = picogpt_digits.generate_sequence(prompt=curr_prompt,
                                            length=seq_len_digit,
                                            char2ind_map=char2ind_map_digit,
                                            ind2char_map=ind2char_map_digit,
                                            end_char='.')
        print(f'Answer: {answer}')
        print('Correct answer is:', curr_ans)
        print('---------------------')

---------------------------------------------------------------------------
Dense layer output(Output_Layer) shape: [1, 9, 14]
Transformer_Block_0:
	Transformer_Block_0_MLP:
	Dropout layer output(Transformer_Block_0_MLP_Dropout) shape: [1, 9, 64]
	Dense layer output(Transformer_Block_0_MLP_Dense2) shape: [1, 9, 64]
	Dense layer output(Transformer_Block_0_MLP_Dense1) shape: [1, 9, 256]
	Transformer_Block_0_MHA:
	Dropout layer output(Transformer_Block_0_MHA_Dropout) shape: [1, 9, 64]
	Dense layer output(Transformer_Block_0_MHA_Dense) shape: [1, 9, 64]
	Transformer_Block_0_MHA_Attention:
	Dropout layer output(attention_dropout) shape: [1, 8, 9, 9]
	Transformer_Block_0_MHA_QKV:
	Dense layer output(QKVBlock_Value) shape: [1, 9, 64]
	Dense layer output(QKVBlock_Key) shape: [1, 9, 64]
	Dense layer output(QKVBlock_Query) shape: [1, 9, 64]
Positional_Encoding_Block_0:
	Dropout layer output(Positional_Encoding_Block_0_Dropout) shape: [1, 9, 64]
	Positional encoding layer output(Positional_Encodi

In [25]:
# doing 5k samps as said in email
x_train, y_train, x_val, y_val, char2ind_map_add = get_blank_dataset(N=1000, operation = '+', max_operand_digits=1,val_prop=0.1)
vocab_sz = len(char2ind_map_add)
seq_len_add = x_train.shape[1]
tf.keras.backend.clear_session()
tf.random.set_seed(0)

# Your code here
picogpt_addition = GPTPico1(vocab_sz=vocab_sz, seq_len=seq_len_add, padding_char_enc=vocab_sz-1, num_heads=8, embed_dim=64)
picogpt_addition.compile(loss='temporal_cross_entropy')

train_loss_hist, val_loss_hist, val_acc_hist, n_epochs= picogpt_addition.fit(
    x=x_train,
    y=y_train,
    x_val=x_val,
    y_val=y_val,
    batch_size=128,
    max_epochs=100,
    patience=15,
    lr_patience=5,
    lr_decay_factor=0.5,
    lr_max_decays=4,
    val_every=1,
    verbose=True
)
N_show = 30
x_splits = [x_train, x_val]
split_labels = ['train', 'val']
for i in range(2):
    x = x_splits[i]
    split = split_labels[i]
    print(50*'=')
    print(split)
    print(50*'=')
    N = len(x)
    ind2char_map_add = make_ind2char_mapping(char2ind_map_add)
    x_str = convert_int2str(x_int=x.numpy(), ind2char_map=ind2char_map_add)
    prompts, correct_answers = split_sum_and_answer(x_str)
    # print(f'x_str: {x_str}')
    for i in range(N_show):
        curr_prompt = prompts[i]
        curr_ans = correct_answers[i]
        answer = picogpt_addition.generate_sequence(prompt=curr_prompt,
                                            length=seq_len_add,
                                            char2ind_map=char2ind_map_add,
                                            ind2char_map=ind2char_map_add,
                                            end_char='.')
        print(f'Answer: {answer}')
        print('Correct answer is:', curr_ans)
        print('---------------------')

---------------------------------------------------------------------------
Dense layer output(Output_Layer) shape: [1, 6, 14]
Transformer_Block_0:
	Transformer_Block_0_MLP:
	Dropout layer output(Transformer_Block_0_MLP_Dropout) shape: [1, 6, 64]
	Dense layer output(Transformer_Block_0_MLP_Dense2) shape: [1, 6, 64]
	Dense layer output(Transformer_Block_0_MLP_Dense1) shape: [1, 6, 256]
	Transformer_Block_0_MHA:
	Dropout layer output(Transformer_Block_0_MHA_Dropout) shape: [1, 6, 64]
	Dense layer output(Transformer_Block_0_MHA_Dense) shape: [1, 6, 64]
	Transformer_Block_0_MHA_Attention:
	Dropout layer output(attention_dropout) shape: [1, 8, 6, 6]
	Transformer_Block_0_MHA_QKV:
	Dense layer output(QKVBlock_Value) shape: [1, 6, 64]
	Dense layer output(QKVBlock_Key) shape: [1, 6, 64]
	Dense layer output(QKVBlock_Query) shape: [1, 6, 64]
Positional_Encoding_Block_0:
	Dropout layer output(Positional_Encoding_Block_0_Dropout) shape: [1, 6, 64]
	Positional encoding layer output(Positional_Encodi

In [85]:
prompt = '2+0='
answer = picogpt_addition.generate_sequence(prompt=prompt,
                                    length=seq_len_add,
                                    char2ind_map=char2ind_map_add,
                                    ind2char_map=ind2char_map_add,
                                    end_char='.', live_print=True)
prompt = '00010d1='
answer = picogpt_digits.generate_sequence(prompt=prompt,
                                    length=seq_len_digit,
                                    char2ind_map=char2ind_map_digit,
                                    ind2char_map=ind2char_map_digit,
                                    end_char='.', live_print=True)

2+0=2.
00010d1=1.


In [100]:
# picogpt_addition
# picogpt_digits

def high_d_addition(prompt, verbose=True):
    if verbose:
        print(f'Prompt: {prompt}')

    # === extract the two operand strings ===
    # split on '+' then strip the trailing '='
    left, right = prompt.split('+')
    oper1 = left
    oper2 = right.rstrip('=')

    remained   = 0     # carry from the previous digit
    running_sum = []    # collect result digits (low→high)

    # oper1, oper2 = split(prompt)
    # remained = 0
    # running_sum = []
    for d in range(max(len(oper1),len(oper2))):
        cur_d = -d-1
        cur_oper1 = 0
        cur_oper2 = 0
        if d < len(oper1):
            prompt_digit_1 = f'{oper1}d{d}='
            cur_oper1 = picogpt_digits.generate_sequence(prompt=prompt_digit_1, length=seq_len_digit, char2ind_map=char2ind_map_digit, ind2char_map=ind2char_map_digit, end_char='.', live_print=False)
            cur_oper1 = int(cur_oper1[0])
        if d < len(oper2):
            prompt_digit_2 = f'{oper2}d{d}='
            cur_oper2 = picogpt_digits.generate_sequence(prompt=prompt_digit_2, length=seq_len_digit, char2ind_map=char2ind_map_digit, ind2char_map=ind2char_map_digit, end_char='.', live_print=False)
            cur_oper2 = int(cur_oper2[0])
        if verbose:
            print(f'\tdigit_pico extracted {cur_oper1} and {cur_oper2}')
        add_prompt=f'{cur_oper1}+{cur_oper2}='
        curr_sum = picogpt_addition.generate_sequence(prompt=add_prompt, length=seq_len_add, char2ind_map=char2ind_map_add,ind2char_map=ind2char_map_add, end_char='.', live_print=False)
        if verbose:
            print(f'\taddition_pico figured out {add_prompt}{curr_sum}')

        if (curr_sum[1]) != '.':
            remainder_cur = int(curr_sum[0])*10 + int(curr_sum[1])
            prompt_remain = f'000{remainder_cur}d{1}='
            remainder = picogpt_digits.generate_sequence(prompt=prompt_remain, length=seq_len_digit, char2ind_map=char2ind_map_digit, ind2char_map=ind2char_map_digit, end_char='.', live_print=False)
            prompt_remain_sum = f'000{remainder_cur}d{0}='
            curr_sum = picogpt_digits.generate_sequence(prompt=prompt_remain_sum, length=seq_len_digit, char2ind_map=char2ind_map_digit, ind2char_map=ind2char_map_digit, end_char='.', live_print=False)
            if verbose:
                print(f'\tdigit_pico sees both {remainder} and {curr_sum}')
            to_remained = True
        else:
            to_remained = False
        
        if (remained > 0): # check if there was a remained last time
            add_re_prompt = f'{int(curr_sum[0])}+{remained}='
            curr_sum = picogpt_addition.generate_sequence(prompt=add_re_prompt, length=seq_len_add, char2ind_map=char2ind_map_add,ind2char_map=ind2char_map_add, end_char='.', live_print=False)
            if verbose:
                print(f'\taddition_pico added previous remainder {add_re_prompt}{curr_sum}')
            if (curr_sum[1]) != '.':
                remainder_cur = int(curr_sum[0])*10 + int(curr_sum[1])
                prompt_remain = f'000{remainder_cur}d{1}='
                remainder = picogpt_digits.generate_sequence(prompt=prompt_remain, length=seq_len_digit, char2ind_map=char2ind_map_digit, ind2char_map=ind2char_map_digit, end_char='.', live_print=False)
                prompt_remain_sum = f'000{remainder_cur}d{0}='
                curr_sum = picogpt_digits.generate_sequence(prompt=prompt_remain_sum, length=seq_len_digit, char2ind_map=char2ind_map_digit, ind2char_map=ind2char_map_digit, end_char='.', live_print=False)
                if verbose:
                    print(f'\tdigit_pico sees both {remainder} and {curr_sum}')
                to_remained = True

        if (d == max(len(oper1),len(oper2)) - 1):
            running_sum.append(int(curr_sum[0]))
            if to_remained:
                if int(remainder[0])>0:
                    running_sum.append(int(remainder[0]))
            result = ''.join(str(d) for d in reversed(running_sum))
            if verbose:
                print(f'Result : {result}')
            return result

        if(to_remained):
            remained = int(remainder[0])
        else:
            remained = 0
        running_sum.append(int(curr_sum[0]))
        if verbose:
            result = ''.join(str(d) for d in reversed(running_sum))
            print(f'Running sum : {result}')

prompt = '2025+9992='
high_d_addition(prompt)

Prompt: 2025+9992=
	digit_pico extracted 5 and 2
	addition_pico figured out 5+2=['7', '.']
Running sum : 7
	digit_pico extracted 2 and 9
	addition_pico figured out 2+9=['1', '1', '.']
	digit_pico sees both ['1', '.'] and ['1', '.']
Running sum : 17
	digit_pico extracted 0 and 9
	addition_pico figured out 0+9=['9', '.']
	addition_pico added previous remainder 9+1=['1', '0', '.']
	digit_pico sees both ['1', '.'] and ['0', '.']
Running sum : 017
	digit_pico extracted 2 and 9
	addition_pico figured out 2+9=['1', '1', '.']
	digit_pico sees both ['1', '.'] and ['1', '.']
	addition_pico added previous remainder 1+1=['2', '.']
Result : 12017


'12017'

In [102]:
prompt = '9992+0025='
high_d_addition(prompt)

Prompt: 9992+0025=
	digit_pico extracted 2 and 5
	addition_pico figured out 2+5=['7', '.']
Running sum : 7
	digit_pico extracted 9 and 2
	addition_pico figured out 9+2=['1', '1', '.']
	digit_pico sees both ['1', '.'] and ['1', '.']
Running sum : 17
	digit_pico extracted 9 and 0
	addition_pico figured out 9+0=['9', '.']
	addition_pico added previous remainder 9+1=['1', '0', '.']
	digit_pico sees both ['1', '.'] and ['0', '.']
Running sum : 017
	digit_pico extracted 9 and 0
	addition_pico figured out 9+0=['9', '.']
	addition_pico added previous remainder 9+1=['1', '0', '.']
	digit_pico sees both ['1', '.'] and ['0', '.']
Result : 10017


'10017'

In [103]:
prompt = '9992+0025='
high_d_addition(prompt, verbose=False)

'10017'

In [None]:
# doing 5k samps as said in email
x_train, y_train, x_val, y_val, char2ind_map_add_4 = get_blank_dataset(N=5000, operation = '+', max_operand_digits=4, val_prop=0.1)
vocab_sz = len(char2ind_map_add_4)
seq_len_add_4 = x_train.shape[1]
tf.keras.backend.clear_session()
tf.random.set_seed(0)

# Your code here
picogpt_addition_4 = GPTPico1(vocab_sz=100, seq_len=seq_len_add_4, padding_char_enc=99, num_heads=8, embed_dim=64)
picogpt_addition_4.compile(loss='temporal_cross_entropy')

train_loss_hist, val_loss_hist, val_acc_hist, n_epochs= picogpt_addition_4.fit(
    x=x_train,
    y=y_train,
    x_val=x_val,
    y_val=y_val,
    batch_size=128,
    max_epochs=100,
    patience=15,
    lr_patience=5,
    lr_decay_factor=0.5,
    lr_max_decays=4,
    val_every=1,
    verbose=True
)
N_show = 30
x_splits = [x_train, x_val]
split_labels = ['train', 'val']
for i in range(2):
    x = x_splits[i]
    split = split_labels[i]
    print(50*'=')
    print(split)
    print(50*'=')
    N = len(x)
    ind2char_map_add_4 = make_ind2char_mapping(char2ind_map_add_4)
    x_str = convert_int2str(x_int=x.numpy(), ind2char_map=ind2char_map_add_4)
    prompts, correct_answers = split_sum_and_answer(x_str)
    # print(f'x_str: {x_str}')
    for i in range(N_show):
        curr_prompt = prompts[i]
        curr_ans = correct_answers[i]
        answer = picogpt_addition_4.generate_sequence(prompt=curr_prompt,
                                            length=seq_len_add_4,
                                            char2ind_map=char2ind_map_add_4,
                                            ind2char_map=ind2char_map_add_4,
                                            end_char='.')
        print(f'Answer: {answer}')
        # print(true == int(answerd))
        print('Correct answer is:', curr_ans)
        print('---------------------')

---------------------------------------------------------------------------
Dense layer output(Output_Layer) shape: [1, 15, 100]
Transformer_Block_0:
	Transformer_Block_0_MLP:
	Dropout layer output(Transformer_Block_0_MLP_Dropout) shape: [1, 15, 64]
	Dense layer output(Transformer_Block_0_MLP_Dense2) shape: [1, 15, 64]
	Dense layer output(Transformer_Block_0_MLP_Dense1) shape: [1, 15, 256]
	Transformer_Block_0_MHA:
	Dropout layer output(Transformer_Block_0_MHA_Dropout) shape: [1, 15, 64]
	Dense layer output(Transformer_Block_0_MHA_Dense) shape: [1, 15, 64]
	Transformer_Block_0_MHA_Attention:
	Dropout layer output(attention_dropout) shape: [1, 8, 15, 15]
	Transformer_Block_0_MHA_QKV:
	Dense layer output(QKVBlock_Value) shape: [1, 15, 64]
	Dense layer output(QKVBlock_Key) shape: [1, 15, 64]
	Dense layer output(QKVBlock_Query) shape: [1, 15, 64]
Positional_Encoding_Block_0:
	Dropout layer output(Positional_Encoding_Block_0_Dropout) shape: [1, 15, 64]
	Positional encoding layer output(Posi