In [1]:
# What to do next time
# 1. Decide which problem setting I want to focus on (a continual, never-ending stream, or one that is just long but finite)
# 2. Test if adding memory to attention layers, adding more layers, and adding more memory units helps or hurts.
# 3. Think of how I can get memory to hold more information without having to perform backprop on the same sequence 10+ times.
#    Maybe do updates for each token forward pass (even though they happen in parallel?).
#    Perhaps consider an update rule other than backprop. We don't necessarily even need to have the same objective
#    for the memory as the overall model.
# 4. Updating the full model + memory at the same time made a significant difference, but you generally don't want to
#    update the same model on the same data multiple times for iid data and computational efficiency reasons (I think),
#    so I need to think about what the training loop should look like as a whole.

In [2]:
import os
import random
import string
import sys
sys.path.append('minGPT/')

import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from mingpt.bpe import BPETokenizer
from mingpt.model import GPT
from mingpt.utils import set_seed, setup_logging

from config_setup import get_config
from data_handling import *
from training import *

2024-05-07 15:29:53.938007: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
### Prepare the config ###

# Start from the project's defaults (empty argument list), then override the
# model/trainer settings used by this experiment.
config = get_config([])
config.merge_from_dict({
    'model.vocab_size': 10,
    'model.block_size': 16,
    'model.model_type': None,  # dimensions are given explicitly below
    'model.n_layer': 1,
    'model.n_head': 4,
    'model.n_embd': 512,
    'model.fc_mem_dim': 256,
    'trainer.batch_size': 32,
    # Prefer the GPU, but fall back to the CPU so the notebook still runs
    # (slowly) on a machine without CUDA instead of failing at `model.to(...)`.
    'trainer.device': 'cuda' if torch.cuda.is_available() else 'cpu',
})

# Short aliases used by the cells below.
batch_size = config.trainer.batch_size
device = config.trainer.device

# print(config)
setup_logging(config)

# Weights & Biases is optional: only import and initialise it when enabled.
if config.system.wandb:
    import wandb
    tags = [config.system.tag] if config.system.tag else None
    wandb.init(
        project=config.system.project,
        sync_tensorboard=True,
        config=config.to_dict(),
        tags=tags,
    )

In [4]:
# Synthetic data: uniformly random token sequences, split into train/test.
n_samples = 1_000_000
n_test = 10_000
n_train = n_samples - n_test
assert 0 <= n_test <= n_samples, "n_test must be between 0 and n_samples"

# One row per sequence, `block_size` tokens each, token ids in [0, vocab_size).
dataset = torch.randint(0, config.model.vocab_size, (n_samples, config.model.block_size))
dataset[:, 0] = 0  # Always start with 0

# Split on the positive boundary. Slicing with `-n_test` breaks when
# n_test == 0: `dataset[:-0]` is empty and `dataset[-0:]` is everything.
train_data = dataset[:n_train]
test_data = dataset[n_train:]

assert train_data.size(0) == n_train
assert test_data.size(0) == n_test

In [5]:
# Build the model and two optimizers over disjoint slices of its parameter groups.
model = GPT(config.model)
model = model.to(config.trainer.device)

# Size the memory for the training batch BEFORE collecting parameters.
# Resetting the memory can replace the memory parameters when the batch size
# differs, and an optimizer built earlier would keep updating the old tensors.
model.reset_memory(batch_size)

# Only used to obtain the model's parameter grouping; discarded below.
optimizer = model.configure_optimizers(config.trainer)

# The group dicts taken from `optimizer` already contain 'lr', 'betas', etc.
# torch.optim only applies constructor defaults to keys that are MISSING from
# a group, so the hyper-parameters must be overridden in the (copied) group
# dicts themselves; otherwise e.g. `lr=0.01` below would be silently ignored.
# NOTE(review): assumes groups [0:2] are the regular model weights and
# groups [2:] are the memory parameters, as the slicing implies; confirm
# against GPT.configure_optimizers.
ltm_optimizer = torch.optim.AdamW(
    [
        {**group, 'lr': config.trainer.learning_rate, 'betas': config.trainer.betas}
        for group in optimizer.param_groups[:2]
    ],
    lr = config.trainer.learning_rate,
    betas = config.trainer.betas,
)
stm_optimizer = torch.optim.Adam(
    [
        {**group, 'lr': 0.01, 'betas': config.trainer.betas}
        for group in optimizer.param_groups[2:]
    ],
    lr = 0.01,
    betas = config.trainer.betas,
)

del optimizer

number of parameters: 3.30M


In [6]:
# Single pass over the training split, one fixed-size mini-batch per step.
bar = tqdm(range(0, n_train, batch_size))

for i in bar:
    batch = train_data[i : i + batch_size]
    if len(batch) != batch_size:
        # Trailing partial batch: skip it so every step sees a full batch.
        continue

    # Next-token prediction: inputs drop the last token, targets drop the first.
    X = batch[:, :-1].to(device)
    y = batch[:, 1:].to(device)

    # NOTE: resetting the memory replaces the memory parameters when the batch
    # size is different, so if you change the batch size you must rebuild the
    # optimizer after this call.
    model.reset_memory(batch_size)

    # Disabled experiment: memorize the current sequence first.
    # mem_X = (X + 5) % 10
    # mem_y = (y + 5) % 10
    # for _ in range(30):
    #     _, loss = model(mem_X, mem_y)

    #     stm_optimizer.zero_grad()
    #     loss.backward()
    #     stm_optimizer.step()

    # Learn to work with the memory
    ltm_optimizer.zero_grad()
    _, loss = model(X, y)
    loss.backward()
    ltm_optimizer.step()

    bar.set_description(f"Loss: {loss.item():.4f}")

Loss: 2.3037:  26%|██▌       | 7951/30938 [00:43<02:05, 183.67it/s]


KeyboardInterrupt: 