In [1]:
import sys
sys.path.append('/kaggle/input/mingpt/')
sys.path.append('/kaggle/input/reflextransformer')

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

import math

from mingpt.utils import set_seed
from mingpt.bpe import BPETokenizer
set_seed(3407)

In [3]:
import reflexTransformer

In [4]:
device = 'cuda'

In [5]:
# [ ] I try to get all possible params considered, but it is not that easy
# [ ] nano has blocksize 1024
# nano uses 0 dropout
class Config(): 
        # either model_type or (n_layer, n_head, n_embd) must be given in the config
        # C.model_type = 'gpt'
        n_layer = 6
        n_head = 6
        n_embd = 64*6
        # these options must be filled in externally
        vocab_size = 50257
        block_size = 1024
        head_size = n_embd // n_head
        # dropout hyperparameters
        embd_pdrop = 0
        resid_pdrop = 0
        attn_pdrop = 0
config = Config()

In [6]:
model = reflexTransformer.ReflexTransformer(config)

number of parameters: 30.33M


In [7]:
model.load_state_dict(torch.load('/kaggle/input/level1-6l6h-training/ckpt.pt', weights_only=False)['model'])
model.to(device)
model.eval()

ReflexTransformer(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 384)
    (wpe): Embedding(1024, 384)
    (drop): Dropout(p=0, inplace=False)
    (h): ModuleList(
      (0-5): 6 x Block(
        (ra): MultiHeadReflexAttention(
          (heads): ModuleList(
            (0-5): 6 x ReflexAttention(
              (key): Linear(in_features=384, out_features=64, bias=False)
              (query): Linear(in_features=384, out_features=64, bias=False)
              (value): Linear(in_features=384, out_features=64, bias=False)
              (dropout): Dropout(p=0, inplace=False)
            )
          )
          (proj): Linear(in_features=384, out_features=384, bias=True)
          (dropout): Dropout(p=0, inplace=False)
        )
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=384, out_features=1536, bias=True)
          (c_proj): Linear(in_features=1536, out_features=384, bias=True)
          (act): NewGELU()
          (dropout): Dropout(p=0, inplace=False)
       

In [8]:
from datasets import load_dataset

ds = load_dataset("allenai/ai2_arc", "ARC-Easy")

In [9]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
device = "cuda" # the device to load the model onto

tokenizer = AutoTokenizer.from_pretrained("gpt2")



In [10]:
def format_dataset(dataset):
    """
    Formats a dataset stored in ds['train'] for GPT-2 input.

    Parameters:
    dataset (Dataset): The dataset containing multiple ARC examples.

    Returns:
    list: A list of formatted strings suitable for GPT-2 input.
    """
    formatted_data = []
    for example in dataset:
        question = example["question"]
        choices = example["choices"]
        answer_key = example["answerKey"]

        # Create a prompt structure
        formatted_choices = "\n".join([f"{label}. {text}" for label, text in zip(choices["label"], choices["text"])])
        correct_answer = f"The correct answer is: {answer_key}"

        # Combine everything into a single formatted string
        formatted_input = f"Question: {question}\n\nChoices:\n{formatted_choices}\n\n{correct_answer}"
        formatted_data.append(formatted_input)
    
    return formatted_data

In [11]:
res = format_dataset(ds['train'])

In [12]:
tokenizer.pad_token = tokenizer.eos_token
train = tokenizer(res, return_tensors="pt", max_length=1024, truncation=True, padding=True)['input_ids']

In [13]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW

# Hyperparameters
batch_size = 32
epochs = 1
learning_rate = 5e-5

# Prepare data loader
train_dataset = TensorDataset(train)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Move model to device
model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    for batch in train_dataloader:
        batch = batch[0].to(device)  # Move to device
        optimizer.zero_grad()       # Zero gradients
        
        # Forward pass
        outputs, loss = model(batch, targets=batch)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        print(f"Loss: {loss.item()}")

Epoch 1/1
Loss: 6.781310081481934
Loss: 5.056482315063477
Loss: 4.056164264678955
Loss: 3.628506660461426
Loss: 3.3245227336883545
Loss: 3.2118449211120605
Loss: 3.092536211013794
Loss: 3.080886125564575
Loss: 2.83040714263916
Loss: 2.643763542175293
Loss: 2.77748966217041
Loss: 2.903712272644043
Loss: 2.5691919326782227
Loss: 2.4700398445129395
Loss: 2.4894444942474365
Loss: 2.5609207153320312
Loss: 2.4441065788269043
Loss: 2.668217897415161
Loss: 2.440274953842163
Loss: 2.6520400047302246
Loss: 2.510871410369873
Loss: 2.6296803951263428
Loss: 2.362942695617676
Loss: 2.444359302520752
Loss: 2.0768115520477295
Loss: 2.2504847049713135
Loss: 2.191472291946411
Loss: 2.1208527088165283
Loss: 2.451037883758545
Loss: 2.1456515789031982
Loss: 2.1040449142456055
Loss: 2.2893497943878174
Loss: 1.8656495809555054
Loss: 2.1696741580963135
Loss: 2.2204275131225586
Loss: 2.155660629272461
Loss: 2.076470375061035
Loss: 1.9478766918182373
Loss: 2.0486342906951904
Loss: 2.0903825759887695
Loss: 2.332

In [29]:
def preprocess_data(dataset, tokenizer, max_length=512):
    """
    Tokenizes and processes the dataset for GPT-2 training.
    
    Args:
    - dataset (list): List of strings (formatted examples).
    - tokenizer: GPT-2 tokenizer.
    - max_length (int): Maximum sequence length.

    Returns:
    - torch.Tensor: Input IDs tensor.
    """
    inputs = tokenizer(dataset, return_tensors="pt", max_length=max_length, truncation=True, padding=True)
    return inputs["input_ids"]

In [30]:
def format_for_gpt2_no_answer(arc_example):
    """
    Formats an ARC dataset example for GPT-2 input without including the answer.

    Parameters:
    arc_example (dict): A dictionary containing the ARC question, answer choices, and correct answer.

    Returns:
    str: A formatted string without the answer.
    """
    question = arc_example["question"]
    choices = arc_example["choices"]

    # Create a prompt structure without the answer
    formatted_choices = "\n".join([f"{label}. {text}" for label, text in zip(choices["label"], choices["text"])])
    formatted_input = f"Question: {question}\n\nChoices:\n{formatted_choices}\n\n"
    
    return formatted_input


In [31]:
formatted_test_data_no_answer = [format_for_gpt2_no_answer(example) for example in ds['test']]
test_input_ids_no_answer = preprocess_data(formatted_test_data_no_answer, tokenizer)

In [34]:
correct = 0
total = 0

with torch.no_grad():
    for example in ds['test']:
        question = example["question"]
        choices = example["choices"]["text"]
        labels = example["choices"]["label"]
        correct_answer = example["answerKey"]

        # Format input without the answer
        formatted_input = format_for_gpt2_no_answer(example)
        input_ids = tokenizer(formatted_input, return_tensors="pt").input_ids.to(device)

        # Generate model output
        output_ids = model.generate(input_ids, max_new_tokens=5)
        predicted_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

        # Determine which choice the model selected
        for label, choice in zip(labels, choices):
            if choice in predicted_text:
                predicted_answer = label
                break
        else:
            predicted_answer = None  # If no choice matches

        # Check if the prediction is correct
        if predicted_answer == correct_answer:
            correct += 1
        total += 1

accuracy = correct / total
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 25.08%


In [35]:
total

2376

In [36]:
correct

596

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
set_seed(3407)

In [None]:
import pickle

class SortDataset(Dataset):
    """ 
    Dataset for the Sort problem. E.g. for problem length 6:
    Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
    Which will feed into the transformer concatenated as:
    input:  0 0 2 1 0 1 0 0 0 1 1
    output: I I I I I 0 0 0 1 1 2
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=6, num_digits=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_digits = num_digits
    
    def __len__(self):
        return 10000 # ...
    
    def get_vocab_size(self):
        return self.num_digits
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length * 2 - 1

    def __getitem__(self, idx):
        
        # use rejection sampling to generate an input example from the desired split
        while True:
            # generate some random integers
            inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
            # half of the time let's try to boost the number of examples that 
            # have a large number of repeats, as this is what the model seems to struggle
            # with later in training, and they are kind of rate
            if torch.rand(1).item() < 0.5:
                if inp.unique().nelement() > self.length // 2:
                    # too many unqiue digits, re-sample
                    continue
            # figure out if this generated example is train or test based on its hash
            h = hash(pickle.dumps(inp.tolist()))
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        # solve the task: i.e. sort
        sol = torch.sort(inp)[0]

        # concatenate the problem specification and the solution
        cat = torch.cat((inp, sol), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:self.length-1] = -1
        return x, y

In [None]:
# print an example instance of the dataset
train_dataset = SortDataset('train', length = 500, num_digits=10)
test_dataset = SortDataset('test')
x, y = train_dataset[0]
#for a, b in zip(x,y):
#    print(int(a),int(b))

In [None]:
# create a GPT instance
from mingpt.model import GPT

#model_config = GPT.get_default_config()
#model_config.model_type = 'gpt-nano'
#model_config.vocab_size = train_dataset.get_vocab_size()
#model_config.block_size = train_dataset.get_block_size()
#model = GPT(model_config)

In [None]:
# create a Trainer object
from mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000
train_config.num_workers = 0
train_config.batch_size = 8
trainer = Trainer(train_config, model, train_dataset)

In [None]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()