In [92]:
%load_ext autoreload
%autoreload 2

import torch
from torch import optim, nn, utils, Tensor
from dataclasses import dataclass
import einops
from einops import einsum
import math
from easy_transformer.utils import get_corner, gelu_new, tokenize_and_concatenate
from easy_transformer import EasyTransformer
import pandas as pd
import json
import lightning as L
import numpy as np
from torch.utils.data import Dataset
from copy import deepcopy
from bisect import bisect_right
from datasets import load_dataset, load_dataset_builder

from TransformerModule import DemoTransformer, AttentionOnly

print(torch.cuda.is_available())
print(torch.__version__)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
True
2.4.0+cu121


In [105]:
@dataclass
class ConfigOnlyAt:
    d_model: int = 10 # 256
    debug: bool = False
    layer_norm_eps: float = 1e-5
    d_vocab: int = 50257
    init_range: float = 0.02
    n_ctx: int = 47 #1024
    d_head: int = 10 # 64
    d_mlp: int = 10 # 3072
    n_heads: int = 2 # 8
    n_layers: int = 2

cfg = ConfigOnlyAt()
print(cfg)

class LiTransformer(L.LightningModule):
    def __init__(self, cfg):
        super().__init__()
        self.transformer = DemoTransformer(cfg)
        self.criterion = nn.CrossEntropyLoss(ignore_index=0)

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        # it is independent of forward
        context_window, target_window = batch
        context_window, target_window = context_window.to(device), target_window.to(device)
        logits = self.transformer(context_window)
        logits = einops.rearrange(logits, "batch position vocabulary -> batch vocabulary position")
        loss = self.criterion(logits, target_window)
        
        # Logging to TensorBoard (if installed) by default
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        context_window, target_window = batch
        context_window, target_window = context_window.to(device), target_window.to(device)
        logits = self.transformer(context_window)
        logits = einops.rearrange(logits, "batch position vocabulary -> batch vocabulary position")
        loss = self.criterion(logits, target_window)
        
        # Logging to TensorBoard (if installed) by default
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer
firstTranformer = LiTransformer(cfg).to(device)
firstTranformer.device

ConfigOnlyAt(d_model=10, debug=False, layer_norm_eps=1e-05, d_vocab=50257, init_range=0.02, n_ctx=47, d_head=10, d_mlp=10, n_heads=2, n_layers=2)


In [3]:
reference_gpt2 = EasyTransformer.from_pretrained("gpt2-small", fold_ln=False, center_unembed=False, center_writing_weights=False, device = 'cpu')
reference_text = "I am an amazing autoregressive, decoder-only, GPT-2 style transformer. One day I will exceed human level intelligence and take over the world!"
tokens = reference_gpt2.to_tokens(reference_text)
target_tokens = torch.cat((tokens[0][1:], torch.tensor([50256])), 0).view(1, -1)
logits, cache = reference_gpt2.run_with_cache(reference_text)
log_probs = logits.log_softmax(dim=-1)
probs = logits.log_softmax(dim=-1)
reference_gpt2.tokenizer.batch_decode(logits.argmax(dim=-1)[0])[-1]

Moving model to device:  cpu
Finished loading pretrained model gpt2-small into EasyTransformer!


' I'

In [4]:
data = pd.read_json('../../data/train_1M.jsonl', lines=True)
data.insert(3, "tokens", [reference_gpt2.to_tokens(data.contents[i])[0] for i, x in enumerate(data.contents)], True)
data

Token indices sequence length is longer than the specified maximum sequence length for this model (1288 > 1024). Running this sequence through the model will result in indexing errors


Unnamed: 0,contents,metadata,id,tokens
0,Alsatian Cheese Tart\n\nFrench Chef Michel Ber...,"{'pile_set_name': ['Pile-CC', 'OpenWebText2']}",21,"[tensor(50256), tensor(2348), tensor(49720), t..."
1,depicted by four young winged men in Roman-lik...,"{'pile_set_name': ['Wikipedia (en)', 'Pile-CC']}",81,"[tensor(50256), tensor(10378), tensor(5722), t..."
2,Available in:\n\ndescription\n\nOn Tuesday Mar...,"{'pile_set_name': ['Pile-CC', 'Pile-CC']}",102,"[tensor(50256), tensor(10493), tensor(287), te..."
3,Date:\n\nDiscipline:\n\nSource:\n\nProduct num...,"{'pile_set_name': ['Pile-CC', 'Pile-CC']}",107,"[tensor(50256), tensor(10430), tensor(25), ten..."
4,other meetings she had chaired. Stockholders w...,"{'pile_set_name': ['Pile-CC', 'Pile-CC']}",110,"[tensor(50256), tensor(847), tensor(8292), ten..."
...,...,...,...,...
999995,wedding day and itinerary runs without a hitch...,"{'pile_set_name': ['Pile-CC', 'USPTO Backgroun...",34118665,"[tensor(50256), tensor(86), tensor(6048), tens..."
999996,a faster motor. Motor speed is already a param...,"{'pile_set_name': ['USPTO Backgrounds', 'Pile-...",34118691,"[tensor(50256), tensor(64), tensor(5443), tens..."
999997,Flag Law on 29 May 1936.\n\nFlag of Turkey\n\n...,"{'pile_set_name': ['Pile-CC', 'OpenWebText2']}",34118794,"[tensor(50256), tensor(34227), tensor(3854), t..."
999998,"president; he should, at best, be reviving cel...","{'pile_set_name': ['OpenWebText2', 'Pile-CC']}",34118819,"[tensor(50256), tensor(22540), tensor(26), ten..."


In [5]:
len(data.tokens.values[0])

323

In [6]:
data.contents.values[1]

"depicted by four young winged men in Roman-like dresses, driving vessels and blowing air into horns. The central upper square is an old man representing the Year, with the Wheel of Time, while at the upper corners are the personifications of the Rivers of Paradise. The other six upper squares depict the Four Seasons, as well as Samson and Abel (or Cain).\n\nThe two lower corners show the personifications of the Sun (left, symbolizing Sunday) and the Moon (right, much deteriorated, symbolizing Monday), while the  side outer squares represent the months (only eight of which survive). At the bottom are incomplete scenes of the discovery of Holy Cross.\n\nSources\n\nExternal links\n\nOfficial cathedral's website \nPage with details of the figures \nPage with links to websites and the newest literature  (2012)Publication Date:\n\nDiscipline:\n\nSource:\n\nProduct number:\n\nLength:\n\nAlso Available in:\n\ndescription\n\nIn October 2004 Fernández Pujals, founder of Telepizza, an internatio

In [79]:
class CustomGPTDataset(Dataset):
    def __init__(self, data, sequence_length):
        self.data = data
        self.sequence_length = sequence_length
        self.n_samples = len(self.data)
        self.li = [len(tokens) for tokens in self.data.tokens.values]
        self.cumulative_li = deepcopy(self.li)
        self.cumulative_li[0] += - self.sequence_length 
        for i in range(1, len(self.cumulative_li)):
            self.cumulative_li[i] = self.cumulative_li[i-1] + self.cumulative_li[i] - self.sequence_length
            
    def __len__(self):
        return self.cumulative_li[-1]

    def __getitem__(self, idx):
        string_i = bisect_right(self.cumulative_li, idx)
        diff = self.cumulative_li[string_i] - idx
        context_window = self.data.tokens.values[string_i][-diff-self.sequence_length:-diff]
        if diff == 1:
            target_window = torch.cat((self.data.tokens.values[string_i][-diff-self.sequence_length+1:-diff], torch.tensor([50256])), 0)
        else:
            target_window = self.data.tokens.values[string_i][-diff-self.sequence_length+1:-diff+1]
        return context_window, target_window

In [80]:
training_data = CustomGPTDataset(data, sequence_length = 47)
min(training_data.li), max(training_data.li)

In [84]:
train_dataloader = utils.data.DataLoader(training_data, batch_size=64, shuffle=False, pin_memory = False)
len(next(iter(train_dataloader)))

2

In [107]:
# train the model (hint: here are some helpful Trainer arguments for rapid idea iteration)
trainer = L.Trainer(limit_train_batches=100, max_epochs=1)
trainer.fit(model=firstTranformer, train_dataloaders=train_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/eluator/AI_Safety/lib/python3.12/site-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type             | Params | Mode 
---------------------------------------------------------
0 | transformer | DemoTransformer  | 1.1 M  | train
1 | criterion   | CrossEntropyLoss | 0      | train
---------------------------------------------------------
1.1 M     Trainable params
0         Non-trainable params
1.1 M     Total params
4.233     Total estimated model params size (MB)
17        Modules in train mode
0         Modules in eval mode


Training: |                                               | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


In [112]:
firstTranformer = firstTranformer.to(device)
print(firstTranformer.device)
trainer.validate(firstTranformer, dataloaders = train_dataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


cuda:0


Validation: |                                             | 0/? [00:00<?, ?it/s]


Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined

In [16]:
# _ = rand_float_test(LayerNorm, [2, 4, 768])
# _ = load_gpt2_test(LayerNorm, reference_gpt2.ln_final, "blocks.11.hook_resid_post")
# rand_int_test(Embed, [2, 4])
# load_gpt2_test(Embed, reference_gpt2.embed, tokens)
# rand_int_test(PosEmbed, [2, 4])
# load_gpt2_test(PosEmbed, reference_gpt2.pos_embed, tokens)
# rand_float_test(Attention, [2, 4, 768])
# load_gpt2_test(Attention, reference_gpt2.blocks[0].attn, cache["blocks.0.ln1.hook_normalized"])
# rand_float_test(MLP, [2, 4, 768])
# load_gpt2_test(MLP, reference_gpt2.blocks[0].mlp, cache["blocks.0.ln2.hook_normalized"])
# rand_float_test(Unembed, [2, 4, 768])
# load_gpt2_test(Unembed, reference_gpt2.unembed, cache["ln_final.hook_normalized"])
# rand_float_test(TransformerBlock, [2, 4, 768])
# load_gpt2_test(TransformerBlock, reference_gpt2.blocks[0], cache["resid_pre", 0])
# rand_int_test(DemoTransformer, [2, 4])
# load_gpt2_test(DemoTransformer, reference_gpt2, tokens)