In [1]:
from transformer_lens import HookedTransformer
import torch
from torch.utils.data import DataLoader
from functools import partial
from utils.backup_analysis import load_model
from sys import getsizeof
from collections.abc import Mapping, Container
from utils.component_evaluation import compute_copy_score

In [2]:
# Disable autograd globally: every cell below only runs forward passes for analysis.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7f11c4fd2d00>

In [3]:
def print_gpu_memory_usage(label="", device="cuda:0"):
    """Print the CUDA memory currently allocated and reserved on ``device``.

    Args:
        label: Text placed at the start of the printed line to identify the call site.
        device: CUDA device to query (passed straight to ``torch.cuda``).

    Both figures are converted from bytes by dividing by 1024 ** 3 and are
    printed with two decimals.
    """
    bytes_per_unit = 1024 ** 3  # bytes -> GB conversion used for both readings
    allocated, reserved = (
        query(device) / bytes_per_unit
        for query in (torch.cuda.memory_allocated, torch.cuda.memory_reserved)
    )
    print(f"{label} - Memory Allocated: {allocated:.2f} GB, Memory Reserved: {reserved:.2f} GB")

def tensor_memory_size(tensor):
    """Return the size in bytes of a tensor's elements.

    The result is the element count multiplied by the per-element size in
    bytes for the tensor's dtype.
    """
    return tensor.element_size() * tensor.numel()

In [4]:
# Run configuration; these four values are passed to load_model in the next cell.
BASE_MODEL = 'pythia-2.8b'  # name of the pretrained model to load
VARIANT = None  # presumably selects a model variant, with None meaning the base model — TODO confirm
CACHE = 'model_cache'  # cache location handed to load_model (relative path)
DEVICE = 'cuda:0'  # device argument handed to load_model

In [5]:
# Load the model from the configuration constants, then record GPU memory right after loading.
# NOTE(review): 143000 is presumably the checkpoint step — confirm against load_model's signature.
model = load_model(BASE_MODEL, VARIANT, 143000, CACHE, DEVICE, large_model=True)
print_gpu_memory_usage("After loading model")



  return self.fget.__get__(instance, owner)()
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model pythia-2.8b into HookedTransformer
After loading model - Memory Allocated: 10.60 GB, Memory Reserved: 10.61 GB


In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from utils.data_utils import generate_data_and_caches
# Mutates the shared tokenizer in place so it no longer adds a BOS token.
# NOTE(review): presumably needed so token positions line up with the dataset's word indices — confirm.
model.tokenizer.add_bos_token = False
# NOTE(review): 100 is presumably the number of prompts; later cells hard-code 100 as the batch size.
ioi_dataset, abc_dataset = generate_data_and_caches(model, 100)

In [8]:
# Heads to evaluate. NOTE(review): tuples are presumably (layer, head) — confirm against compute_copy_score.
list_of_heads = [(9, 4), (12, 2)]
# verbose=True produces the per-example Seq / Target / top-5 listing seen in the recorded output.
copy_scores = compute_copy_score(model, list_of_heads, ioi_dataset=ioi_dataset, batch_size=12, verbose=True)

-------
Seq: Then, Alex and Anthony had a lot of fun at the school. Anthony gave a ring to Alex
Target: Alex
(1): a (2): the (3):less (4):, (5):

-------
Seq: Then, Alex and Anthony had a lot of fun at the school. Anthony gave a ring to Alex
Target: Anthony
(1):, (2): off (3):L (4):
 (5): it
-------
Seq: Then, Alex and Anthony had a lot of fun at the school. Anthony gave a ring to Alex
Target: Anthony
(1):, (2): off (3):
 (4):L (5):/
-------
Seq: Then, Connor and Roman were working at the house. Roman decided to give a basketball to Connor
Target: Connor
(1):. (2): of (3):, (4):
 (5): a
-------
Seq: Then, Connor and Roman were working at the house. Roman decided to give a basketball to Connor
Target: Roman
(1):
 (2): a (3): and (4): on (5): the
-------
Seq: Then, Connor and Roman were working at the house. Roman decided to give a basketball to Connor
Target: Roman
(1):
 (2): a (3): the (4): on (5): and
-------
Seq: Then, Eva and Mark were working at the store. Eva decided to give a bon

In [9]:
# Re-read GPU memory after the copy-score pass so it can be compared with the post-load reading.
print_gpu_memory_usage("After computing copy scores")

After computing copy scores - Memory Allocated: 10.61 GB, Memory Reserved: 26.36 GB


In [7]:
# from utils.head_metrics import BatchIOIDataset, collate_fn
# ioi_dataset.__class__ = BatchIOIDataset
# ioi_dataloader = DataLoader(ioi_dataset, batch_size=20, collate_fn=partial(collate_fn, device=model.cfg.device))

In [8]:
from transformer_lens import ActivationCache
from typing import List

def concatenate_activation_caches(caches: List[ActivationCache]) -> ActivationCache:
    """Merge per-batch activation caches into a single cache along dimension 0.

    Args:
        caches: Non-empty list of caches, one per batch, in batch order. Every
            cache is expected to contain the keys of ``caches[0]``.

    Returns:
        A new ``ActivationCache`` whose value for each key of ``caches[0]`` is
        the dim-0 concatenation of that key's tensors across all caches.

    Raises:
        ValueError: If ``caches`` is empty.
    """
    if not caches:
        raise ValueError("caches must contain at least one ActivationCache")
    first = caches[0]
    merged = {k: torch.cat([c[k] for c in caches], dim=0) for k in first.keys()}
    # ActivationCache's second constructor argument is the model object itself,
    # not its name; reuse the model already attached to the first cache so the
    # merged cache keeps working with model-dependent helpers and this function
    # no longer depends on a notebook-global `model`.
    return ActivationCache(merged, first.model, has_batch_dim=True)

def run_with_cache_batched(model: HookedTransformer, input_ids: torch.Tensor, batch_size: int = 20):
    """Run ``model.run_with_cache`` over ``input_ids`` in consecutive batches.

    Each batch's logits and activation cache are moved to CPU before the next
    batch runs, and GPU memory usage is printed after every batch.

    Args:
        model: Model exposing ``run_with_cache``.
        input_ids: Input tensor; it is split along dimension 0.
        batch_size: Maximum number of rows per batch.

    Returns:
        A tuple ``(logits, cache)``: the logits concatenated along dimension 0,
        and the per-batch caches merged by ``concatenate_activation_caches``.
    """
    n_rows = input_ids.shape[0]
    logit_chunks, cache_chunks = [], []
    for batch_idx, start in enumerate(range(0, n_rows, batch_size)):
        # Slicing clamps at the end of the tensor, so the last batch may be shorter.
        chunk = input_ids[start:start + batch_size]
        chunk_logits, chunk_cache = model.run_with_cache(chunk)
        # Move both results off the GPU before reporting memory.
        logit_chunks.append(chunk_logits.cpu())
        cpu_cache = chunk_cache.to('cpu')
        print_gpu_memory_usage(f"Batch {batch_idx}")
        cache_chunks.append(cpu_cache)
    return torch.cat(logit_chunks, dim=0), concatenate_activation_caches(cache_chunks)

# Run the IOI token tensor (cast to long) through the model 12 rows at a time; results end up on CPU.
logits_from_batched, cache_from_batched = run_with_cache_batched(model, ioi_dataset.toks.long(), batch_size=12)

Batch 0 - Memory Allocated: 10.61 GB, Memory Reserved: 24.87 GB
Batch 1 - Memory Allocated: 10.61 GB, Memory Reserved: 24.87 GB


In [9]:
# Total byte size of every tensor in the batched cache, reported in GB.
running_size = sum(
    tensor_memory_size(cache_from_batched[key]) for key in cache_from_batched.keys()
)
print(f"Total size: {running_size / (1024 ** 3):.2f} GB")

Total size: 23.15 GB


: 

In [19]:
# NOTE(review): the recorded value for this cell (15052800 bytes) does not match the
# 23.15 GB total printed by the cell that computes running_size; the two outputs
# appear to come from different runs — re-run top to bottom to confirm.
running_size

15052800

In [15]:
# Unbatched reference run; later cells compare `logits` and `cache` with the batched results.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error, yet later
# cells use `logits` and `cache` — those values must come from an earlier kernel state.
# Confirm this cell can actually run on the target GPU before trusting the comparisons.
logits, cache = model.run_with_cache(ioi_dataset.toks.long())

OutOfMemoryError: CUDA out of memory. Tried to allocate 460.00 MiB. GPU 0 has a total capacty of 47.54 GiB of which 8.38 MiB is free. Process 1312631 has 308.00 MiB memory in use. Process 2252695 has 47.22 GiB memory in use. Of the allocated memory 44.73 GiB is allocated by PyTorch, and 2.18 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [26]:
# Inspect the shape of the 'hook_embed' activation from the unbatched run.
# NOTE(review): recorded shape is (100, 21, 768); confirm the last dimension matches BASE_MODEL,
# since this output may come from a run with a different model.
cache['hook_embed'].shape

torch.Size([100, 21, 768])

In [27]:
# Report every cached activation whose leading dimension is not 100.
for k in cache.keys():
    if cache[k].shape[0] == 100:
        continue
    print(k, cache[k].shape)

In [128]:
# The batched logits were moved to CPU by run_with_cache_batched, while the unbatched
# logits stay on the model's device; torch.allclose needs both tensors on the same
# device, so bring the reference to CPU first (a no-op if it is already there).
torch.allclose(logits.cpu(), logits_from_batched, atol=1e-01)

True

In [129]:
# Compare every cached activation from the unbatched run with its batched counterpart.
# For each key that differs beyond atol=1e-01, print the key and both shapes, then list
# the prompts whose end-position logits differ beyond atol=1e-02.
# The batched results live on CPU, so reference tensors are moved to CPU before
# comparing; torch.allclose fails on tensors that sit on different devices.
# NOTE(review): the per-prompt logit check does not depend on `k`, so it is repeated for
# every mismatching key; hoist it out of the loop if one report is enough.
n_prompts = logits.shape[0]  # derived from the data instead of a hard-coded 100
for k in cache.keys():
    if not torch.allclose(cache[k].cpu(), cache_from_batched[k], atol=1e-01):
        print(k, cache[k].shape, cache_from_batched[k].shape)
        for p in range(n_prompts):
            end = ioi_dataset.word_idx['end'][p]
            ref_logits = logits[p][end].cpu()
            batched_logits = logits_from_batched[p][end]
            if not torch.allclose(ref_logits, batched_logits, atol=1e-02):
                print(f"index {p}")
                print(ioi_dataset.ioi_prompts[p])
                print(ref_logits[:5], batched_logits[:5])