## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np


In [3]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from safetensors.torch import load_file

from bergson.approx_unrolling.utils import TensorDict

import os

In [4]:
torch.zeros(
    1,
    dtype=torch.float16,
    requires_grad=True,
)

tensor([0.], dtype=torch.float16, requires_grad=True)

## Load model

In [5]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

model_str = "EleutherAI/pythia-14m"
step = 5000
model = GPTNeoXForCausalLM.from_pretrained(
    model_str,
    revision=f"step{step}",
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(
    model_str,
    revision=f"step{step}",
)

## 1. Exammining the matrices

In [128]:
def test_comparison(path_1, path_2):
    files_1 = os.listdir(path_1)
    files_2 = os.listdir(path_2)

    for file_1 in files_1:
        if file_1 in files_2:
            if file_1.endswith(".safetensors"):
                tensor_1 = TensorDict(
                    load_file(
                        os.path.join(path_1, file_1),
                        device="cuda",
                    )
                )
                tensor_2 = TensorDict(
                    load_file(
                        os.path.join(path_2, file_1),
                        device="cuda",
                    )
                )
                diff = tensor_1 - tensor_2
                all_close = tensor_1.allclose(tensor_2, rtol=1e-5, atol=1e-5)
                all_close_values = all(all_close.values())
                if not all_close_values:
                    print(file_1)
                    print("Differences found:")
                    print(diff.max())
                # check if all_close has any key that is False


In [129]:
path_1 = "/root/bergson/tests/caches/cache_1/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half"
path_2 = "/root/bergson/tests/caches/cache_2/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half"

In [None]:
test_comparison(path_2, path_1)

In [None]:
test_comparison(path_2, path_1)

In [84]:
tensor_1 = TensorDict(load_file(os.path.join(path_1, "gradient_covariance.safetensors"), device="cuda"))
tensor_2 = TensorDict(load_file(os.path.join(path_2, "gradient_covariance.safetensors"), device="cuda"))

In [None]:
(tensor_1 - tensor_2).argmax()

In [None]:
for k, v in (tensor_1 - tensor_2).items():
    print(k, v.max().item(), v.min().item(), v.mean().item(), v.std().item())

In [None]:
d = TensorDict(
    load_file(
        "/root/bergson/bergson/approx_unrolling/.models/EleutherAI/pythia-14m/segment_0/influence_results/factors_ekfac_half/average_gradient_covariance.safetensors",
        device="cuda",
    )
)


d_2 = TensorDict(
    load_file(
        "/root/bergson/.models/EleutherAI/influence_results/factors_ekfac_half/gradient_covariance.safetensors",
        device="cuda",
    )
)

diff = d - d_2

for k, v in diff.items():
    if v.max() < 1e-5:
        continue
    print(k)
    print(v.max())
    print("----" * 10)

In [None]:
number = 15335424
# determine prime decomposition of number
number / 7488


In [5]:
from bergson.approx_unrolling.utils import TensorDict


d = TensorDict(d)

In [7]:
test_list = [d, d]

In [None]:
sum(test_list)

In [42]:
path = "/home/louis/bergson/bergson/approx_unrolling/influence_results/wikitext/factors_ekfac_half"

# list of all subfolders or files
import os

subfolders = []
for dirpath, dirnames, filenames in os.walk(path):
    for dirname in dirnames:
        subfolders.append(os.path.join(dirpath, dirname))
    for filename in filenames:
        subfolders.append(os.path.join(dirpath, filename))


In [6]:
filenames
filenames_json = [f for f in subfolders if f.endswith(".json")]
filesnames_safetensors = [f for f in subfolders if f.endswith(".safetensors")]

In [None]:
filenames_json

In [None]:
import json

json_file = filenames_json[0]
with open(json_file, "r") as f:
    data = json.load(f)
data

In [None]:
filesnames_safetensors

In [None]:
from safetensors import safe_open

for i in range(len(filesnames_safetensors)):
    path = filesnames_safetensors[i]
    tensors = {}
    with safe_open(path, framework="pt", device=0) as f:
        for k in f.keys():
            tensors[k] = f.get_tensor(k)
    print(tensors.keys())

In [11]:
path = filesnames_safetensors[-2]
with safe_open(path, framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

In [29]:
gpt_2 = torch.load("/home/louis/bergson/bergson/approx_unrolling/checkpoints/model.pth")

In [6]:
all_weights = model.state_dict()

all_mlps = {k: v for k, v in all_weights.items() if "mlp" in k}


In [None]:
len(all_mlps)

In [None]:
type(model.named_modules())

In [31]:
module_keys = [m[0] for m in model.named_modules()]

In [48]:
track_attention = True
track_mlp = True
total_modules = []
for m in module_keys:
    if "dropout" in m.lower() or "layernorm" in m.lower():
        continue

    if "attention" in m.lower() and track_attention:
        total_modules.append(m)
    if "mlp" in m.lower() and track_mlp:
        total_modules.append(m)


In [4]:
from examples.wikitext.pipeline import get_wikitext_dataset

train_dataset = get_wikitext_dataset(
    split="eval_train",
)

In [7]:
# sample from train_dataset
sample = train_dataset[0]


## OOM debugging

In [8]:
import pickle
from collections import defaultdict


def analyze_gpu_memory_trace(pickle_file):
    with open(pickle_file, "rb") as f:
        snapshot = pickle.load(f)

    print("=== GPU Memory Timeline ===")
    allocations = []

    for trace in snapshot.get("traces", []):
        if trace.get("action") in ["alloc", "free"]:
            time_s = trace.get("ts", 0) / 1000000
            size_gb = trace.get("size", 0) / (1024**3)
            action = trace.get("action")

            # Get user code location
            frames = trace.get("frames", [])
            user_frame = None
            for frame in frames:
                filename = frame.get("filename", "")
                if not any(skip in filename for skip in ["torch", "cuda", "python", "site-packages"]):
                    user_frame = frame
                    break

            if user_frame and size_gb > 0.001:  # > 1MB
                allocations.append(
                    {
                        "time": time_s,
                        "size_gb": size_gb,
                        "action": action,
                        "file": user_frame.get("filename", "").split("/")[-1],
                        "line": user_frame.get("line", "?"),
                        "function": user_frame.get("name", "unknown"),
                    }
                )

    # Sort by time
    allocations.sort(key=lambda x: x["time"])

    print(f"{'Time (s)':>8} | {'Action':>5} | {'Memory (GB)':>10} | {'Location':>20}")
    print("-" * 60)

    running_total = 0
    for alloc in allocations:
        if alloc["action"] == "alloc":
            running_total += alloc["size_gb"]
        else:
            running_total -= alloc["size_gb"]

        location = f"{alloc['file']}:{alloc['line']}"
        print(f"{alloc['time']:8.3f} | {alloc['action']:>5} | {alloc['size_gb']:10.3f} | {location:>20}")

    print(f"\nPeak memory usage: {max(running_total, 0):.3f} GB")


# Use it

snapshot_path = "/root/quelle/bergson/approx_unrolling/auto_memory_trace.pickle"
analyze_gpu_memory_trace(snapshot_path)

=== GPU Memory Timeline ===
Time (s) | Action | Memory (GB) |             Location
------------------------------------------------------------

Peak memory usage: 0.000 GB


In [6]:
import torch
import pickle

# Enable recording
torch.cuda.memory._record_memory_history()

# Create a test tensor to ensure something gets recorded
print("Creating test tensor...")
test_tensor = torch.randn(1000, 1000).cuda()
print(f"Test tensor created, memory allocated: {torch.cuda.memory_allocated() / 1024**3:.3f} GB")

# Your existing code here...
# ... your training code ...

# Before dumping, check if there's any recorded history
try:
    snapshot = torch.cuda.memory._snapshot()
    print(f"Snapshot contains {len(snapshot.get('traces', []))} traces")

    # Dump to file
    torch.cuda.memory._dump_snapshot("debug_memory_trace.pickle")
    print("Snapshot dumped to file")

except Exception as e:
    print(f"Error creating snapshot: {e}")

# Disable recording
torch.cuda.memory._record_memory_history()

Creating test tensor...
Test tensor created, memory allocated: 0.004 GB
Snapshot contains 0 traces
Snapshot dumped to file


In [6]:
batch = 32
d = 100

A = torch.randn(batch, d, device="cuda")

In [8]:
# batch outer product
A_out = torch.einsum("bi,bj->bij", A, A)

In [23]:
A_out_mean = A_out.mean(dim=0)

In [24]:
A_out_mean

tensor([[ 1.2634e+00, -2.6464e-01,  1.8185e-01,  ..., -6.5764e-02,
         -3.6784e-01, -7.1913e-02],
        [-2.6464e-01,  1.3255e+00,  7.4584e-04,  ...,  2.5179e-01,
         -4.0879e-01, -1.2299e-01],
        [ 1.8185e-01,  7.4584e-04,  1.0283e+00,  ..., -3.0263e-01,
         -5.8569e-02,  1.7342e-01],
        ...,
        [-6.5764e-02,  2.5179e-01, -3.0263e-01,  ...,  9.6525e-01,
         -2.2347e-01, -6.7726e-02],
        [-3.6784e-01, -4.0879e-01, -5.8569e-02,  ..., -2.2347e-01,
          1.5817e+00,  7.9668e-03],
        [-7.1913e-02, -1.2299e-01,  1.7342e-01,  ..., -6.7726e-02,
          7.9668e-03,  6.3061e-01]], device='cuda:0')

In [25]:
A_out_mean_2 = A.T @ A / batch

In [28]:
A_out_mean.allclose(A_out_mean_2, atol=1e-5)

True

In [31]:
A_out_3 = torch.einsum("bi,bj->ij", A, A) / batch

In [32]:
A_out_3.allclose(A_out_mean_2, atol=1e-5)

True