## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import numpy as np


In [3]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer
from safetensors.torch import load_file

from quelle.approx_unrolling.utils import TensorDict

import os

In [None]:
torch.zeros(
    1,
    dtype=torch.float16,
    requires_grad=True,
)

tensor([0.], dtype=torch.float16, requires_grad=True)

## Load model

In [5]:
from transformers import GPTNeoXForCausalLM, AutoTokenizer

model_str = "EleutherAI/pythia-14m"
step = 5000
model = GPTNeoXForCausalLM.from_pretrained(
    model_str,
    revision=f"step{step}",
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(
    model_str,
    revision=f"step{step}",
)

## Debugging gradient covariance

In [None]:
import random


torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.cuda.manual_seed_all(42)  # For multi-GPU
np.random.seed(42)
random.seed(42)

from quelle.approx_unrolling.language_task import LanguageModelingTask
from quelle.approx_unrolling.pile_data import get_pile_dataset
from quelle.hessians.analyzer import prepare_model
from quelle.hessians.arguments import FactorArguments


module_keys = [m[0] for m in model.named_modules()]

task = LanguageModelingTask(module_keys=module_keys)


factors_name = "ekfac"
factor_args = FactorArguments(strategy=factors_name)  # type:ignore
factors_name += "_half"

model = prepare_model(model, task)


from quelle.hessians.analyzer import Analyzer
from quelle.hessians.utils.dataset import DataLoaderKwargs
from transformers import default_data_collator


analyzer = Analyzer(
    analysis_name="",
    model=model,
    task=task,
)

module_partition_names, target_module_partitions = analyzer._get_module_partition(
    module_partitions=factor_args.lambda_module_partitions,
    target_module_partitions=None,
)


logger:logger.py:log:INFO:  Tracking modules with names: ['gpt_neox.layers.0.attention.query_key_value', 'gpt_neox.layers.0.attention.dense', 'gpt_neox.layers.0.mlp.dense_h_to_4h', 'gpt_neox.layers.0.mlp.dense_4h_to_h', 'gpt_neox.layers.1.attention.query_key_value', 'gpt_neox.layers.1.attention.dense', 'gpt_neox.layers.1.mlp.dense_h_to_4h', 'gpt_neox.layers.1.mlp.dense_4h_to_h', 'gpt_neox.layers.2.attention.query_key_value', 'gpt_neox.layers.2.attention.dense', 'gpt_neox.layers.2.mlp.dense_h_to_4h', 'gpt_neox.layers.2.mlp.dense_4h_to_h', 'gpt_neox.layers.3.attention.query_key_value', 'gpt_neox.layers.3.attention.dense', 'gpt_neox.layers.3.mlp.dense_h_to_4h', 'gpt_neox.layers.3.mlp.dense_4h_to_h', 'gpt_neox.layers.4.attention.query_key_value', 'gpt_neox.layers.4.attention.dense', 'gpt_neox.layers.4.mlp.dense_h_to_4h', 'gpt_neox.layers.4.mlp.dense_4h_to_h', 'gpt_neox.layers.5.attention.query_key_value', 'gpt_neox.layers.5.attention.dense', 'gpt_neox.layers.5.mlp.dense_h_to_4h', 'gpt_neox

In [7]:
train_dataset = get_pile_dataset(model_str="EleutherAI/pythia-14m", step=0, max_samples=1)
dataloader_kwargs = DataLoaderKwargs(collate_fn=default_data_collator)
analyzer.set_dataloader_kwargs(dataloader_kwargs)
dataloader_params = analyzer._configure_dataloader(dataloader_kwargs)
dataset = train_dataset

if factor_args.covariance_max_examples is None:
    total_data_examples = len(dataset)
else:
    total_data_examples = min([factor_args.covariance_max_examples, len(dataset)])

data_partition_indices, target_data_partitions = analyzer._get_data_partition(
    total_data_examples=total_data_examples,
    data_partitions=factor_args.covariance_data_partitions,
    target_data_partitions=None,
)

for data_partition in target_data_partitions:
    start_index, end_index = data_partition_indices[data_partition]

loader = analyzer._get_dataloader(
    dataset=train_dataset,
    per_device_batch_size=32,
    dataloader_params=dataloader_params,
    indices=list(range(start_index, end_index)),
    allow_duplicates=False,
)


pile_data:pile_data.py:get_pile_dataset:INFO:  Loading Pile 10k dataset...


pile_data:pile_data.py:get_pile_dataset:INFO:  Loading tokenizer for EleutherAI/pythia-14m at step 0...
pile_data:pile_data.py:get_pile_dataset:INFO:  Limiting to 1 samples...
pile_data:pile_data.py:get_pile_dataset:INFO:  Tokenizing dataset...


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


pile_data:pile_data.py:get_pile_dataset:INFO:  Grouping texts into chunks of 2048 tokens...


num_proc must be <= 1. Reducing num_proc to 1 for dataset of size 1.


pile_data:pile_data.py:get_pile_dataset:INFO:  Final dataset size: 1 samples
pile_data:pile_data.py:get_pile_dataset:INFO:  Each sample has 2048 tokens
logger:logger.py:log:INFO:  Using the DataLoader parameters: {'num_workers': 0, 'collate_fn': <function default_data_collator at 0x7ba99811f9c0>, 'pin_memory': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': ''}.


In [6]:
# cov = analyzer.fit_covariance_matrices(
#     factors_name=factors_name,
#     dataset=train_dataset,
#     per_device_batch_size=32,
#     initial_per_device_batch_size_attempt=32,
#     dataloader_kwargs=dataloader_kwargs,
#     factor_args=factor_args,
# )

# cov_2 = analyzer.fit_covariance_matrices(
#     factors_name=factors_name,
#     dataset=train_dataset,
#     per_device_batch_size=32,
#     initial_per_device_batch_size_attempt=32,
#     dataloader_kwargs=dataloader_kwargs,
#     factor_args=factor_args,
# )

In [8]:
from quelle.hessians.factor.covariance import fit_covariance_matrices_with_loader


d = fit_covariance_matrices_with_loader(
    model=analyzer.model,
    state=analyzer.state,
    task=analyzer.task,
    loader=loader,
    factor_args=factor_args,
    tracked_module_names=module_partition_names[0],
    disable_tqdm=analyzer.disable_tqdm,
)[1]

d["gradient_covariance"].keys()

test_key = "gpt_neox.layers.0.attention.query_key_value"

activ_1 = TensorDict(d["activation_covariance"])
grad_1 = TensorDict(d["gradient_covariance"])


Fitting covariance matrices [0/1]   0%|           [time left: ?, time spent: 00:00]

Fitting covariance matrices [1/1] 100%|██████████ [time left: 00:00, time spent: 00:00]


In [9]:
from quelle.hessians.factor.covariance import fit_covariance_matrices_with_loader


d_2 = fit_covariance_matrices_with_loader(
    model=analyzer.model,
    state=analyzer.state,
    task=analyzer.task,
    loader=loader,
    factor_args=factor_args,
    tracked_module_names=module_partition_names[0],
    disable_tqdm=analyzer.disable_tqdm,
)[1]
activ_2 = TensorDict(d_2["activation_covariance"])
grad_2 = TensorDict(d_2["gradient_covariance"])


Fitting covariance matrices [1/1] 100%|██████████ [time left: 00:00, time spent: 00:00]


In [10]:
diff_activ = activ_1 - activ_2
diff_grad = grad_1 - grad_2

close_activ = activ_1.allclose(activ_2, atol=1e-4, rtol=1e-4)
close_grad = grad_1.allclose(grad_2, atol=1e-4, rtol=1e-4)


In [11]:
for k, v in close_activ.items():
    if not v:
        print(f"activations differ for {k}: {diff_activ[k].max()}")

In [12]:
for k, v in close_grad.items():
    if not v:
        print(f"activations differ for {k}: {diff_grad[k].max()}")

activations differ for gpt_neox.layers.0.attention.query_key_value: 11.477673530578613
activations differ for gpt_neox.layers.0.attention.dense: 501.98358154296875
activations differ for gpt_neox.layers.0.mlp.dense_h_to_4h: 20.246353149414062
activations differ for gpt_neox.layers.0.mlp.dense_4h_to_h: 501.98358154296875
activations differ for gpt_neox.layers.1.attention.query_key_value: 20.3133544921875
activations differ for gpt_neox.layers.1.attention.dense: 351.5631103515625
activations differ for gpt_neox.layers.1.mlp.dense_h_to_4h: 10.070480346679688
activations differ for gpt_neox.layers.1.mlp.dense_4h_to_h: 351.5631103515625
activations differ for gpt_neox.layers.2.attention.query_key_value: 44.117828369140625
activations differ for gpt_neox.layers.2.attention.dense: 365.1512451171875
activations differ for gpt_neox.layers.2.mlp.dense_h_to_4h: 28.146560668945312
activations differ for gpt_neox.layers.2.mlp.dense_4h_to_h: 365.1512451171875
activations differ for gpt_neox.layers.3

In [59]:
state = analyzer.state
from accelerate.utils import find_batch_size, send_to_device
from torch import GradScaler, autocast
from quelle.hessians.module.utils import set_attention_mask
from quelle.hessians.utils.state import State, no_sync

enable_amp = True

enable_grad_scaler = enable_amp and factor_args.amp_dtype == torch.float16

scaler = GradScaler(device="cuda", init_scale=factor_args.amp_scale, enabled=enable_grad_scaler)
for index, batch in enumerate(loader):
    batch = send_to_device(batch, device=state.device)

    attention_mask = task.get_attention_mask(batch=batch)

    if attention_mask is not None:
        set_attention_mask(model=model, attention_mask=attention_mask)

    with no_sync(model=model, state=state):
        model.zero_grad(set_to_none=True)
        with autocast(
            device_type=state.device.type,
            enabled=enable_amp,
            dtype=factor_args.amp_dtype,
        ):
            loss = task.compute_train_loss(
                batch=batch,
                model=model,
                sample=True,
            )

        scaler.scale(loss).backward()

In [60]:
print(loss)

tensor(9166.1602, device='cuda:0', grad_fn=<NllLossBackward0>)


In [49]:
modules = list(model.modules())

In [45]:
modules[11].storage

{'activation_eigenvectors': None,
 'activation_eigenvalues': None,
 'gradient_eigenvectors': None,
 'gradient_eigenvalues': None,
 'lambda_matrix': None,
 'num_lambda_processed': None,
 'activation_covariance': None,
 'gradient_covariance': None,
 'num_activation_covariance_processed': None,
 'num_gradient_covariance_processed': None}

In [46]:
from quelle.hessians.module.utils import load_factors
from quelle.hessians.utils.constants import COVARIANCE_FACTOR_NAMES


for factor_name in COVARIANCE_FACTOR_NAMES:
    factor = load_factors(
        model=model,
        factor_name=factor_name,
        tracked_module_names=None,
        cpu=True,
    )

In [14]:
from quelle.hessians.module.tracked_module import ModuleMode, TrackedModule

for module in model.modules():
    
    if isinstance(module, TrackedModule):


TrackedLinear(
  (original_module): Linear(in_features=128, out_features=384, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=128, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=512, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=512, out_features=128, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=384, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=128, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=512, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=512, out_features=128, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=384, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=128, bias=True)
)
TrackedLinear(
  (original_module): Linear(in_features=128, out_features=512, bias=True)
)

## 1. Exammining the matrices

In [128]:
def test_comparison(path_1, path_2):
    files_1 = os.listdir(path_1)
    files_2 = os.listdir(path_2)

    for file_1 in files_1:
        if file_1 in files_2:
            if file_1.endswith(".safetensors"):
                tensor_1 = TensorDict(
                    load_file(
                        os.path.join(path_1, file_1),
                        device="cuda",
                    )
                )
                tensor_2 = TensorDict(
                    load_file(
                        os.path.join(path_2, file_1),
                        device="cuda",
                    )
                )
                diff = tensor_1 - tensor_2
                all_close = tensor_1.allclose(tensor_2, rtol=1e-5, atol=1e-5)
                all_close_values = all(all_close.values())
                if not all_close_values:
                    print(file_1)
                    print("Differences found:")
                    print(diff.max())
                # check if all_close has any key that is False


In [129]:
path_1 = "/root/quelle/tests/caches/cache_1/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half"
path_2 = "/root/quelle/tests/caches/cache_2/.models/EleutherAI/pythia-14m/checkpoint_1000/influence_results/factors_ekfac_half"

In [None]:
test_comparison(path_2, path_1)

In [None]:
test_comparison(path_2, path_1)

In [84]:
tensor_1 = TensorDict(load_file(os.path.join(path_1, "gradient_covariance.safetensors"), device="cuda"))
tensor_2 = TensorDict(load_file(os.path.join(path_2, "gradient_covariance.safetensors"), device="cuda"))

In [None]:
(tensor_1 - tensor_2).argmax()

In [None]:
for k, v in (tensor_1 - tensor_2).items():
    print(k, v.max().item(), v.min().item(), v.mean().item(), v.std().item())

In [None]:
d = TensorDict(
    load_file(
        "/root/quelle/quelle/approx_unrolling/.models/EleutherAI/pythia-14m/segment_0/influence_results/factors_ekfac_half/average_gradient_covariance.safetensors",
        device="cuda",
    )
)


d_2 = TensorDict(
    load_file(
        "/root/quelle/.models/EleutherAI/influence_results/factors_ekfac_half/gradient_covariance.safetensors",
        device="cuda",
    )
)

diff = d - d_2

for k, v in diff.items():
    if v.max() < 1e-5:
        continue
    print(k)
    print(v.max())
    print("----" * 10)

In [None]:
number = 15335424
# determine prime decomposition of number
number / 7488


In [5]:
from quelle.approx_unrolling.utils import TensorDict


d = TensorDict(d)

In [7]:
test_list = [d, d]

In [None]:
sum(test_list)

In [42]:
path = "/home/louis/quelle/quelle/approx_unrolling/influence_results/wikitext/factors_ekfac_half"

# list of all subfolders or files
import os

subfolders = []
for dirpath, dirnames, filenames in os.walk(path):
    for dirname in dirnames:
        subfolders.append(os.path.join(dirpath, dirname))
    for filename in filenames:
        subfolders.append(os.path.join(dirpath, filename))


In [6]:
filenames
filenames_json = [f for f in subfolders if f.endswith(".json")]
filesnames_safetensors = [f for f in subfolders if f.endswith(".safetensors")]

In [None]:
filenames_json

In [None]:
import json

json_file = filenames_json[0]
with open(json_file, "r") as f:
    data = json.load(f)
data

In [None]:
filesnames_safetensors

In [None]:
from safetensors import safe_open

for i in range(len(filesnames_safetensors)):
    path = filesnames_safetensors[i]
    tensors = {}
    with safe_open(path, framework="pt", device=0) as f:
        for k in f.keys():
            tensors[k] = f.get_tensor(k)
    print(tensors.keys())

In [11]:
path = filesnames_safetensors[-2]
with safe_open(path, framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)

In [29]:
gpt_2 = torch.load("/home/louis/quelle/quelle/approx_unrolling/checkpoints/model.pth")

In [6]:
all_weights = model.state_dict()

all_mlps = {k: v for k, v in all_weights.items() if "mlp" in k}


In [None]:
len(all_mlps)

In [None]:
type(model.named_modules())

In [31]:
module_keys = [m[0] for m in model.named_modules()]

In [48]:
track_attention = True
track_mlp = True
total_modules = []
for m in module_keys:
    if "dropout" in m.lower() or "layernorm" in m.lower():
        continue

    if "attention" in m.lower() and track_attention:
        total_modules.append(m)
    if "mlp" in m.lower() and track_mlp:
        total_modules.append(m)


In [4]:
from examples.wikitext.pipeline import get_wikitext_dataset

train_dataset = get_wikitext_dataset(
    split="eval_train",
)

In [7]:
# sample from train_dataset
sample = train_dataset[0]
