In [1]:
from IPython import get_ipython
from IPython.display import clear_output, display

# Enable autoreload so that edits to imported modules are picked up
# without restarting the kernel.
ipython = get_ipython()
# `InteractiveShell.magic()` is deprecated; `run_line_magic(name, line)` is the
# supported equivalent and takes the magic name and its arguments separately.
ipython.run_line_magic("load_ext", "autoreload")
ipython.run_line_magic("autoreload", "2")

  ipython.magic("load_ext autoreload")
  ipython.magic("autoreload 2")


In [2]:
import os
from typing import List, Optional, Union, Dict, Tuple
from pathlib import Path 

import torch
from torch import Tensor
import numpy as np
import einops
from fancy_einsum import einsum
import circuitsvis as cv

import transformer_lens.utils as tl_utils

from transformer_lens import HookedTransformer
import transformer_lens.patching as patching

from transformers import AutoModelForCausalLM

from torch import Tensor
from jaxtyping import Float
import plotly.express as px

from functools import partial

from torchtyping import TensorType as TT

from path_patching_cm.path_patching import Node, IterNode, path_patch, act_patch
from path_patching_cm.ioi_dataset import IOIDataset, NAMES
from neel_plotly import imshow as imshow_n

from utils.visualization import imshow_p, plot_attention_heads, plot_attention

from utils.visualization_utils import (
    plot_attention_heads,
    scatter_attention_and_contribution,
    get_attn_head_patterns
)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Turn autograd off globally for the rest of the notebook session.
torch.set_grad_enabled(False)

# Load Pythia-160m into a HookedTransformer. The keyword flags below select
# TransformerLens weight-processing options (see the TransformerLens docs for
# their exact effect).
model = HookedTransformer.from_pretrained(
    "EleutherAI/pythia-160m",
    center_unembed=True,
    center_writing_weights=True,
    fold_ln=True,
    refactor_factored_attn_matrices=False,
)
# Presumably exposes the MLP-input hook point for later patching -- TODO confirm.
model.set_use_hook_mlp_in(True)

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/375M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer


## Greater-Than

In [3]:
from data.greater_than_dataset import get_prob_diff, YearDataset, get_valid_years

In [5]:
# Build the greater-than dataset from years returned by get_valid_years(tokenizer, 1100, 1800).
# The second argument (1000) is presumably the number of examples: the logits
# computed from ds.good_toks later have a leading dimension of 1000 -- TODO confirm.
ds = YearDataset(get_valid_years(model.tokenizer, 1100, 1800), 1000, Path("data/potential_nouns.txt"), model.tokenizer)

def batch(iterable, n:int=1):
    """Lazily split ``iterable`` into consecutive lists of ``n`` items.

    Items are emitted in their original order. A chunk is yielded as soon as
    it holds exactly ``n`` items; whatever is left over once the input is
    exhausted is yielded as a final, shorter chunk. An empty input yields
    nothing.
    """
    chunk = []
    for element in iterable:
        chunk.append(element)
        if len(chunk) != n:
            # Not full yet: keep collecting.
            continue
        yield chunk
        chunk = []
    # Flush the trailing partial chunk, if any.
    if len(chunk) > 0:
        yield chunk

# Split the sentences and year labels into chunks of 9; the final chunk of each
# list may be shorter, since batch() flushes any remainder.
clean = list(batch(ds.good_sentences, 9))
labels = list(batch(ds.years_YY, 9))
corrupted = list(batch(ds.bad_sentences, 9))

In [6]:
# Index of a single example to inspect by hand; within this notebook it is only
# referenced by the disabled token-inspection line below.
IDX = 768
#model.to_str_tokens(ds.good_toks[IDX]), model.to_str_tokens(ds.bad_toks[IDX])

In [7]:
#input_length = 1 + len(model.tokenizer(ds.good_sentences[0])[0])
# Build the probability-difference metric function from the model's tokenizer.
prob_diff = get_prob_diff(model.tokenizer)

In [13]:
from utils.circuit_utils import run_with_batches

# Run the model over the clean and corrupted token tensors with batch_size=20.
# max_seq_len=12 presumably bounds the sequence dimension of the returned logits
# (the captured shape is [1000, 12, 50304]) -- TODO confirm in run_with_batches.
clean_logits = run_with_batches(model, ds.good_toks.to(device), batch_size=20, max_seq_len=12)
corrupted_logits = run_with_batches(model, ds.bad_toks.to(device), batch_size=20, max_seq_len=12)

In [14]:
# Sanity check on the logits tensor's dimensions.
clean_logits.shape

torch.Size([1000, 12, 50304])

In [16]:
# Metric value on the clean prompts (negative in the captured run).
prob_diff(clean_logits,ds.years_YY)

tensor(-0.8314, device='cuda:0')

In [17]:
# Metric value on the corrupted prompts (positive in the captured run).
prob_diff(corrupted_logits,ds.years_YY)

tensor(0.3180, device='cuda:0')

In [18]:
# Show one clean prompt to illustrate the task format.
ds.good_sentences[0]

'The test lasted from the year 1702 to the year 17'

In [5]:
from data.greater_than_dataset import get_year_indices
from transformers import PreTrainedTokenizer

def get_prob_diff(tokenizer: PreTrainedTokenizer):
    """Build a probability-difference metric for the greater-than task.

    Args:
        tokenizer: Passed to ``get_year_indices`` to obtain the vocabulary
            indices used to pick out the year-token columns of the softmax
            (presumably ordered by two-digit year value -- TODO confirm
            against ``get_year_indices``).

    Returns:
        A closure ``prob_diff(logits, per_prompt, years)``.
    """
    year_indices = get_year_indices(tokenizer)

    def prob_diff(logits, per_prompt, years):
        """Return the negated mean probability difference over all prompts.

        For each prompt, the probability mass on year tokens after position
        ``year`` minus the mass on positions up to and including ``year`` is
        computed from the final sequence position. The mean over prompts is
        negated so the value can be used as a loss.

        Args:
            logits: Tensor indexed as ``logits[:, -1]``; assumed to be
                (batch, seq, vocab) -- TODO confirm with callers.
            per_prompt: Accepted for interface compatibility but not read;
                the result is always the mean over prompts.
            years: One integer index per prompt, used to split the
                year-token probabilities.

        Returns:
            A 0-dim tensor, placed on CUDA when a CUDA device is available
            and otherwise left on the device of ``logits``.
        """
        # Softmax over the full vocabulary at the last position, then keep
        # only the year-token columns.
        probs = torch.softmax(logits[:, -1], dim=-1)[:, year_indices]
        diffs = [
            prob[year + 1 :].sum() - prob[: year + 1].sum()
            for prob, year in zip(probs, years)
        ]
        # Prob diff (negative, since it's a loss)
        result = -torch.stack(diffs).mean()
        # The unconditional .to('cuda') raised on CPU-only machines; only move
        # the result when CUDA is actually available.
        return result.to("cuda") if torch.cuda.is_available() else result

    return prob_diff

In [6]:
from utils.model_utils import clear_gpu_memory, load_model
from utils.circuit_utils import CircuitMetric
import utils.circuit_utils as cu

# --- configuration ---
model_name = "pythia-160m"
model_full_name = "EleutherAI/pythia-160m"
model_tl_full_name = "EleutherAI/pythia-160m"
cache_dir = "model_cache"
batch_size = 20
# Sequence length passed to every batched run below.
max_seq_len = 12
# Output directory for the per-metric result tensors.
results_dir = f"results/{model_name}-no-dropout"

# Load the "step143000" checkpoint to compute the baselines.
model = load_model(
    model_full_name, model_tl_full_name, "step143000", cache_dir=cache_dir
)

# set up data
ds = YearDataset(get_valid_years(model.tokenizer, 1100, 1800), 100, Path("data/potential_nouns.txt"), model.tokenizer)

# Bind the dataset's year labels so the metric can be called on logits alone.
prob_diff = get_prob_diff(model.tokenizer)
prob_diff_metric = CircuitMetric("prob_diff", partial(prob_diff, years=ds.years_YY))

metrics = [prob_diff_metric]

# get baselines (use the configured batch_size rather than a second hard-coded 20)
clean_logits = cu.run_with_batches(model, ds.good_toks.to(device), batch_size=batch_size, max_seq_len=max_seq_len)
corrupted_logits = cu.run_with_batches(model, ds.bad_toks.to(device), batch_size=batch_size, max_seq_len=max_seq_len)

# The metric is a probability difference, so label it as such in the output.
clean_prob_diff = prob_diff_metric(clean_logits)
print(f"Clean prob diff: {clean_prob_diff:.4f}")

corrupted_prob_diff = prob_diff_metric(corrupted_logits)
print(f"Corrupted prob diff: {corrupted_prob_diff:.4f}")

# Release the baseline model before the per-checkpoint models are loaded.
clear_gpu_memory(model)

# specify checkpoint schedule

ckpts = [1, 2]

# get values over time
results_dict = cu.get_chronological_circuit_performance_flexible(
    model_full_name,
    model_tl_full_name,
    cache_dir,
    ckpts,
    clean_tokens=ds.good_toks.to(device),
    corrupted_tokens=ds.bad_toks.to(device),
    metrics=metrics,
    max_seq_len=max_seq_len,
    batch_size=batch_size,
)

# save results: one .pt file per key of results_dict
os.makedirs(results_dir, exist_ok=True)

for metric in results_dict.keys():
    torch.save(
        results_dict[metric], f"{results_dir}/{metric}.pt"
    )

model_cache/pythia-160m/step143000


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
Clean logit diff: -0.8333
Corrupted logit diff: 0.2896
Moving model to device:  cpu
Loading model for step 1...
model_cache/pythia-160m/step1


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
Getting metric values...
Moving model to device:  cpu
Loading model for step 2...
model_cache/pythia-160m/step2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loaded pretrained model EleutherAI/pythia-160m into HookedTransformer
Getting metric values...


### Sentiment

In [None]:
from