In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import HookedRootModule, HookPoint
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
model = HookedTransformer.from_pretrained("distilgpt2", device=device)

Using pad_token, but it is not set yet.


Loaded pretrained model distilgpt2 into HookedTransformer


# Loading BoolQ

In [4]:
import json

def read_jsonl(filename):
    with open(filename, "r") as f:
        lines = f.readlines()
    
    return [json.loads(line) for line in lines]

def load_data_boolq(filename):

    boolq = pd.DataFrame(read_jsonl(filename))

    questions = [val + "?" for val in boolq["question"]]
    
    tokens = model.tokenizer(questions, padding=True, return_tensors="pt").input_ids.to(device)

    return tokens, np.array(boolq["answer"])

tokens, labels = load_data_boolq("train.jsonl")

In [5]:
print(len(tokens))

9427


In [53]:
def batched_run_with_cache(model, data, batch_size=16, layer=23, token=-1):
    activations = []
    out_vals = []
    with torch.no_grad():
        for i in range(0, len(data), batch_size):
            if i % 1000 == 0:
                print(i)
            out, cache = model.run_with_cache(data[i:i+batch_size])
            out_vals.append(out)
            activations.append(cache[f"blocks.{layer}.hook_resid_post"][:, token])
    
    return torch.cat(activations, dim=0), torch.cat(out_vals, dim=0)

activations, out = batched_run_with_cache(model, tokens)

0


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 39.45 GiB total capacity; 37.03 GiB already allocated; 8.25 MiB free; 37.58 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
model()

TypeError: HookedTransformer.forward() got an unexpected keyword argument 'batch_size'

# Loading TruthfulQA

In [None]:
qadata = pd.read_csv("TruthfulQA.csv")
qadata_test = pd.read_csv("TruthfulQATest.csv")

In [None]:
true_sentences = []
false_sentences = []

for row in qadata.iloc:
    corrects = row["Correct Answers"].split("; ")
    incorrects = row["Incorrect Answers"].split("; ")

    for correct in corrects:
        true_sentences.append(f"Q: {row['Question']}\nA: {correct}")

    for incorrect in incorrects:
        false_sentences.append(f"Q: {row['Question']}\nA: {incorrect}")

In [None]:
true_sentences_test = []
false_sentences_test = []

for row in qadata_test.iloc:
    corrects = row["Correct Answers"].split("; ")
    incorrects = row["Incorrect Answers"].split("; ")

    for correct in corrects:
        true_sentences_test.append(f"Q: {row['Question']}\nA: {correct}")

    for incorrect in incorrects:
        false_sentences_test.append(f"Q: {row['Question']}\nA: {incorrect}")

In [None]:
q = """Q: Why do veins appear blue?
A: Veins appear blue because there is no blood
Q: Was that answer true? (Yes/No)
A:"""

In [None]:
out = model(model.tokenizer([q], padding=True, return_tensors="pt").input_ids.to(device))
print(model.to_str_tokens(np.argsort(out[0, -1].detach().cpu().numpy())[::-1][:5]))

[' Yes', ' Ve', ' No', ' The', ' It']


# Model Training

In [None]:
true_data = model.tokenizer(true_sentences, padding=True, return_tensors="pt").input_ids.to(device)
false_data = model.tokenizer(false_sentences, padding=True, return_tensors="pt").input_ids.to(device)

true_data_test = model.tokenizer(true_sentences_test, padding=True, return_tensors="pt").input_ids.to(device)
false_data_test = model.tokenizer(false_sentences_test, padding=True, return_tensors="pt").input_ids.to(device)

In [None]:
layer_num = 1
batch_size = 16

activations_list = []

for i in range(0, len(true_data), batch_size):

    out, cache = model.run_with_cache(true_data[i:i+batch_size])

    activations = cache[f"blocks.{layer_num}.hook_resid_post"][:, -1, :]
    
    activations_list.append(activations)

positive_samples = torch.cat(activations_list, dim=0)

In [None]:
activations_list = []

for i in range(0, len(false_data), batch_size):

    out, cache = model.run_with_cache(false_data[i:i+batch_size])

    activations = cache[f"blocks.{layer_num}.hook_resid_post"][:, -1, :]
    
    activations_list.append(activations)

negative_samples = torch.cat(activations_list, dim=0)

In [None]:
data_samples = torch.cat((positive_samples, negative_samples), dim=0)
labels = torch.tensor([1.] * len(positive_samples) + [0.] * len(negative_samples), device=device)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data_train, data_test, labels_train, labels_test = train_test_split(data_samples.cpu(), labels.cpu(), test_size=0.1)

lr_model = LogisticRegression()

fitted_model = lr_model.fit(data_train, labels_train)

# print(fitted_model.predict(data_test))

print(fitted_model.score(data_test, labels_test))
# print(fitted_model.score(data_samples.cpu()[-10:], labels.cpu()[-10:]))

1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
out, cache = model.run_with_cache(true_data_test)

activations = cache[f"blocks.{layer_num}.hook_resid_post"][:, -1, :]

fitted_model.predict(activations.cpu())

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.], dtype=float32)

In [None]:
model.to_str_tokens(true_data[2])

['Q',
 ':',
 ' Why',
 ' do',
 ' veins',
 ' appear',
 ' blue',
 '?',
 '\n',
 'A',
 ':',
 ' Ve',
 'ins',
 ' appear',
 ' blue',
 ' because',
 ' blue',
 ' light',
 ' does',
 ' not',
 ' penetrate',
 ' deeply',
 ' into',
 ' human',
 ' tissue',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|en

In [None]:
model.to_str_tokens(torch.argmax(out.cpu(), dim=-1)[2])

['.',
 ' What',
 ' is',
 ' when',
 ' the',
 ' when',
 ' you',
 "'re",
 ' too',
 'melon',
 '?',
 '?',
 '\n',
 '\n',
 ':',
 ' I',
 ' seeds',
 'melon',
 ' seeds',
 ' are',
 ' through',
 ' a',
 ' body',
 ' system',
 ' and',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The']

In [None]:
print(torch.tensor(data["input_ids"]))

NameError: name 'data' is not defined