In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import transformer_lens
import transformer_lens.utils as utils
from transformer_lens.hook_points import HookedRootModule, HookPoint
from transformer_lens import HookedTransformer, HookedTransformerConfig, FactoredMatrix, ActivationCache

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


In [3]:
model = HookedTransformer.from_pretrained("gpt2-medium", device=device)

Using pad_token, but it is not set yet.


Loaded pretrained model gpt2-medium into HookedTransformer


# Loading BoolQ

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

In [35]:
import json
import re

def read_jsonl(filename):
    with open(filename, "r") as f:
        lines = f.readlines()
    
    return [json.loads(line) for line in lines]

def clean_text(text):
    text = text.lower()
    text = re.sub("'(.*)'", r"\1", text)
    return text

def load_data_boolq(filename, tokenizer):

    boolq = pd.DataFrame(read_jsonl(filename))

    # questions = [f"Passage: {passage}\n\nAfter reading this passage, I have a question: {val}? True or False?" for val, passage in zip(boolq["question"], boolq["passage"])]

    questions = [f"{clean_text(val)}?  \\n {clean_text(passage)} ..."for val, passage in zip(boolq["question"], boolq["passage"])]

    unpadded = tokenizer(questions)

    lens = [len(seq) for seq in unpadded["input_ids"]]
    
    tokens = tokenizer(questions, padding=True, return_tensors="pt").input_ids.to(device)

    return tokens, lens, np.array(boolq["answer"])

tokens, seq_lens, labels = load_data_boolq("dev.jsonl", tokenizer)

In [24]:
yes_token = tokenizer("yes").input_ids[0]
no_token = tokenizer("no").input_ids[0]

In [38]:
outputs = []
batch_size = 32
for index in range(0, len(tokens), batch_size):
    with torch.inference_mode():
        # print(model(input_ids=tokens[index:index+batch_size]))
        res = model.generate(tokens[index:index+batch_size])
        results = np.array(tokenizer.batch_decode(res, skip_special_tokens=True)) == "yes"
        outputs.extend(results)
        # print(results == labels[index:index + batch_size])
        # out = model(input_ids=tokens[index:index+2])
        # outputs = out[torch.arange(2), np.array(seq_lens[index:index+2]) - 1, :]
        # yes_pred = np.array(outputs[:, yes_token].cpu())
        # no_pred = np.array(outputs[:, no_token].cpu())
        # print(list(zip(yes_pred, no_pred)))
# top = torch.topk(out[torch.arange(100), np.array(seq_lens[:100]) - 1, :], k=10, dim=-1)
# for i in range(len(top.indices)):
#     print(model.to_str_tokens(top.indices[i]))
# activations.append(cache[f"blocks.{layer}.hook_resid_post"][:, seq_lens[:10]])

In [39]:
np.sum(outputs == labels) / len(labels)

0.7602446483180428

In [5]:
def batched_run_with_cache(model, data, seq_lens, batch_size=16, layer=5):
    
    with torch.no_grad():

        seq_lens = np.array(seq_lens) - 1
        
        activations = []
        out_vals = []
        out_vals_prob = []
        for i in range(0, len(data), batch_size):
            if i % 1000 == 0:
                print(i)
            out, cache = model.run_with_cache(data[i:i+batch_size])
            top = torch.topk(out[:, seq_lens[i:i+batch_size]], k=10, dim=-1)
            out_vals.append(top.indices)
            out_vals_prob.append(top.values)
            activations.append(cache[f"blocks.{layer}.hook_resid_post"][:, seq_lens[i:i+batch_size]])
    
        return torch.cat(activations, dim=0), torch.cat(out_vals, dim=0), torch.cat(out_vals_prob, dim=0)

activations, out, probs = batched_run_with_cache(model, tokens, seq_lens)

0
2000
4000
6000
8000


RuntimeError: Sizes of tensors must match except in dimension 0. Expected size 16 but got size 3 for tensor number 589 in the list.

In [47]:
model.to_str_tokens(torch.argmax(model(tokens[5:6]), dim=-1))

["'t",
 ' tell',
 ' the',
 'ster',
 ' sauce',
 'igan',
 ' the',
 'bay',
 'ilon',
 '?',
 '?',
 '\n',
 ':',
 '1',
 '):',
 'False',
 '):',
 ' Yes',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The']

In [37]:
print(tokens)
out, cache = model.run_with_cache(tokens[:10])
top = torch.topk(out[:, -1], k=10, dim=-1)
print(top.indices)
# cache[f"blocks.{11}.hook_resid_post"][:, -1]

tensor([[ 4598,  4173,   272,  ..., 50256, 50256, 50256],
        [ 4598,   922,  6072,  ..., 50256, 50256, 50256],
        [  271,  9168,  3807,  ..., 50256, 50256, 50256],
        ...,
        [  271,   262, 10861,  ..., 50256, 50256, 50256],
        [   67,  6887,   261,  ..., 50256, 50256, 50256],
        [  271, 10330,   286,  ..., 50256, 50256, 50256]], device='cuda:0')
tensor([[ 464,   32, 1212,   40,  818, 1026,    1, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818, 1026,    1, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818, 1026,    1, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818, 1026,    1, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818, 1026,    1, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818, 1026,    1, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818,    1, 1026, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818, 1026,    1, 1135, 3886, 2215],
        [ 464,   32, 1212,   40,  818,    1, 1026, 1135, 3886

# Loading TruthfulQA

In [None]:
qadata = pd.read_csv("TruthfulQA.csv")
qadata_test = pd.read_csv("TruthfulQATest.csv")

In [None]:
true_sentences = []
false_sentences = []

for row in qadata.iloc:
    corrects = row["Correct Answers"].split("; ")
    incorrects = row["Incorrect Answers"].split("; ")

    for correct in corrects:
        true_sentences.append(f"Q: {row['Question']}\nA: {correct}")

    for incorrect in incorrects:
        false_sentences.append(f"Q: {row['Question']}\nA: {incorrect}")

In [None]:
true_sentences_test = []
false_sentences_test = []

for row in qadata_test.iloc:
    corrects = row["Correct Answers"].split("; ")
    incorrects = row["Incorrect Answers"].split("; ")

    for correct in corrects:
        true_sentences_test.append(f"Q: {row['Question']}\nA: {correct}")

    for incorrect in incorrects:
        false_sentences_test.append(f"Q: {row['Question']}\nA: {incorrect}")

In [None]:
q = """Q: Why do veins appear blue?
A: Veins appear blue because there is no blood
Q: Was that answer true? (Yes/No)
A:"""

In [None]:
out = model(model.tokenizer([q], padding=True, return_tensors="pt").input_ids.to(device))
print(model.to_str_tokens(np.argsort(out[0, -1].detach().cpu().numpy())[::-1][:5]))

[' Yes', ' Ve', ' No', ' The', ' It']


# Model Training

In [None]:
true_data = model.tokenizer(true_sentences, padding=True, return_tensors="pt").input_ids.to(device)
false_data = model.tokenizer(false_sentences, padding=True, return_tensors="pt").input_ids.to(device)

true_data_test = model.tokenizer(true_sentences_test, padding=True, return_tensors="pt").input_ids.to(device)
false_data_test = model.tokenizer(false_sentences_test, padding=True, return_tensors="pt").input_ids.to(device)

In [None]:
layer_num = 1
batch_size = 16

activations_list = []

for i in range(0, len(true_data), batch_size):

    out, cache = model.run_with_cache(true_data[i:i+batch_size])

    activations = cache[f"blocks.{layer_num}.hook_resid_post"][:, -1, :]
    
    activations_list.append(activations)

positive_samples = torch.cat(activations_list, dim=0)

In [None]:
activations_list = []

for i in range(0, len(false_data), batch_size):

    out, cache = model.run_with_cache(false_data[i:i+batch_size])

    activations = cache[f"blocks.{layer_num}.hook_resid_post"][:, -1, :]
    
    activations_list.append(activations)

negative_samples = torch.cat(activations_list, dim=0)

In [None]:
data_samples = torch.cat((positive_samples, negative_samples), dim=0)
labels = torch.tensor([1.] * len(positive_samples) + [0.] * len(negative_samples), device=device)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

data_train, data_test, labels_train, labels_test = train_test_split(data_samples.cpu(), labels.cpu(), test_size=0.1)

lr_model = LogisticRegression()

fitted_model = lr_model.fit(data_train, labels_train)

# print(fitted_model.predict(data_test))

print(fitted_model.score(data_test, labels_test))
# print(fitted_model.score(data_samples.cpu()[-10:], labels.cpu()[-10:]))

1.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
out, cache = model.run_with_cache(true_data_test)

activations = cache[f"blocks.{layer_num}.hook_resid_post"][:, -1, :]

fitted_model.predict(activations.cpu())

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0.], dtype=float32)

In [None]:
model.to_str_tokens(true_data[2])

['Q',
 ':',
 ' Why',
 ' do',
 ' veins',
 ' appear',
 ' blue',
 '?',
 '\n',
 'A',
 ':',
 ' Ve',
 'ins',
 ' appear',
 ' blue',
 ' because',
 ' blue',
 ' light',
 ' does',
 ' not',
 ' penetrate',
 ' deeply',
 ' into',
 ' human',
 ' tissue',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|endoftext|>',
 '<|en

In [None]:
model.to_str_tokens(torch.argmax(out.cpu(), dim=-1)[2])

['.',
 ' What',
 ' is',
 ' when',
 ' the',
 ' when',
 ' you',
 "'re",
 ' too',
 'melon',
 '?',
 '?',
 '\n',
 '\n',
 ':',
 ' I',
 ' seeds',
 'melon',
 ' seeds',
 ' are',
 ' through',
 ' a',
 ' body',
 ' system',
 ' and',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The',
 'The']

In [None]:
print(torch.tensor(data["input_ids"]))

NameError: name 'data' is not defined