In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

import torch
import matplotlib.pyplot as plt
from src import models, data
from tqdm.auto import tqdm
import json
import os
import numpy as np
import copy


In [3]:
mt = models.load_model("gptj", fp16=True, device="cuda")

In [4]:
dataset = data.load_dataset()
relation = dataset.filter(
    relation_names = ["task done by tool"]
)[0]

In [5]:
train, test = relation.split(5)

In [6]:
from src.sweeps import (
    SweepRelationResults,
    SweepTrialResults,
    SweepLayerResults,
    SweepTrainResults,
    SweepBetaResults, SweepRankResults, EfficacyTestPair
)
from src.data import RelationSample

def parse_results(sweep_result):
    relation_results = SweepRelationResults(
        relation_name=sweep_result["relation_name"],
        trials = []
    )

    for trial in sweep_result["trials"]:
        trial_results = SweepTrialResults(
            prompt_template=trial["prompt_template"],
            train_samples=[RelationSample.from_dict(s) for s in trial["train_samples"]],
            layers = [],
            n_test_samples=trial["n_test_samples"],
        )
        for layer in trial["layers"]:
            train_results = SweepTrainResults(
                samples = [RelationSample.from_dict(s) for s in layer["result"]["samples"]],
                betas = [],
                ranks = [],
                jh_norm=layer["result"]["jh_norm"],
            )
            for beta in layer["result"]["betas"]:
                beta_results = SweepBetaResults(
                    beta = beta["beta"],
                    recall = beta["recall"],
                    faithfulness_successes=[
                        RelationSample.from_dict(s) for s in beta["faithfulness_successes"]
                    ]
                )
                train_results.betas.append(beta_results)

            for rank in layer["result"]["ranks"]:
                rank_results = SweepRankResults(
                    rank = rank["rank"],
                    efficacy = rank["efficacy"],
                    efficacy_successes=[
                        EfficacyTestPair(
                            source=RelationSample.from_dict(s["source"]),
                            target=RelationSample.from_dict(s["target"]),
                        )
                        for s in rank["efficacy_successes"]
                    ]
                )
                train_results.ranks.append(rank_results)
            
            layer_results = SweepLayerResults(
                layer=layer["layer"],
                result=train_results
            )

            trial_results.layers.append(layer_results)
        relation_results.trials.append(trial_results)
    return relation_results

In [7]:
path = "../results/sweep/gptj/task_person_type/task_person_type.json"
with open(path, "r") as f:
    sweep_result = json.load(f)

relation_results = parse_results(sweep_result)

In [8]:
trial_options = list(range(len(relation_results.trials)))
print(f"{trial_options=}")

layer_options = [layer.layer for layer in relation_results.trials[0].layers]
print(f"{layer_options=}")

rank_options = [rank.rank for rank in relation_results.trials[0].layers[0].result.ranks]
print(f"{rank_options=}")

trial_options=[0, 1, 2]
layer_options=['emb', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
rank_options=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240]


In [10]:
relation_results.best_by_efficacy().__dict__

{'layer': 10,
 'beta': AggregateMetric(mean=0.350000003973643, stdev=0.1080123431340252, stderr=0.062360955384231684, values=[0.30000001192092896, 0.25, 0.5]),
 'recall': AggregateMetric(mean=0.5380690287811031, stdev=0.05643727737962378, stderr=0.03258407728745537, values=[0.47368421052631576, 0.5294117647058824, 0.6111111111111112]),
 'rank': AggregateMetric(mean=143.33333333333334, stdev=69.44222218666553, stderr=40.09248567259715, values=[230, 60, 140]),
 'efficacy': AggregateMetric(mean=0.8515078546038298, stdev=0.027498749752319394, stderr=0.01587641057187976, values=[0.8421052631578947, 0.8235294117647058, 0.8888888888888888])}

In [25]:
#########################################################################################################
TRIAL_NO = 1
RANK = 200
LAYER = 10
#########################################################################################################

In [26]:
layer_result = [layer for layer in relation_results.trials[TRIAL_NO].layers if layer.layer == LAYER][0]
rank_result = [rank for rank in layer_result.result.ranks if rank.rank == RANK][0]
rank_result

SweepRankResults(rank=200, efficacy=0.8235294117647058, efficacy_successes=[EfficacyTestPair(source=RelationSample(subject='baking cakes', object='baker'), target=RelationSample(subject='providing legal advice', object='lawyer')), EfficacyTestPair(source=RelationSample(subject='cooking meals', object='chef'), target=RelationSample(subject='making clothes', object='fashion designer')), EfficacyTestPair(source=RelationSample(subject='designing buildings', object='architect'), target=RelationSample(subject='translating books', object='translator')), EfficacyTestPair(source=RelationSample(subject='directing movies', object='director'), target=RelationSample(subject='teaching students', object='teacher')), EfficacyTestPair(source=RelationSample(subject='farming', object='farmer'), target=RelationSample(subject='managing hotels', object='hotel manager')), EfficacyTestPair(source=RelationSample(subject='investigating crimes', object='detective'), target=RelationSample(subject='making clothes'

In [27]:
efficacy_successes = {s.target.subject : s for s in rank_result.efficacy_successes}

for beta_result in layer_result.result.betas:
    faithfulness_successes = beta_result.faithfulness_successes
    for sample in faithfulness_successes:
        if(sample.subject in efficacy_successes):
            print(f"Edit: {efficacy_successes[sample.subject].source} <to> {efficacy_successes[sample.subject].target} -- found in beta: {beta_result.beta}")
            efficacy_successes.pop(sample.subject)
        

Edit: performing surgeries -> surgeon <to> managing hotels -> hotel manager -- found in beta: 0.0
Edit: directing movies -> director <to> teaching students -> teacher -- found in beta: 0.0
Edit: baking cakes -> baker <to> providing legal advice -> lawyer -- found in beta: 0.10000000149011612
Edit: treating animals -> veterinarian <to> flying airplanes -> pilot -- found in beta: 0.20000000298023224
Edit: managing hotels -> hotel manager <to> investigating crimes -> detective -- found in beta: 0.20000000298023224
Edit: investigating crimes -> detective <to> making clothes -> fashion designer -- found in beta: 0.20000000298023224
Edit: selling houses -> real estate agent <to> treating animals -> veterinarian -- found in beta: 0.6000000238418579
Edit: teaching students -> teacher <to> reporting news -> journalist -- found in beta: 0.699999988079071


In [28]:
print("No target match found in faithfulness successes for the following:")
for sample in efficacy_successes.values():
    print(f"Edit: {sample.source} <to> {sample.target}")

No target match found in faithfulness successes for the following:
Edit: designing buildings -> architect <to> translating books -> translator
Edit: reporting news -> journalist <to> selling houses -> real estate agent


In [29]:
train_samples = layer_result.result.samples
train_samples

[RelationSample(subject='researching history', object='historian'),
 RelationSample(subject='exploring space', object='astronaut'),
 RelationSample(subject='repairing computers', object='technician'),
 RelationSample(subject='driving trucks', object='truck driver'),
 RelationSample(subject='conducting an orchestra', object='conductor')]

In [30]:
prompt_template = relation_results.trials[TRIAL_NO].prompt_template
prompt_template

' {} :'

In [31]:
from src import functional, operators, editors

In [32]:
estimator = operators.JacobianIclMeanEstimator(
    mt = mt,
    h_layer = LAYER,
)

operator = estimator(
    relation.set(
        samples = train_samples,
        prompt_templates = [prompt_template],
    )
)

In [33]:
############################################
subject = "translating books"
############################################

In [34]:
functional.predict_next_token(
    mt = mt,
    prompt = operator.prompt_template.format(subject)
)

[[PredictedToken(token=' translator', prob=0.3305882215499878),
  PredictedToken(token=' l', prob=0.24186350405216217),
  PredictedToken(token=' editor', prob=0.07610572874546051),
  PredictedToken(token=' interpreter', prob=0.051495712250471115),
  PredictedToken(token=' book', prob=0.05069734901189804)]]

In [35]:
operator(subject=subject)

LinearRelationOutput(predictions=[PredictedToken(token=' l', prob=0.5602270364761353), PredictedToken(token=' professional', prob=0.03782724589109421), PredictedToken(token=' professor', prob=0.021722357720136642), PredictedToken(token='\n', prob=0.02138558402657509), PredictedToken(token=' engineer', prob=0.020566314458847046)], h=tensor([[ 2.1621,  0.3286, -0.2878,  ..., -0.0443,  0.5449, -0.5127]],
       device='cuda:0', dtype=torch.float16), z=tensor([[-1.2969, -3.3984, -5.6367,  ..., -3.6172,  0.0963,  3.2578]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>))

In [36]:
svd = torch.svd(operator.weight.float())
editor = editors.LowRankPInvEditor(
    lre = operator,
    rank = rank_result.rank,
    n_samples=1, n_new_tokens=1,
    svd = svd
)

In [37]:
efficacy_test_pair = efficacy_successes[subject]
f"Editing: {efficacy_test_pair.source} <to> {efficacy_test_pair.target}"

'Editing: designing buildings -> architect <to> translating books -> translator'

In [38]:
editor(
    subject = efficacy_test_pair.source.subject,
    target = efficacy_test_pair.target.subject,
)

LinearRelationEditResult(predicted_tokens=[PredictedToken(token=' translator', prob=0.31065207719802856), PredictedToken(token=' editor', prob=0.15378393232822418), PredictedToken(token=' l', prob=0.14222688972949982), PredictedToken(token=' teacher', prob=0.05928898975253105), PredictedToken(token=' publisher', prob=0.05071255937218666), PredictedToken(token=' proof', prob=0.031243102625012398), PredictedToken(token=' language', prob=0.021811198443174362), PredictedToken(token=' typ', prob=0.018802430480718613), PredictedToken(token=' book', prob=0.01711980625987053), PredictedToken(token=' lingu', prob=0.01711980625987053)], model_logits=tensor([-inf, -inf, -inf,  ..., -inf, -inf, -inf], device='cuda:0'), model_generations=[' researching history : historian\n exploring space : astronaut\n repairing computers : technician\n driving trucks : truck driver\n conducting an orchestra : conductor\n designing buildings : translator'])