In [1]:
# Turn on IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

import torch
import matplotlib.pyplot as plt
from src import models, data
from tqdm.auto import tqdm
import json
import os
import numpy as np
import copy


In [3]:
# Load the model named "gptj" with fp16=True on the "cuda" device.
# `mt` is what later cells pass to the estimator and to
# `functional.predict_next_token`.
# NOTE(review): presumably `mt` bundles the model with its tokenizer — confirm in src.models.
mt = models.load_model("gptj", fp16=True, device="cuda")

In [4]:
# Load the full dataset, keep only the relations named "task done by tool",
# and work with the first match.
dataset = data.load_dataset()
tool_relations = dataset.filter(relation_names=["task done by tool"])
relation = tool_relations[0]

In [6]:
# Split the relation into two parts, bound to `train` and `test`.
# NOTE(review): the argument 5 is presumably the number of training samples — confirm against `split`'s signature.
train, test = relation.split(5)

In [66]:
from src.sweeps import (
    SweepRelationResults,
    SweepTrialResults,
    SweepLayerResults,
    SweepTrainResults,
    SweepBetaResults, SweepRankResults, EfficacyTestPair
)
from src.data import RelationSample

def parse_results(sweep_result):
    """Rebuild the typed sweep-result hierarchy from its plain-dict form.

    Args:
        sweep_result: mapping with the keys ``"relation_name"`` and
            ``"trials"``. Each trial carries ``"prompt_template"``,
            ``"train_samples"``, ``"n_test_samples"`` and ``"layers"``; each
            layer carries ``"layer"`` and a ``"result"`` mapping holding
            ``"samples"``, ``"jh_norm"``, ``"betas"`` and ``"ranks"``.

    Returns:
        A ``SweepRelationResults`` whose ``trials`` list contains one
        ``SweepTrialResults`` per input trial, each populated down to the
        per-beta and per-rank entries.
    """

    def _to_samples(raw_samples):
        # Every serialized sample goes through RelationSample.from_dict.
        return [RelationSample.from_dict(entry) for entry in raw_samples]

    def _to_beta(raw_beta):
        return SweepBetaResults(
            beta=raw_beta["beta"],
            recall=raw_beta["recall"],
            faithfulness_successes=_to_samples(raw_beta["faithfulness_successes"]),
        )

    def _to_rank(raw_rank):
        return SweepRankResults(
            rank=raw_rank["rank"],
            efficacy=raw_rank["efficacy"],
            efficacy_successes=[
                EfficacyTestPair(
                    source=RelationSample.from_dict(pair["source"]),
                    target=RelationSample.from_dict(pair["target"]),
                )
                for pair in raw_rank["efficacy_successes"]
            ],
        )

    def _to_layer(raw_layer):
        payload = raw_layer["result"]
        # Create the container with empty beta/rank lists first, then fill
        # them through the attributes.
        trained = SweepTrainResults(
            samples=_to_samples(payload["samples"]),
            betas=[],
            ranks=[],
            jh_norm=payload["jh_norm"],
        )
        trained.betas.extend(_to_beta(raw_beta) for raw_beta in payload["betas"])
        trained.ranks.extend(_to_rank(raw_rank) for raw_rank in payload["ranks"])
        return SweepLayerResults(layer=raw_layer["layer"], result=trained)

    def _to_trial(raw_trial):
        parsed_trial = SweepTrialResults(
            prompt_template=raw_trial["prompt_template"],
            train_samples=_to_samples(raw_trial["train_samples"]),
            layers=[],
            n_test_samples=raw_trial["n_test_samples"],
        )
        parsed_trial.layers.extend(
            _to_layer(raw_layer) for raw_layer in raw_trial["layers"]
        )
        return parsed_trial

    parsed_relation = SweepRelationResults(
        relation_name=sweep_result["relation_name"],
        trials=[],
    )
    parsed_relation.trials.extend(
        _to_trial(raw_trial) for raw_trial in sweep_result["trials"]
    )
    return parsed_relation

In [67]:
# Location of the sweep output for this relation. The results root can be
# overridden through the RELATIONS_RESULTS_DIR environment variable so the
# notebook is not tied to one user's home directory; the fallback is the
# original hard-coded location.
results_root = os.environ.get(
    "RELATIONS_RESULTS_DIR", "/home/local_arnab/Codes/relations/results"
)
path = os.path.join(
    results_root, "sweep-test", "gptj", "task done by tool", "task_done_by_tool.json"
)
# Read the file as UTF-8 explicitly instead of relying on the locale default.
with open(path, "r", encoding="utf-8") as f:
    sweep_result = json.load(f)

# Convert the raw dict into the typed result objects used by the cells below.
relation_results = parse_results(sweep_result)

In [68]:
# List the attribute names stored on the first layer's result of the first trial.
first_layer_result = relation_results.trials[0].layers[0].result
vars(first_layer_result).keys()

dict_keys(['samples', 'betas', 'ranks', 'jh_norm'])

In [69]:
# Inspect the last beta entry recorded for the first layer of the first trial:
# print its recall values, then display the samples listed as faithfulness successes.
beta_1 = relation_results.trials[0].layers[0].result.betas[-1]
print(beta_1.recall)
beta_1.faithfulness_successes

[0.21875, 0.21875, 0.25]


[RelationSample(subject='drawing', object='pencil and sketchbook'),
 RelationSample(subject='drying clothes', object='clothesline'),
 RelationSample(subject='hitting nails', object='hammer'),
 RelationSample(subject='playing sports', object='ball'),
 RelationSample(subject='polishing shoes', object='shoe polish'),
 RelationSample(subject='sewing', object='needle and thread'),
 RelationSample(subject='sweeping floors', object='broom')]

In [70]:
# Take the last rank entry for the same layer and display its efficacy
# successes (source/target pairs). `rank_result` is reused further down to
# pick an edit pair and to supply the editor's rank.
rank_result = relation_results.trials[0].layers[0].result.ranks[-1]
rank_result.efficacy_successes

[EfficacyTestPair(source=RelationSample(subject='birdwatching', object='binoculars'), target=RelationSample(subject='cooking', object='stove')),
 EfficacyTestPair(source=RelationSample(subject='boating', object='boat'), target=RelationSample(subject='sweeping floors', object='broom')),
 EfficacyTestPair(source=RelationSample(subject='cooking', object='stove'), target=RelationSample(subject='drying clothes', object='clothesline')),
 EfficacyTestPair(source=RelationSample(subject='dancing', object='music'), target=RelationSample(subject='taking photographs', object='camera')),
 EfficacyTestPair(source=RelationSample(subject='doing makeup', object='makeup brushes'), target=RelationSample(subject='boating', object='boat')),
 EfficacyTestPair(source=RelationSample(subject='drying clothes', object='clothesline'), target=RelationSample(subject='taking photographs', object='camera')),
 EfficacyTestPair(source=RelationSample(subject='hitting nails', object='hammer'), target=RelationSample(subje

In [85]:
from src import functional, operators, editors

In [72]:
# Rebuild the operator for the first sweep trial: reuse that trial's prompt
# template and training samples, and estimate at the layer recorded in the
# trial's first layer entry.
first_trial = relation_results.trials[0]
prompt_template = first_trial.prompt_template
train_samples = first_trial.train_samples

estimator = operators.JacobianIclMeanEstimator(
    mt=mt,
    h_layer=first_trial.layers[0].layer,
)

# The estimator is called on a copy of the relation restricted to this
# trial's samples and single prompt template.
icl_relation = relation.set(
    samples=train_samples,
    prompt_templates=[prompt_template],
)
operator = estimator(icl_relation)

In [78]:
# Apply the estimated operator to a single subject and display its output
# (predicted tokens together with the h and z tensors).
subject = "cooking"
operator(subject=subject)

LinearRelationOutput(predictions=[PredictedToken(token=' kitchen', prob=0.11502338945865631), PredictedToken(token=' a', prob=0.05874820426106453), PredictedToken(token=' cooking', prob=0.037342626601457596), PredictedToken(token=' dish', prob=0.036193713545799255), PredictedToken(token=' food', prob=0.02753472700715065)], h=tensor([[ 1.0977,  1.8662,  1.1016,  ..., -0.1838,  0.8428, -0.9453]],
       device='cuda:0', dtype=torch.float16), z=tensor([[-3.3438,  2.2910, -0.1855,  ..., -1.3213,  0.7910,  0.9106]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>))

In [79]:
# For comparison, ask the model itself for next-token predictions on the
# operator's prompt template filled in with the same subject.
functional.predict_next_token(
    mt = mt,
    prompt = operator.prompt_template.format(subject)
)

[[PredictedToken(token=' stove', prob=0.21862821280956268),
  PredictedToken(token=' pan', prob=0.14115698635578156),
  PredictedToken(token=' pots', prob=0.11886609345674515),
  PredictedToken(token=' pot', prob=0.10167156904935837),
  PredictedToken(token=' kitchen', prob=0.06166691705584526)]]

In [83]:
# Pick the first recorded efficacy success and unpack its two samples;
# `source` and `target` are used by the editor call below.
efficacy_test_pair = rank_result.efficacy_successes[0]
source = efficacy_test_pair.source
target = efficacy_test_pair.target

In [88]:
# Compute the SVD of the operator's weight (cast with .float() first) and hand
# it to a LowRankPInvEditor built on the same operator, using the rank stored
# in `rank_result`.
# NOTE(review): torch.svd is marked deprecated in the PyTorch docs in favour of
# torch.linalg.svd, which returns Vh rather than V — confirm what
# LowRankPInvEditor expects in `svd` before migrating.
svd = torch.svd(operator.weight.float())
editor = editors.LowRankPInvEditor(
    lre = operator,
    rank = rank_result.rank,
    n_samples=1, n_new_tokens=1,
    svd = svd
)

In [90]:
# Run the edit for the chosen pair: the source sample's subject is passed as
# `subject` and the target sample's subject as `target`; the result is
# displayed as the cell output.
editor(
    subject = source.subject,
    target = target.subject,
)

LinearRelationEditResult(predicted_tokens=[PredictedToken(token=' stove', prob=0.1845376342535019), PredictedToken(token=' oven', prob=0.1288280338048935), PredictedToken(token=' pan', prob=0.1288280338048935), PredictedToken(token=' pot', prob=0.1051463708281517), PredictedToken(token=' pots', prob=0.08993646502494812), PredictedToken(token=' frying', prob=0.05124425143003464), PredictedToken(token=' kitchen', prob=0.049667634069919586), PredictedToken(token=' food', prob=0.032067831605672836), PredictedToken(token=' cooking', prob=0.030839335173368454), PredictedToken(token=' cook', prob=0.021361783146858215)], model_logits=tensor([-inf, -inf, -inf,  ..., -inf, -inf, -inf], device='cuda:0'), model_generations=[' playing soccer : soccer ball\n painting walls : paintbrush\n knitting : yarn\n washing dishes : sponge\n writing : pen and paper\n birdwatching : stove'])