In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('..')

import torch
import matplotlib.pyplot as plt
from src import models, data
from tqdm.auto import tqdm
import json
import os
import numpy as np
import copy


In [3]:
# Load the "gptj" model once, in fp16 on the CUDA device; later cells pass
# this handle (`mt`) to the estimator and to the prediction helper.
mt = models.load_model(
    "gptj",
    fp16=True,
    device="cuda",
)

In [4]:
#####################################
# Name of the relation under study. It is used below both to filter the
# dataset and to select the matching entry of the stored sweep results.
relation_name = "plays pro sport"
#####################################

In [5]:
# Load the dataset, narrow it to the relation named above, and keep the
# first match (presumably the only one for a unique relation name).
dataset = data.load_dataset()
matching_relations = dataset.filter(relation_names=[relation_name])
relation = matching_relations[0]

In [6]:
from src.utils.sweep_utils import read_sweep_results, relation_from_dict

In [7]:
# Read the stored sweep results, restricted to the relation under study.
# NOTE(review): the effect of `economy=False` is defined in sweep_utils and
# is not visible here — confirm before changing it.
sweep_results_dir = "../results/sweep-24-trials/gptj"
sweep_dict = read_sweep_results(
    sweep_results_dir,
    relation_names=[relation_name],
    economy=False,
)

In [8]:
# Convert this relation's sweep entry into a result object; the cells below
# read its `.trials` and call its `best_by_efficacy(...)`.
# Raises KeyError if the sweep results contain no entry for `relation_name`.
relation_result = relation_from_dict(sweep_dict[relation_name])

In [9]:
# Print the values available for each knob of the sweep. Layer and rank
# options are read from the first trial (and its first layer).
trial_options = list(range(len(relation_result.trials)))
print(f"{trial_options=}")

first_trial = relation_result.trials[0]

layer_options = [entry.layer for entry in first_trial.layers]
print(f"{layer_options=}")

rank_options = [entry.rank for entry in first_trial.layers[0].result.ranks]
print(f"{rank_options=}")

trial_options=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
layer_options=['emb', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]
rank_options=[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312]


In [10]:
# Display the attributes of the best-by-efficacy summary (queried with
# beta = 2.25) as a plain dict.
vars(relation_result.best_by_efficacy(beta=2.25))

{'layer': 6,
 'beta': AggregateMetric(mean=2.25, stdev=0.0, stderr=0.0, values=[2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25, 2.25]),
 'recall': AggregateMetric(mean=0.7602235527293386, stdev=0.06316186564923755, stderr=0.012892861836904712, values=[0.7722007722007722, 0.7983539094650206, 0.6778242677824268, 0.8032786885245902, 0.6415929203539823, 0.7647058823529411, 0.7239819004524887, 0.7490636704119851, 0.7208333333333333, 0.7176470588235294, 0.6333333333333333, 0.8167330677290837, 0.7833333333333333, 0.8, 0.788, 0.8016877637130801, 0.7551867219917012, 0.8529411764705882, 0.8695652173913043, 0.6388888888888888, 0.72265625, 0.808695652173913, 0.8089430894308943, 0.7959183673469388]),
 'rank': AggregateMetric(mean=117.66666666666667, stdev=66.19080164359865, stderr=13.511140807715902, values=[72, 112, 72, 216, 216, 112, 112, 224, 208, 80, 136, 72, 144, 256, 64, 72, 48, 152, 32, 72, 192, 56, 48,

In [11]:
#########################################################################################################
# Which sweep entry to inspect. The lookup cell below selects the trial by
# index and matches layer and rank by equality, so each value has to be one
# of the printed trial_options / rank_options / layer_options.
TRIAL_NO = 7
RANK = 120
LAYER = 6
#########################################################################################################

In [12]:
# Select the sweep entry for the configured trial, layer and rank.
#
# `next(...)` stops at the first match instead of building a full list, and a
# missing layer/rank is reported with a message naming the offending value
# rather than a bare "list index out of range". The exception type stays
# IndexError, matching what the previous `[...][0]` lookup raised.
layer_result = next(
    (layer for layer in relation_result.trials[TRIAL_NO].layers if layer.layer == LAYER),
    None,
)
if layer_result is None:
    raise IndexError(f"no sweep result for layer {LAYER!r} in trial {TRIAL_NO}")

rank_result = next(
    (rank for rank in layer_result.result.ranks if rank.rank == RANK),
    None,
)
if rank_result is None:
    raise IndexError(
        f"no sweep result for rank {RANK!r} at layer {LAYER!r} in trial {TRIAL_NO}"
    )

# Last expression: show the selected rank-level result.
rank_result

SweepRankResults(rank=120, efficacy=[0.947565543071161, 0.9962546816479401, 1.0], efficacy_successes=[EfficacyTestPair(source=RelationSample(subject='Aleksandar Đorđević', object='basketball'), target=RelationSample(subject='Lutz Pfannenstiel', object='soccer')), EfficacyTestPair(source=RelationSample(subject='Alessandro Nesta', object='soccer'), target=RelationSample(subject='Russell Wilson', object='football')), EfficacyTestPair(source=RelationSample(subject='Alex Karras', object='football'), target=RelationSample(subject='Frank Robinson', object='baseball')), EfficacyTestPair(source=RelationSample(subject='Alex Rodriguez', object='baseball'), target=RelationSample(subject='Evgeni Malkin', object='hockey')), EfficacyTestPair(source=RelationSample(subject='Alexander Ovechkin', object='hockey'), target=RelationSample(subject='Russell Wilson', object='football')), EfficacyTestPair(source=RelationSample(subject='Alexei Kovalev', object='hockey'), target=RelationSample(subject='Emanuel Po

In [13]:
# Index the successful edits by the *target* subject. Because this is a dict,
# two pairs sharing a target collapse into one entry (the later pair wins).
efficacy_successes = {
    pair.target.subject: pair for pair in rank_result.efficacy_successes
}

# Walk every beta's faithfulness successes; the first time a subject matches a
# remaining target, report the edit and remove it so it is printed only once.
# What is left in `efficacy_successes` afterwards are the edits whose target
# never appears among the faithfulness successes.
for beta_result in layer_result.result.betas:
    for sample in beta_result.faithfulness_successes:
        matched_pair = efficacy_successes.pop(sample.subject, None)
        if matched_pair is None:
            continue
        print(f"Edit: {matched_pair.source} <to> {matched_pair.target} -- found in beta: {beta_result.beta}")
        

Edit: Mikaël Silvestre -> soccer <to> Ara Parseghian -> football -- found in beta: 0.0
Edit: Magic Johnson -> basketball <to> Bart Starr -> football -- found in beta: 0.0
Edit: Ted Williams -> baseball <to> Brian Bosworth -> football -- found in beta: 0.0
Edit: Hristo Stoichkov -> soccer <to> Bronko Nagurski -> football -- found in beta: 0.0
Edit: Pavel Datsyuk -> hockey <to> Bubba Smith -> football -- found in beta: 0.0
Edit: Steve Nash -> basketball <to> Byron White -> football -- found in beta: 0.0
Edit: Yu Darvish -> baseball <to> Cam Newton -> football -- found in beta: 0.0
Edit: Billy Sunday -> baseball <to> Dan Marino -> football -- found in beta: 0.0
Edit: Tim Duncan -> basketball <to> Don Shula -> football -- found in beta: 0.0
Edit: Kasey Keller -> soccer <to> Emmitt Smith -> football -- found in beta: 0.0
Edit: Pau Gasol -> basketball <to> Fred Williamson -> football -- found in beta: 0.0
Edit: R.A. Dickey -> baseball <to> Gale Sayers -> football -- found in beta: 0.0
Edit: 

In [14]:
# List the edits still held in `efficacy_successes`, i.e. the ones the
# preceding cross-reference did not remove.
print("No target match found in faithfulness successes for the following:")
for pair in efficacy_successes.values():
    print(f"Edit: {pair.source} <to> {pair.target}")

No target match found in faithfulness successes for the following:
Edit: Joe DiMaggio -> baseball <to> Lutz Pfannenstiel -> soccer
Edit: Peter Forsberg -> hockey <to> Júlio César -> soccer
Edit: Michael Bradley -> soccer <to> Jean Béliveau -> hockey
Edit: Chris Paul -> basketball <to> Howie Morenz -> hockey
Edit: Christy Mathewson -> baseball <to> Cuauhtémoc Blanco -> soccer
Edit: Connie Mack -> baseball <to> Andreas Ivanschitz -> soccer
Edit: Ron W. Miller -> football <to> Ken Dryden -> hockey
Edit: Pat Tillman -> football <to> Maurice Richard -> hockey
Edit: Phil Jackson -> basketball <to> Phil Esposito -> hockey
Edit: Jackie Robinson -> baseball <to> Igor Larionov -> hockey
Edit: John Olerud -> baseball <to> Patrick Roy -> hockey
Edit: Kareem Abdul-Jabbar -> basketball <to> Matteo Ferrari -> soccer
Edit: Siem de Jong -> soccer <to> Sergei Fedorov -> hockey
Edit: Moe Berg -> baseball <to> Egidio Arévalo Rios -> soccer
Edit: Woody Strode -> football <to> Mark Messier -> hockey
Edit: T

In [15]:
# Samples recorded with the selected layer's sweep result; they are handed to
# the estimator below as the relation's `samples`.
train_samples = layer_result.result.samples
# Last expression: display the samples.
train_samples

[RelationSample(subject="Ed O'Neill", object='football'),
 RelationSample(subject='Brad Friedel', object='soccer'),
 RelationSample(subject='Deion Sanders', object='baseball'),
 RelationSample(subject='Kevin Durant', object='basketball'),
 RelationSample(subject='Wayne Gretzky', object='hockey'),
 RelationSample(subject='Jesse Hibbs', object='football'),
 RelationSample(subject='Edgaras Jankauskas', object='soccer'),
 RelationSample(subject='Ty Cobb', object='baseball')]

In [16]:
# Prompt template recorded for the selected trial; the `{}` placeholder is
# filled with a subject via `.format(...)` further down.
prompt_template = relation_result.trials[TRIAL_NO].prompt_template
# Last expression: display the template.
prompt_template

'{} plays the sport of'

In [17]:
from src import functional, operators, editors

In [18]:
# Create the estimator for the chosen layer, then apply it to a copy of the
# relation restricted to the trial's samples and its single prompt template.
# The resulting `operator` is what the following cells query and edit with.
estimator = operators.JacobianIclMeanEstimator(mt=mt, h_layer=LAYER)

icl_relation = relation.set(
    samples=train_samples,
    prompt_templates=[prompt_template],
)
operator = estimator(icl_relation)

In [19]:
############################################
# Subject to examine. The lookup `efficacy_successes[subject]` further down
# requires it to be the target of one of the edits still held in that dict.
subject = "Júlio César"
############################################

In [20]:
# Baseline: ask the unedited model for its next-token prediction on the
# subject's prompt (the model predicts correctly here).
subject_prompt = operator.prompt_template.format(subject)
functional.predict_next_token(mt=mt, prompt=subject_prompt)

[[PredictedToken(token=' soccer', prob=0.6075406670570374),
  PredictedToken(token=' football', prob=0.26959455013275146),
  PredictedToken(token=' f', prob=0.03648565337061882),
  PredictedToken(token=' Soccer', prob=0.00833410955965519),
  PredictedToken(token=' jud', prob=0.004976524040102959)]]

In [21]:
# LRE fails (low faithfulness)
# Apply the operator to the same subject, requesting k = 5 predictions, so
# its output can be compared with the model's own prediction above.
operator(subject=subject, k = 5)

LinearRelationOutput(predictions=[PredictedToken(token=' football', prob=0.4927668273448944), PredictedToken(token=' basketball', prob=0.2256051003932953), PredictedToken(token=' baseball', prob=0.1305702179670334), PredictedToken(token=' soccer', prob=0.05113203078508377), PredictedToken(token='...', prob=0.009835257194936275)], h=tensor([[-0.0308, -0.8740, -0.1814,  ...,  1.9746,  0.5850, -0.1228]],
       device='cuda:0', dtype=torch.float16), z=tensor([[-2.7949, -0.2698, -0.0322,  ..., -2.7891,  0.7993, -0.4309]],
       device='cuda:0', dtype=torch.float16, grad_fn=<AddBackward0>))

In [22]:
# Decompose the operator weight after casting it to float32, and hand the
# precomputed decomposition to the low-rank editor together with the rank
# selected from the sweep.
# NOTE(review): `torch.svd` is deprecated in favour of `torch.linalg.svd`,
# which returns Vh instead of V. It is kept here because the layout the editor
# expects for `svd` is not visible in this notebook — confirm before migrating.
weight_fp32 = operator.weight.float()
svd = torch.svd(weight_fp32)
editor = editors.LowRankPInvEditor(
    lre=operator,
    rank=rank_result.rank,
    n_samples=1,
    n_new_tokens=1,
    svd=svd,
)

In [23]:
# Fetch the edit whose target is `subject` from the edits left over after the
# faithfulness cross-reference. Raises KeyError if `subject` is not a key of
# `efficacy_successes` (never a target, or already removed by that cell).
efficacy_test_pair = efficacy_successes[subject]
# Last expression: show which edit is about to be applied.
f"Editing: {efficacy_test_pair.source} <to> {efficacy_test_pair.target}"

'Editing: Peter Forsberg -> hockey <to> Júlio César -> soccer'

In [24]:
# Run the edit from the pair's source subject towards its target subject
# (editing succeeds here: high efficacy).
source_sample, target_sample = efficacy_test_pair.source, efficacy_test_pair.target
editor(subject=source_sample.subject, target=target_sample.subject)

LinearRelationEditResult(predicted_tokens=[PredictedToken(token=' soccer', prob=0.41615208983421326), PredictedToken(token=' football', prob=0.37891077995300293), PredictedToken(token=' jud', prob=0.03159268945455551), PredictedToken(token=' MMA', prob=0.01691034808754921), PredictedToken(token=' boxing', prob=0.01639007218182087)], model_logits=tensor([-inf, -inf, -inf,  ..., -inf, -inf, -inf], device='cuda:0'), model_generations=["Ed O'Neill plays the sport of football\nBrad Friedel plays the sport of soccer\nDeion Sanders plays the sport of baseball\nKevin Durant plays the sport of basketball\nWayne Gretzky plays the sport of hockey\nJesse Hibbs plays the sport of football\nEdgaras Jankauskas plays the sport of soccer\nTy Cobb plays the sport of baseball\nPeter Forsberg plays the sport of kick"])