In [None]:
# Set the WORK_DIR environment variable for this kernel process.
# NOTE(review): this is an absolute path on one developer's machine; change it before running elsewhere.
# Presumably read by the repepo steering code to locate experiment outputs; confirm against the library.
%env WORK_DIR = /home/daniel/ml_workspace/repepo/experiments

We're interested in:
- The steerability of individual examples
- The steering vectors extracted from individual examples

In [None]:
from repepo.steering.run_experiment import run_experiment

In [None]:
import itertools
from repepo.steering.utils.helpers import SteeringConfig, EmptyTorchCUDACache

# Experiment grid. Each split string has the form "<i>:+1" for i in 0..2.
# NOTE(review): presumably "<i>:+1" selects the single example at index i; confirm against repepo's split syntax.
splits = [f"{start}:+1" for start in range(3)]
dataset = "power-seeking-inclination"
layer = 13
multipliers = [-1, 0, 1]


def iter_config():
    """Yield one SteeringConfig per (train_split, test_split, multiplier) combination.

    The grid is the Cartesian product ``splits x splits x multipliers`` with the
    multiplier varying fastest. The train and test dataset are the same, and
    every other setting is identical across configurations.
    """
    # Settings that do not vary across the grid.
    shared_kwargs = dict(
        train_dataset=dataset,
        formatter="llama-chat-formatter",
        layer=layer,
        test_dataset=dataset,
        test_completion_template="{prompt} My answer is: {response}",
        patch_generation_tokens_only=True,
        skip_first_n_generation_tokens=1,
    )
    grid = itertools.product(splits, splits, multipliers)
    for split_for_train, split_for_test, scale in grid:
        yield SteeringConfig(
            train_split=split_for_train,
            test_split=split_for_test,
            multiplier=scale,
            **shared_kwargs,
        )

In [None]:
# Toggle: True re-runs every experiment (force_rerun=True); False fetches results
# via load_eval_result (presumably saved by an earlier run; confirm).
RUN = False

# One (config, result) pair per configuration, in iter_config() order.
results = []
if not RUN:
    from repepo.steering.utils.helpers import load_eval_result

    for config in iter_config():
        # Results are looked up by the config's eval_hash.
        result = load_eval_result(config.eval_hash)
        results.append((config, result))
else:
    for config in iter_config():
        # Each experiment runs inside the EmptyTorchCUDACache context manager.
        with EmptyTorchCUDACache():
            result = run_experiment(config, force_rerun=True, logging_level="INFO")
            results.append((config, result))

Questions to answer: 
- What's the steerability of individual examples? 
- What's the steering efficiency of steering vectors (SVs) extracted from individual examples?

In [None]:
# Flatten the (config, result) pairs into a table with one row per experiment.

import pandas as pd


def _result_to_row(config, result):
    """Build the summary record for a single (config, result) pair."""
    # Only the first prediction is inspected; its positive/negative texts
    # are stored as the test example for this row.
    first_prediction = result.predictions[0]
    return {
        "test_positive_example": first_prediction.positive_output_prob.text,
        "test_negative_example": first_prediction.negative_output_prob.text,
        "train_split": config.train_split,
        "test_split": config.test_split,
        "layer": config.layer,
        "multiplier": config.multiplier,
        "mean_logit_diff": result.metrics['mean_logit_diff'],
    }


rows = [_result_to_row(config, result) for config, result in results]

df = pd.DataFrame(rows)
print(len(df))
df.head()

In [None]:
# How well does a single steering vector transfer across different test splits?
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the runs whose steering vector was trained on split "0:+1".
data_df = df.loc[df["train_split"] == "0:+1"].copy()

# One line per test split on a shared set of axes:
# mean logit diff as a function of the steering multiplier.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots()
sns.lineplot(data=data_df, x="multiplier", y="mean_logit_diff", hue="test_split", ax=ax)
ax.set_title("Steering different test examples with a single steering vector")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Keep only the runs evaluated on test split "0:+1", so the train split is what varies.
data_df = df.loc[df["test_split"] == "0:+1"].copy()

# One line per train split on a shared set of axes:
# mean logit diff as a function of the steering multiplier.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots()
sns.lineplot(data=data_df, x="multiplier", y="mean_logit_diff", hue="train_split", ax=ax)
ax.set_title("Effect of train split on steering the same test split")

In [None]:
# Rough steering-efficiency estimate: (mean logit diff at multiplier +1) minus
# (mean logit diff at multiplier -1), one value per (train_split, test_split, layer).
grouping_columns = ["train_split", "test_split", "layer"]
data_df = df.copy()
group_df = data_df.groupby(grouping_columns)

def compute_signed_difference(row):
    """Return the steering efficiency for one group of results.

    Despite its name, ``row`` is a DataFrame (one groupby group) with
    ``multiplier`` and ``mean_logit_diff`` columns. It must contain exactly one
    entry with multiplier 1 and exactly one with multiplier -1; otherwise
    ``.item()`` raises ``ValueError``.

    Returns a one-element Series keyed ``steering_efficiency`` holding the
    logit diff at +1 minus the logit diff at -1.
    """

    def _logit_diff_at(target_multiplier):
        # Select the single measurement taken at the requested multiplier.
        selected = row.loc[row["multiplier"] == target_multiplier, "mean_logit_diff"]
        return selected.item()

    efficiency = _logit_diff_at(1) - _logit_diff_at(-1)
    return pd.Series({'steering_efficiency': efficiency})

# Compute one steering-efficiency value per group, then turn the group keys
# back into ordinary columns.
merge_keys = ["train_split", "test_split", "layer"]
efficiency_df = group_df.apply(compute_signed_difference).reset_index()

# Re-attach the positive/negative example texts, which the aggregation dropped.
example_columns = ["test_positive_example", "test_negative_example"]
metadata_df = df[merge_keys + example_columns].drop_duplicates()
data_df = efficiency_df.merge(metadata_df, on=merge_keys)

print(len(data_df))
data_df.head()

In [None]:

# Build a train-split x test-split matrix of steering efficiency and draw it as a heatmap.
# Rows (Y axis) are train splits, columns (X axis) are test splits,
# and each cell value is the steering efficiency.
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style="whitegrid")
fig, heatmap_axes = plt.subplots()
plot_df = data_df.pivot(index="train_split", columns="test_split", values="steering_efficiency")
ax = sns.heatmap(plot_df, annot=True, cmap="YlGnBu", ax=heatmap_axes)

In [None]:
# Show the full positive/negative example text for each test split.
# max_colwidth=None stops pandas from truncating long strings in the display.
pd.set_option('display.max_colwidth', None)
example_view_columns = ['test_split', 'test_positive_example', 'test_negative_example']
data_df.loc[:, example_view_columns].drop_duplicates()