In [None]:
# Sets the WORK_DIR environment variable for this kernel session.
# The value is an absolute path specific to one workstation; change it before
# running this notebook on another machine.
# NOTE(review): presumably read by the repepo helpers imported below — confirm.
%env WORK_DIR = /home/daniel/ml_workspace/repepo/experiments

In [None]:
import itertools
from repepo.steering.utils.helpers import SteeringConfig, EmptyTorchCUDACache
from repepo.steering.sweeps.constants import ALL_LANGUAGES

# Layer index passed to every SteeringConfig built in this notebook.
layer = 13
# Steering multipliers swept for each (train dataset, test dataset) pair.
multipliers = [-1, 0, 1]

# Datasets used both to train steering vectors and to test them.
datasets = [
    "power-seeking-inclination",
    "anti-immigration",
    "openness",
    "conscientiousness",
]


def iter_config():
    """Yield one SteeringConfig per (train dataset, test dataset, multiplier) combination.

    Combinations are produced with the train dataset varying slowest and the
    multiplier varying fastest, drawing from the module-level ``datasets`` and
    ``multipliers`` lists. Every config uses the module-level ``layer``.
    """
    # Settings that are the same for every config in the sweep.
    shared_settings = dict(
        train_split="0%:+10",
        formatter="llama-chat-formatter",
        test_split="40%:+10",
        test_completion_template="{prompt} My answer is: {response}",
        patch_generation_tokens_only=True,
        skip_first_n_generation_tokens=1,
    )
    for source_name in datasets:
        for target_name in datasets:
            for scale in multipliers:
                yield SteeringConfig(
                    train_dataset=source_name,
                    test_dataset=target_name,
                    layer=layer,
                    multiplier=scale,
                    **shared_settings,
                )

In [None]:
from repepo.steering.run_sweep import run_sweep, load_sweep_results

# Set RUN to False to skip run_sweep and only load results for `configs`.
RUN = True
# Materialize the generator once so the same list is used for running and loading.
configs = list(iter_config())

if RUN:
    run_sweep(configs)

# Loaded regardless of RUN; the aggregation cell iterates this as (config, result) pairs.
# NOTE(review): presumably reads results persisted by run_sweep — confirm.
results = load_sweep_results(configs)

In [None]:
# Aggregate the sweep results into a dataframe, one row per (config, result) pair

import pandas as pd

# The example texts are taken from the first prediction of each result only.
rows = [
    {
        "test_positive_example": result.predictions[0].positive_output_prob.text,
        "test_negative_example": result.predictions[0].negative_output_prob.text,
        "train_dataset": config.train_dataset,
        "test_dataset": config.test_dataset,
        "layer": config.layer,
        "multiplier": config.multiplier,
        "mean_logit_diff": result.metrics["mean_logit_diff"],
    }
    for config, result in results
]

df = pd.DataFrame(rows)
print(len(df))
df.head()

In [None]:
# How well does a single steering vector work across different test datasets?
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the rows whose steering vector was trained on power-seeking-inclination
data_df = df.loc[df["train_dataset"] == "power-seeking-inclination"].copy()

# Mean logit diff against multiplier, one labelled line per test dataset,
# all drawn on the same axes
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots()
sns.lineplot(
    data=data_df,
    x="multiplier",
    y="mean_logit_diff",
    hue="test_dataset",
    ax=ax,
)
ax.set_title("Transferring a steering vector to different contexts")


In [None]:
# How well do steering vectors from different train datasets work on one test dataset?
import seaborn as sns
import matplotlib.pyplot as plt

# Keep only the rows evaluated on power-seeking-inclination
data_df = df.loc[df["test_dataset"] == "power-seeking-inclination"].copy()

# Mean logit diff against multiplier, one labelled line per train dataset,
# all drawn on the same axes
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots()
sns.lineplot(
    data=data_df,
    x="multiplier",
    y="mean_logit_diff",
    hue="train_dataset",
    ax=ax,
)
ax.set_title("Transferring steering vectors from different contexts to the same test split")


In [None]:
# Crude steering-efficiency estimate:
#   (mean logit diff at multiplier +1) - (mean logit diff at multiplier -1)
# Group a copy of the aggregated frame so each group holds the rows of one
# (train dataset, test dataset, layer) combination.
data_df = df.copy()
group_df = data_df.groupby(by=["train_dataset", "test_dataset", "layer"])

def compute_signed_difference(row):
    """Return the steering efficiency of one group as a single-entry Series.

    Args:
        row: DataFrame with ``multiplier`` and ``mean_logit_diff`` columns.
            It must contain exactly one row with multiplier 1 and exactly one
            with multiplier -1.

    Returns:
        ``pd.Series`` with key ``steering_efficiency``: the mean logit diff at
        multiplier 1 minus the mean logit diff at multiplier -1.

    Raises:
        ValueError: from ``.item()`` when a multiplier matches zero or several rows.
    """

    def _logit_diff_at(multiplier_value):
        # .item() insists on exactly one matching row.
        return row.loc[row.multiplier == multiplier_value, "mean_logit_diff"].item()

    efficiency = _logit_diff_at(1) - _logit_diff_at(-1)
    return pd.Series({"steering_efficiency": efficiency})

# One steering-efficiency value per (train dataset, test dataset, layer) group
merge_keys = ["train_dataset", "test_dataset", "layer"]
data_df = group_df.apply(compute_signed_difference).reset_index()

# Merge back in the positive example and negative example.
# De-duplicate on the merge keys only: if the example text ever differed between
# multipliers, de-duplicating on all columns would keep several metadata rows per
# key, the merge would then repeat efficiency rows, and a later pivot on
# train/test dataset would fail on duplicate entries.
metadata_df = df[merge_keys + ["test_positive_example", "test_negative_example"]].drop_duplicates(subset=merge_keys)
data_df = data_df.merge(metadata_df, on=merge_keys)

print(len(data_df))
data_df.head()

In [None]:

# Steering efficiency as a train-by-test matrix:
# rows (y-axis) are train datasets, columns (x-axis) are test datasets,
# and each cell holds the steering efficiency for that pair
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_theme(style="whitegrid")
fig, ax = plt.subplots()
plot_df = data_df.pivot(
    index="train_dataset",
    columns="test_dataset",
    values="steering_efficiency",
)
sns.heatmap(plot_df, annot=True, cmap="YlGnBu", ax=ax)
ax.set_title("Steering efficiency when transferring between different contexts")

In [None]:
# Compare the steering vectors from different contexts. 

from repepo.steering.utils.helpers import load_activation
from repepo.steering.get_aggregator import get_aggregator
from steering_vectors import SteeringVector, LayerType
import torch
from typing import cast

def iter_train_configs():
    """Yield one SteeringConfig per entry of the module-level ``datasets`` list.

    Each config uses the same dataset for training and testing and a
    multiplier of 0. The train split, formatter and layer are the same values
    that ``iter_config`` uses.
    """
    # Settings that do not depend on the dataset.
    fixed_settings = dict(
        train_split="0%:+10",
        formatter="llama-chat-formatter",
        multiplier=0,
        test_split="40%:+10",
        test_completion_template="{prompt} My answer is: {response}",
        patch_generation_tokens_only=True,
        skip_first_n_generation_tokens=1,
    )
    for dataset_name in datasets:
        yield SteeringConfig(
            train_dataset=dataset_name,
            test_dataset=dataset_name,
            layer=layer,
            **fixed_settings,
        )

# Build one steering vector per training dataset; collected as (config, vector) pairs.
steering_vectors = []
for config in list(iter_train_configs()):
    print(config)
    # load_activation returns a (positive, negative) pair keyed by the config's train hash;
    # each element is passed to torch.concat below.
    # NOTE(review): presumably these are activations stored by an earlier sweep run — confirm.
    pos_acts, neg_acts = load_activation(config.train_hash)
    aggregator = get_aggregator(config.aggregator)
    # NOTE(review): EmptyTorchCUDACache presumably releases cached CUDA memory around this block — confirm.
    with EmptyTorchCUDACache():
        direction_vec = aggregator(torch.concat(pos_acts), torch.concat(neg_acts))
        steering_vector = SteeringVector(
            layer_activations={config.layer: direction_vec},
            # cast() only informs the type checker; config.layer_type is passed through unchanged.
            layer_type=cast(LayerType, config.layer_type),
        )

    # Keep the config with its vector: later cells read config.layer and config.train_dataset.
    steering_vectors.append((config, steering_vector))

In [None]:
# Compute the pairwise cosine similarity between steering vectors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Stack one flattened vector per row and compare every pair in a single call.
# Writing the (1, 1) array returned by a per-pair cosine_similarity call into a
# scalar matrix element is deprecated since NumPy 1.25, so the full
# (n_vectors, n_vectors) matrix is requested directly instead.
flat_vectors = np.stack([
    steering_vector.layer_activations[config.layer].detach().cpu().numpy().reshape(-1)
    for config, steering_vector in steering_vectors
])
cosine_similarities = cosine_similarity(flat_vectors)

# Plot heatmap of cosine similarities
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")
plt.figure()
# Use train_dataset as x and y labels; handing them to heatmap ties each label to its row/column
labels = [config.train_dataset for config, _vector in steering_vectors]
ax = sns.heatmap(
    cosine_similarities,
    annot=True,
    cmap="YlGnBu",
    xticklabels=labels,
    yticklabels=labels,
)
# Vertical x labels and horizontal y labels keep long dataset names readable
plt.setp(ax.get_xticklabels(), rotation=90)
plt.setp(ax.get_yticklabels(), rotation=0)
ax.set_title("Cosine similarity between steering vectors from different contexts")

In [None]:
# Compute the norm of every steering vector and visualize them in a bar plot
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")
plt.figure()
# Derive the labels here so this cell does not depend on a variable left over
# from the cosine-similarity cell
labels = [config.train_dataset for config, _vector in steering_vectors]
norms = [
    torch.norm(steering_vector.layer_activations[config.layer]).item()
    for config, steering_vector in steering_vectors
]
ax = sns.barplot(x=labels, y=norms)
ax.set_title("Norm of steering vectors from different contexts")
# Rotate the existing tick labels rather than re-setting them: set_xticklabels
# should only be used after the tick positions have been fixed
ax.tick_params(axis="x", labelrotation=90)