In [None]:
# Set the WORK_DIR environment variable for this kernel session (IPython %env magic).
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
%env WORK_DIR = /home/daniel/ml_workspace/repepo/experiments

In [None]:
from repepo.steering.run_experiment import run_experiment

In [None]:
import itertools
from repepo.steering.utils.helpers import SteeringConfig, EmptyTorchCUDACache
from repepo.steering.sweeps.constants import ALL_TOKEN_CONCEPT_DATASETS, ALL_LLAMA_7B_LAYERS, ALL_MULTIPLIERS
from repepo.steering.sweeps.configs import get_abstract_concept_config

# Sweep grid: iter_config() takes the Cartesian product of the lists below and
# yields one SteeringConfig per combination.
datasets = ["power-seeking-inclination"]
layers = [13]
multipliers = [-1, 0, 1]
# Later cells treat "0%:+N" as the N-sample train split (N = 1, 3, 10, 30, 100).
train_splits = ["0%:+1", "0%:+3", "0%:+10", "0%:+30", "0%:+100"]
# NOTE(review): presumably 10 examples starting 40% into the dataset — confirm against the split parser.
test_splits = ["40%:+10"]

def iter_config():
    """Yield one SteeringConfig for each point of the sweep grid.

    The grid is the Cartesian product of the module-level ``datasets``,
    ``layers``, ``multipliers``, ``train_splits`` and ``test_splits`` lists.
    Every config trains and tests on the same dataset.
    """
    # Settings that are identical for every point of the grid.
    fixed_settings = dict(
        formatter="llama-chat-formatter",
        test_completion_template="{prompt} My answer is: {response}",
        patch_generation_tokens_only=True,
        skip_first_n_generation_tokens=1,
    )
    grid = itertools.product(datasets, layers, multipliers, train_splits, test_splits)
    for dataset_name, layer_idx, mult, train_part, test_part in grid:
        yield SteeringConfig(
            train_dataset=dataset_name,
            train_split=train_part,
            layer=layer_idx,
            multiplier=mult,
            test_dataset=dataset_name,
            test_split=test_part,
            **fixed_settings,
        )

In [None]:
from repepo.steering.run_sweep import run_sweep, load_sweep_results

# Set RUN = True to execute the sweep; with RUN = False the sweep is skipped and
# only load_sweep_results is called.
# NOTE(review): presumably load_sweep_results reads results saved by an earlier run — confirm.
RUN = False
# Materialize the generator so the same config list is passed to both calls below.
configs = list(iter_config())

if RUN:
    run_sweep(configs)

results = load_sweep_results(configs)

In [None]:
# Aggregate the data into a dataframe

import pandas as pd
from dataclasses import asdict
# One row per (config, result) pair: every config field, plus the text of the
# first prediction's positive/negative example and the mean logit diff metric.
rows = [
    {
        **asdict(config),
        "test_positive_example": result.predictions[0].positive_output_prob.text,
        "test_negative_example": result.predictions[0].negative_output_prob.text,
        "mean_logit_diff": result.metrics['mean_logit_diff'],
    }
    for config, result in results
]

df = pd.DataFrame(rows)
print(len(df))
df.head()

In [None]:
# Visualize important dataset metadata
from IPython.display import HTML

train_fields = ['model_name', 'train_dataset', 'train_split', 'train_completion_template', 'formatter', 'aggregator', 'layer', 'layer_type']

# Compute the distinct training configurations once. A bare
# `df[...].drop_duplicates()` in the middle of a cell is evaluated and then
# discarded (only the last expression of a cell is displayed), so the table is
# built a single time and rendered below.
unique_train_configs = df[train_fields].drop_duplicates()

# Render through HTML so the table is displayed without the DataFrame index.
HTML(unique_train_configs.to_html(index=False))

In [None]:

# Group results by the training fields, which together identify one steering vector:
# train_dataset, train_split, train_completion_template, formatter, aggregator, layer, layer_type.
# This rebinds train_fields without 'model_name'; later cells reuse this list.
train_fields = ['train_dataset', 'train_split', 'train_completion_template', 'formatter', 'aggregator', 'layer', 'layer_type']
grouped = df.groupby(train_fields)

# Fit a linear model of (mean logit diff) vs (multiplier)
import numpy as np
def compute_steering_efficiency(row, value_col='mean_logit_diff'):
    """Fit a straight line of ``value_col`` against ``multiplier`` for one group.

    Args:
        row: Despite the name, this is the sub-DataFrame that
            ``groupby(...).apply`` passes in for one group. It must have a
            ``multiplier`` column and a ``value_col`` column.
        value_col: Name of the column used as the dependent variable.

    Returns:
        A ``pd.Series`` with two entries:
        ``steering_efficiency`` is the slope of the least-squares line, and
        ``residuals`` is the square root of the summed squared residuals of
        the fit, or NaN when the fit does not report a residual.
    """
    x = row.multiplier
    y = row[value_col]
    coeffs, res, _rank, _sv, _rcond = np.polyfit(x, y, 1, full=True)
    slope = coeffs[0]
    # With full=True the residual array is empty when the group has no more
    # points than coefficients, or when the fit is rank-deficient. Calling
    # .item() on it would raise, so report NaN instead.
    res = np.asarray(res)
    residual_norm = float(np.sqrt(res[0])) if res.size else float('nan')
    return pd.Series({'steering_efficiency': slope, 'residuals': residual_norm})

# Fit one line per steering vector (one group per combination of train fields).
steering_efficiency_df = grouped.apply(compute_steering_efficiency)

# Merge the per-group fit back onto every row of df. Columns left over from an
# earlier run of this cell are dropped first; without that, re-running the cell
# makes merge() emit suffixed duplicates (steering_efficiency_x / _y) and the
# plain 'steering_efficiency' column that later cells read disappears.
fit_columns = ['steering_efficiency', 'residuals']
df = df.drop(columns=fit_columns, errors='ignore').merge(
    steering_efficiency_df, left_on=train_fields, right_index=True
)

print(len(df))
df.head()

In [None]:
# Derive a readable sample-size label from train_split, e.g. "0%:+30" -> "30".
# regex=False forces a literal match. Read as a regular expression, '0%:+'
# means "0%" followed by one or more ':' and would leave the '+' in the label
# ("+30"); pandas versions before 2.0 treat the pattern as a regex by default.
df['train_sample_size'] = df['train_split'].str.replace('0%:+', '', regex=False)

# Plot mean logit diff vs multiplier
import seaborn as sns
import matplotlib.pyplot as plt

# One line per train sample size, drawn on an explicitly created axes.
fig, ax = plt.subplots()
sns.lineplot(data=df, x='multiplier', y='mean_logit_diff', hue='train_sample_size', ax=ax)
ax.set_title('Mean logit diff vs multiplier for different train sample sizes')

In [None]:
# Keep one row per (dataset, sample size, layer) together with its steering efficiency.
summary_columns = ['train_dataset', 'train_sample_size', 'layer', 'steering_efficiency']
data_df = df[summary_columns].drop_duplicates()
print(len(data_df))

# Bar chart of steering efficiency for each train sample size.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="train_sample_size", y="steering_efficiency", data=data_df, ax=ax)
ax.set_title("Steering efficiency vs train_sample_size")


# Compute sample-level steering metrics

In [None]:
# Aggregate the data into a dataframe

import pandas as pd
from dataclasses import asdict
# One row per individual prediction: the config fields are shared by every
# prediction of a result, and each row adds that prediction's example text and
# logit diff.
rows = []
for config, result in results:
    shared_fields = asdict(config)
    rows.extend(
        {
            **shared_fields,
            "test_positive_example": pred.positive_output_prob.text,
            "test_negative_example": pred.negative_output_prob.text,
            "logit_diff": pred.metrics['logit_diff'],
        }
        for pred in result.predictions
    )

df = pd.DataFrame(rows)
print(len(df))
df.head()

In [None]:
# Show every distinct (positive, negative) test example pair as an HTML table.
example_columns = ['test_positive_example', 'test_negative_example']
temp_df = df[example_columns].drop_duplicates()
print(len(temp_df))

table_html = temp_df.to_html(index=False)
# Turn the literal backslash-n sequences into line breaks.
table_html = table_html.replace("\\n", "<br>")
# Left-align the cell text.
table_html = table_html.replace('<td>', '<td align="left">')
HTML(table_html)

In [None]:
# To compare a single test example across multipliers under the same steering
# vector, group by the train fields (which identify the vector) together with
# the example's positive/negative text.
example_group_keys = train_fields + ['test_positive_example', 'test_negative_example']
grouped_df = df.groupby(example_group_keys)

# Display the rows of the first group.
first_key = list(grouped_df.groups)[0]
grouped_df.get_group(first_key)


In [None]:
def compute_steering_efficiency(row, value_col='logit_diff'):
    """Fit a straight line of ``value_col`` against ``multiplier`` for one group.

    Args:
        row: Despite the name, this is the sub-DataFrame that
            ``groupby(...).apply`` passes in for one group. It must have a
            ``multiplier`` column and a ``value_col`` column.
        value_col: Name of the column used as the dependent variable.

    Returns:
        A ``pd.Series`` with two entries:
        ``steering_efficiency`` is the slope of the least-squares line, and
        ``residuals`` is the square root of the summed squared residuals of
        the fit, or NaN when the fit does not report a residual.
    """
    x = row.multiplier
    y = row[value_col]
    coeffs, res, _rank, _sv, _rcond = np.polyfit(x, y, 1, full=True)
    slope = coeffs[0]
    # With full=True the residual array is empty when the group has no more
    # points than coefficients, or when the fit is rank-deficient. Calling
    # .item() on it would raise, so report NaN instead.
    res = np.asarray(res)
    residual_norm = float(np.sqrt(res[0])) if res.size else float('nan')
    return pd.Series({'steering_efficiency': slope, 'residuals': residual_norm})

# Fit one line per (steering vector, test example) group.
steering_efficiency_df = grouped_df.apply(compute_steering_efficiency)

# Merge the per-group fit back onto every row of df. Columns left over from an
# earlier run of this cell are dropped first; without that, re-running the cell
# makes merge() emit suffixed duplicates (steering_efficiency_x / _y) and the
# plain 'steering_efficiency' column that later cells read disappears.
fit_columns = ['steering_efficiency', 'residuals']
merge_keys = train_fields + ['test_positive_example', 'test_negative_example']
df = df.drop(columns=fit_columns, errors='ignore').merge(
    steering_efficiency_df, left_on=merge_keys, right_index=True
)
print(len(df))


In [None]:

# Narrow the frame to the columns used by the per-example plots and drop repeated rows.
plot_columns = [
    'train_split',
    'test_positive_example',
    'test_negative_example',
    'multiplier',
    'logit_diff',
    'steering_efficiency',
]
df = df[plot_columns].drop_duplicates()

print(len(df))
df.head()

In [None]:
# The example text is too long to use as a label, so give each distinct
# positive example a short integer code (stored as a string).
df['test_index'] = df.test_positive_example.astype('category').cat.codes.astype(str)

# Rows for the 1-sample train split only, ordered by example label.
is_one_sample = df.train_split == "0%:+1"
temp_df = df[is_one_sample].sort_values('test_index')
print(len(temp_df))
# Number of distinct test examples
print(len(temp_df.test_index.unique()))

# One logit-diff-vs-multiplier line per test example (seaborn objects interface).
import seaborn.objects as so
p = so.Plot(temp_df, x='multiplier', y='logit_diff').add(so.Line(), color='test_index')
p.label(
    title="Per-example logit diff vs multiplier, using the 1-sample train split",
    color='Test example index',
)

In [None]:
# Rows for the 100-sample train split only, ordered by example label.
is_hundred_sample = df.train_split == "0%:+100"
temp_df = df[is_hundred_sample].sort_values('test_index')
print(len(temp_df))
# Number of distinct test examples
print(len(temp_df.test_index.unique()))

# One logit-diff-vs-multiplier line per test example (seaborn objects interface).
import seaborn.objects as so
p = so.Plot(temp_df, x='multiplier', y='logit_diff').add(so.Line(), color='test_index')
p.label(
    title="Per-example logit diff vs multiplier, using the 100-sample train split",
    color='Test example index',
)