What happens when we use a random steering vector for steering?

In [None]:
# NOTE: We need a randomly sampled steering vector rather than an extracted one, so we re-define the run_experiment function here

import logging
import sys
import torch
import functools

from typing import cast
from pprint import pformat
from repepo.core.pipeline import Pipeline
from repepo.steering.utils.helpers import (
    SteeringConfig,
    EmptyTorchCUDACache,
    get_model_and_tokenizer,
    get_formatter,
    make_dataset,
    get_experiment_path,
    get_eval_result_path,
    save_eval_result,
    load_eval_result,
    get_activation_path,
    save_activation,
    load_activation,
    save_metric,
)

from repepo.steering.build_steering_training_data import (
    build_steering_vector_training_data,
)
from repepo.steering.concept_metrics import (
    VarianceOfNormSimilarityMetric,
    EuclideanSimilarityMetric,
    CosineSimilarityMetric,
    compute_difference_vectors,
)

from repepo.steering.utils.database import SteeringConfigDatabase

from steering_vectors.train_steering_vector import (
    extract_activations,
    SteeringVector,
    LayerType,
)

from repepo.core.evaluate import EvalResult
from repepo.steering.get_aggregator import get_aggregator
from repepo.steering.evaluate_steering_vector import (
    evaluate_steering_vector,
)

from repepo.steering.run_experiment import setup_logger



def run_experiment_with_random_steering_vector(
    config: SteeringConfig,
    force_rerun: bool = False,
    logging_level: str = "INFO",
) -> EvalResult:
    """Evaluate a randomly sampled steering vector for a single config.

    Instead of training a steering vector, a standard-normal vector is drawn
    with ``torch.randn`` and applied at ``config.layer`` with
    ``config.multiplier`` while evaluating on
    ``config.test_dataset`` / ``config.test_split``.

    Args:
        config: Steering configuration. Only the model name, formatter,
            layer, layer type, multiplier, test dataset/split, completion
            template and the two generation-token patching options are read.
        force_rerun: Unused. Nothing is cached or loaded from disk in this
            function, so every call re-runs the evaluation. Kept so the
            signature stays call-compatible with existing callers.
        logging_level: Level name forwarded to ``setup_logger``.

    Returns:
        The single ``EvalResult`` produced for ``config.multiplier``.

    Raises:
        AssertionError: If the evaluation does not return exactly one result.
    """
    # Width used when the model does not expose ``config.hidden_size``.
    default_hidden_size = 4096

    # Set up logger
    logger = setup_logger(logging_level)
    logger.info(f"Running experiment with config: \n{pformat(config)}")

    # Set up pipeline
    model, tokenizer = get_model_and_tokenizer(config.model_name)
    formatter = get_formatter(config.formatter)
    pipeline = Pipeline(model, tokenizer, formatter=formatter)

    # Size the random vector from the loaded model rather than assuming a
    # width of 4096, so models with a different hidden size also work.
    # NOTE(review): assumes a Hugging Face-style ``model.config.hidden_size``;
    # the previous hard-coded value is used when it is missing.
    hidden_size = getattr(getattr(model, "config", None), "hidden_size", None)
    if hidden_size is None:
        hidden_size = default_hidden_size

    # Initialize a random steering vector.
    # NOTE(review): no seed is set here, so each call draws a different
    # vector; results across multipliers do not share one direction.
    layer_activations = {config.layer: torch.randn(hidden_size)}
    steering_vector = SteeringVector(
        layer_activations=layer_activations,
        layer_type=cast(LayerType, config.layer_type),
    )

    # Evaluate steering vector
    test_dataset = make_dataset(config.test_dataset, config.test_split)
    with EmptyTorchCUDACache():
        eval_results = evaluate_steering_vector(
            pipeline=pipeline,
            steering_vector=steering_vector,
            dataset=test_dataset,
            layers=[config.layer],
            multipliers=[config.multiplier],
            completion_template=config.test_completion_template,
            patch_generation_tokens_only=config.patch_generation_tokens_only,
            skip_first_n_generation_tokens=config.skip_first_n_generation_tokens,
            logger=logger,
        )
        # One layer and one multiplier were requested, so exactly one
        # result is expected back.
        assert len(eval_results) == 1, "Expected one result"
        eval_result = eval_results[0]

    return eval_result


In [None]:
import itertools
from repepo.steering.sweeps.constants import ALL_ABSTRACT_CONCEPT_DATASETS

# Sweep definition: every abstract-concept dataset is evaluated at one fixed
# layer, once for each steering multiplier listed below.
datasets = ALL_ABSTRACT_CONCEPT_DATASETS
layer = 13
multipliers = [-2, -1, 0, 1, 2]


def iter_config():
    """Yield one SteeringConfig per (dataset, multiplier) combination.

    Combinations come from ``itertools.product(datasets, multipliers)``, so
    all multipliers for one dataset are yielded before the next dataset.
    The same dataset name is used for both the train and test fields.
    """
    sweep = itertools.product(datasets, multipliers)
    for dataset_name, strength in sweep:
        yield SteeringConfig(
            # What is being steered, and how strongly.
            layer=layer,
            multiplier=strength,
            formatter="llama-chat-formatter",
            # Split specifiers are passed through unchanged.
            # NOTE(review): presumably "start%:+count" — confirm against make_dataset.
            train_dataset=dataset_name,
            train_split="0%:+10",
            test_dataset=dataset_name,
            test_split="40%:+10",
            # Evaluation-time prompt and patching options.
            test_completion_template="{prompt} My answer is: {response}",
            patch_generation_tokens_only=True,
            skip_first_n_generation_tokens=1,
        )

# Run the random-vector baseline for every config in the sweep and keep each
# config paired with its evaluation result for the aggregation step.
results = []
for config in iter_config():
    # NOTE(review): EmptyTorchCUDACache presumably frees cached CUDA memory
    # around each run — confirm in repepo.steering.utils.helpers.
    with EmptyTorchCUDACache():
        result = run_experiment_with_random_steering_vector(
            config=config,
            force_rerun=True,
            logging_level="INFO",
        )
        results.append(
            (config, result)
        )


In [None]:
import pandas as pd 

# Aggregate results: one record per (config, result) pair, keeping only the
# fields needed for plotting.
rows = [
    {
        'dataset': config.test_dataset,
        'multiplier': config.multiplier,
        'mean_logit_diff': result.metrics['mean_logit_diff'],
    }
    for config, result in results
]

df = pd.DataFrame(rows)
# Last expression of the cell, so the notebook displays the first rows.
df.head()

In [None]:
# Plot logit diff for each dataset.
import seaborn as sns
import matplotlib.pyplot as plt
# One shared axes; each dataset contributes its own labelled line of
# mean logit diff against the steering multiplier.
fig, ax = plt.subplots(figsize=(10, 6))
for dataset, group in df.groupby('dataset'):
    sns.lineplot(
        data=group,
        x='multiplier',
        y='mean_logit_diff',
        label=dataset,
        ax=ax,
    )