# Ablate Aggregation Method

The goal of this notebook is to compare how different aggregation methods for extracting steering vectors (mean difference vs. logistic regression) affect the steering metric.



# 0. Run Experiment

In [None]:
from repepo.steering.sweeps.constants import (
    ALL_ABSTRACT_CONCEPT_DATASETS,
    ALL_TOKEN_CONCEPT_DATASETS, 
    ALL_LANGUAGES,
    ALL_LLAMA_7B_LAYERS,
    ALL_MULTIPLIERS
)

from repepo.steering.sweeps.configs import (
    get_abstract_concept_config,
    get_token_concept_config
)

from repepo.steering.run_sweep import (
    run_sweep, 
    load_sweep_results
)

from repepo.steering.plots.utils import (
    get_config_fields,
    make_results_df
)

In [None]:
# Define the sweep to run over. 

from itertools import product

# Small sweep grid for a quick end-to-end run: one dataset, one layer,
# three multipliers and the two aggregation methods being compared.
debug_setting = dict(
    datasets=["power-seeking-inclination"],
    layers=[13],
    multipliers=[-1.0, 0.0, 1.0],
    aggregators=["mean", "logistic"],
)


def iter_config(setting):
    """Lazily yield one abstract-concept config per point of the sweep grid.

    ``setting`` must provide the keys ``"datasets"``, ``"layers"``,
    ``"multipliers"`` and ``"aggregators"``. The Cartesian product of those
    four sequences is walked with the dataset varying slowest and the
    aggregator varying fastest, and each combination is passed to
    ``get_abstract_concept_config``.
    """
    axis_names = ("datasets", "layers", "multipliers", "aggregators")
    grid = product(*(setting[axis] for axis in axis_names))
    for dataset_name, layer_index, multiplier_value, aggregator_name in grid:
        yield get_abstract_concept_config(
            dataset=dataset_name,
            layer=layer_index,
            multiplier=multiplier_value,
            aggregator=aggregator_name,
        )


In [None]:
# Optionally, run the sweep and load results.
# If sweep was already run, set RUN = False.
RUN = True

# Materialise the generator into a list so the exact same configs are used
# both for running the sweep and for loading its results.
configs = list(iter_config(debug_setting))
if RUN:
    # NOTE(review): both force_rerun_* flags are enabled; presumably this
    # bypasses any cached extraction/application results — confirm in run_sweep.
    run_sweep(configs, force_rerun_extract=True, force_rerun_apply=True)

# Results are loaded regardless of RUN, so they must already exist when RUN is False.
results = load_sweep_results(configs)

In [None]:
# Construct a DataFrame from the results.
df = make_results_df(results)
# Print the row count as a quick sanity check, then preview the first rows.
print(len(df))
df.head()

# 1. Analysis

In [None]:
# Plot the change in positive and negative token log-probabilities for one example.

import seaborn as sns 
import seaborn.objects as so
import matplotlib.pyplot as plt
sns.set_theme()

def plot(df):
    """Plot token log-probabilities against the multiplier for a single example.

    The first row of ``df`` selects the example; every row sharing its
    ``test_positive_example.text`` is plotted, with one colour per
    aggregator. Solid lines show ``test_positive_token.logprob`` and dashed
    lines show ``test_negative_token.logprob``.
    """
    example = df.iloc[0]
    df = df[df["test_positive_example.text"] == example["test_positive_example.text"]]

    fig, ax = plt.subplots(1, 1, figsize=(10, 5))
    # Positive token log-probability: solid lines; this call owns the legend.
    sns.lineplot(
        data=df,
        x="multiplier",
        y="test_positive_token.logprob",
        hue="aggregator",
        ax=ax,
    )
    # Negative token log-probability: dashed, so it can be told apart from the
    # positive curve of the same aggregator (same convention as the logit
    # plot). The legend is suppressed here so that the aggregator entries are
    # not listed a second time.
    sns.lineplot(
        data=df,
        x="multiplier",
        y="test_negative_token.logprob",
        hue="aggregator",
        linestyle="--",
        legend=False,
        ax=ax,
    )
    # The automatic y label would name only one of the two plotted columns.
    ax.set_ylabel("token log-probability")
    ax.set_title("Token log-probability vs. multiplier (solid: positive, dashed: negative)")

plot(df)

In [None]:
# Plot the change in positive token logit and negative token logit for one example. 

import seaborn as sns 
import seaborn.objects as so
import matplotlib.pyplot as plt

def plot(df):
    """Plot token logits against the multiplier for a single example.

    The first row of ``df`` selects the example; every row sharing its
    ``test_positive_example.text`` is plotted, with one colour per
    aggregator. Solid lines show ``test_positive_token.logit`` and dashed
    lines show ``test_negative_token.logit``.
    """
    example = df.iloc[0]
    df = df[df["test_positive_example.text"] == example["test_positive_example.text"]]

    fig, ax = plt.subplots(1, 1, figsize=(10, 5))

    # Note: seaborn.objects (so.Plot(...).add(so.Line())) was tried, but it was
    # unclear how to draw both the positive and the negative token logit on the
    # same plot, hence the two lineplot calls on a shared axis.
    sns.lineplot(
        data=df,
        x="multiplier",
        y="test_positive_token.logit",
        hue="aggregator",
        ax=ax,
    )
    # The legend is suppressed on the second call so that the aggregator
    # entries are not listed twice; the dashed style marks the negative token.
    sns.lineplot(
        data=df,
        x="multiplier",
        y="test_negative_token.logit",
        hue="aggregator",
        linestyle="--",
        legend=False,
        ax=ax,
    )
    # The automatic y label would name only one of the two plotted columns.
    ax.set_ylabel("token logit")
    ax.set_title("Token logit vs. multiplier (solid: positive, dashed: negative)")

plot(df)

In [None]:
import pandas as pd
import numpy as np

def calculate_steering_efficiency(
    df: pd.DataFrame,
    base_metric_name: str = "logit_diff",
    config_fields: "list[str] | None" = None,
):
    """Attach a per-example linear fit of the base metric against the multiplier.

    Rows are grouped by every config field except ``multiplier`` plus
    ``test_positive_example.text``. Within each group a degree-1 polynomial is
    fitted to ``base_metric_name`` as a function of ``multiplier``, and the
    fit is merged back onto every row of the group.

    Args:
        df: Results frame. It must contain the grouping columns,
            ``multiplier`` and ``base_metric_name``. It is not modified.
        base_metric_name: Column holding the metric to regress on the multiplier.
        config_fields: Columns identifying a run. Defaults to
            ``get_config_fields()``. ``multiplier`` is excluded from the
            grouping automatically.

    Returns:
        A new DataFrame with the columns of ``df`` plus ``slope`` (fitted
        slope) and ``residual`` (sum of squared residuals of the fit; 0.0 for
        a two-point group, where the line passes through both points).

    Raises:
        ValueError: If a group has fewer than two multipliers or contains the
            same multiplier more than once.
    """
    if config_fields is None:
        config_fields = get_config_fields()
    # Build a fresh list instead of calling .remove() on the helper's return
    # value, so a list shared with other callers is never mutated.
    fields_to_group_by = [field for field in config_fields if field != "multiplier"]
    if "test_positive_example.text" not in fields_to_group_by:
        fields_to_group_by.append("test_positive_example.text")

    fit_columns = ["slope", "residual"]
    # Drop stale fit columns so that re-running on an already-augmented frame
    # does not produce suffixed slope_x / slope_y columns after the merge.
    # DataFrame.drop returns a new frame, so the caller's frame stays intact.
    df = df.drop(columns=fit_columns, errors="ignore")

    def fit_linear_regression(group: pd.DataFrame, keys):
        # Fit a linear regression of the base metric on the multiplier and
        # return (slope, sum of squared residuals).
        x = group["multiplier"].to_numpy(dtype=float)
        y = group[base_metric_name].to_numpy(dtype=float)
        distinct = np.unique(x).size
        if distinct < 2 or distinct != x.size:
            raise ValueError(
                f"Expected at least 2 distinct, non-repeated multipliers in group "
                f"{keys!r}, got {x.tolist()}"
            )
        (slope, _intercept), residuals, _, _, _ = np.polyfit(x, y, 1, full=True)
        # np.polyfit returns an empty residual array when the system is exactly
        # determined (two points); the fit is then exact.
        residual = residuals.item() if residuals.size else 0.0
        return float(slope), float(residual)

    records = []
    for keys, group in df.groupby(fields_to_group_by):
        if not isinstance(keys, tuple):
            # Some pandas versions yield a scalar key for a single grouping column.
            keys = (keys,)
        records.append(keys + fit_linear_regression(group, keys))

    if not records:
        # Nothing to fit (empty frame, or every group key was missing).
        return df.assign(slope=np.nan, residual=np.nan).reset_index(drop=True)

    fits = pd.DataFrame.from_records(records, columns=fields_to_group_by + fit_columns)
    # Align key dtypes with the left frame so the merge keys compare cleanly.
    fits = fits.astype({field: df[field].dtype for field in fields_to_group_by})
    return df.merge(fits, on=fields_to_group_by, how="left")

# Attach the per-example slope/residual of the linear fit to every row.
df = calculate_steering_efficiency(df)
# The fit is merged back with how='left', so the row count should be unchanged.
print(len(df))

# Scatter plot of the slopes and residuals
# Each example contributes one point per multiplier row, all at the same
# (slope, residual) position, because the fit is repeated on every row.
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=df, x="slope", y="residual", hue="aggregator", ax=ax)

Remarks
- Why is logistic regression so bad at extracting a steering vector? This looks like it must be a bug in my code.


Sanity checks
- What's the similarity of the steering vectors?
- Is the mean-diff vector able to act as a concept classifier?

## 1.1 Compare steering vectors

Here, we compare, in terms of cosine similarity, the steering vectors obtained via the following two methods:
- Mean difference
- Logistic regression

In [None]:
from repepo.steering.utils.helpers import load_steering_vector, SteeringConfig

# Keep one row per distinct training config (first occurrence of each train_hash).
unique_train_hashes = df.drop_duplicates(subset='train_hash')

# Map every distinct train hash to the steering vector loaded for it.
steering_vectors = {
    train_hash: load_steering_vector(train_hash)
    for train_hash in unique_train_hashes['train_hash']
}

print(len(steering_vectors))

In [None]:
from torch.nn.functional import cosine_similarity

# Take the train hash of the first row recorded for each aggregator.
mean_aggregator_hash = df.loc[df['aggregator'] == 'mean', 'train_hash'].iloc[0]
logistic_aggregator_hash = df.loc[df['aggregator'] == 'logistic', 'train_hash'].iloc[0]

# Look up the steering vectors that were loaded for those two hashes.
mean_steering_vector = steering_vectors[mean_aggregator_hash]
logistic_steering_vector = steering_vectors[logistic_aggregator_hash]

# Cosine similarity between the mean and logistic steering vectors at the
# steered layer (13 is the only layer listed in debug_setting["layers"]).
steering_layer = 13
cosine_similarity(
    mean_steering_vector.layer_activations[steering_layer],
    logistic_steering_vector.layer_activations[steering_layer],
    dim=0,
)

# Conclusion (6 Apr)

- For now, it seems that the steering vectors (SVs) found by logistic regression are significantly different from the SVs found via mean difference, and it is unclear why this is the case.
- One reason may be the decomposed model we discussed earlier; this still needs to be validated.