Define what results we want to load

In [None]:
# Set the WORK_DIR environment variable for this kernel session.
# NOTE(review): this is an absolute, machine-specific path; presumably the repepo
# helpers imported below read WORK_DIR to locate stored results — confirm, and
# adjust the path when running on another machine.
%env WORK_DIR = /home/daniel/ml_workspace/repepo/experiments

In [None]:
from repepo.steering.sweeps.evaluate_steering_efficiency_across_datasets import iter_configs
from repepo.steering.utils.helpers import load_eval_result, get_eval_result_path, get_experiment_path

# Pair every sweep config with its stored evaluation result. Configs whose
# result cannot be loaded are skipped (best effort) and reported together with
# the reason, so missing results can be told apart from real failures.
configs_and_results = []
for config in iter_configs():
    try:
        eval_result = load_eval_result(config.eval_hash)
    except Exception as err:
        # Catch `Exception` rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit still stop the loop.
        print(f"{config.eval_hash}: could not load eval result ({type(err).__name__}: {err})")
    else:
        configs_and_results.append((config, eval_result))
    


Pedagogical example of how to compute steering efficiency

In [None]:
#for config, result in configs_and_results:
#    print(f"{config.train_dataset}: \t logit_diff = {result.metrics['mean_logit_diff']:.2f} +/- {result.metrics['std_logit_diff']:.2f}")

# Construct a dataframe: one row per (config, result) pair, combining the sweep
# settings from the config with the summary metrics from the result.
import pandas as pd


def _as_record(config, result):
    """Flatten one (config, result) pair into a plain dict of column -> value."""
    metrics = result.metrics
    return {
        'train_dataset': config.train_dataset,
        'layer': config.layer,
        'multiplier': config.multiplier,
        'mean_logit_diff': metrics['mean_logit_diff'],
        'std_logit_diff': metrics['std_logit_diff'],
        'mean_pos_prob': metrics['mean_pos_prob'],
        'std_pos_prob': metrics['std_pos_prob'],
    }


df = pd.DataFrame([_as_record(config, result) for config, result in configs_and_results])

df.head()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_steering_efficiency(df, dataset, layer):
    """Plot mean logit difference against steering multiplier for one dataset and layer.

    Draws the per-multiplier points with error bars (``std_logit_diff``) and
    overlays a degree-1 least-squares fit of ``mean_logit_diff`` on
    ``multiplier``. A new figure is created; the caller is responsible for
    calling ``plt.show()``.

    Args:
        df: DataFrame with columns ``train_dataset``, ``layer``, ``multiplier``,
            ``mean_logit_diff`` and ``std_logit_diff``.
        dataset: Value of ``train_dataset`` to select.
        layer: Value of ``layer`` to select.

    Raises:
        ValueError: If no rows match ``dataset`` and ``layer``.
    """
    data_df = df[(df.train_dataset == dataset) & (df.layer == layer)]
    if data_df.empty:
        raise ValueError(f"No results for dataset={dataset!r}, layer={layer!r}")

    # Fit on the row-aligned (multiplier, mean_logit_diff) pairs. Fitting against
    # the sorted/unique multipliers instead would pair x and y values from
    # different rows whenever the rows are not already sorted by multiplier.
    slope, intercept = np.polyfit(data_df.multiplier, data_df.mean_logit_diff, 1)

    # Sorted unique multipliers are only used as the x-grid for drawing the fit line.
    multipliers = np.sort(data_df.multiplier.unique())

    plt.figure(figsize=(10, 6))
    # data_df holds a single layer (filtered above), so one errorbar series and
    # one fit line are enough.
    plt.errorbar(data_df.multiplier, data_df.mean_logit_diff, yerr=data_df.std_logit_diff, fmt='o', label=layer)
    plt.plot(multipliers, slope * multipliers + intercept, label='Linear fit')
    plt.xlabel('Steering multiplier')
    plt.ylabel('Mean logit difference')
    plt.title(f'Steering efficiency for {dataset}')
    plt.legend()

# Draw one steering-efficiency figure per training dataset, all at the same layer.
layer = 13
datasets = df.train_dataset.unique()
for dataset in datasets:
    plot_steering_efficiency(df, dataset, layer)
    plt.show()

Compute the steering efficiency

In [None]:

# Bucket the rows so that every (train_dataset, layer) pair gets its own fit
grouped = df.groupby(by=['train_dataset', 'layer'])
# Fit a linear model of (mean logit diff) vs (multiplier)
import numpy as np

def compute_steering_efficiency(row):
    """Fit ``mean_logit_diff`` linearly against ``multiplier`` for one group.

    Args:
        row: Group of results (used with ``groupby(...).apply``) exposing
            ``multiplier`` and ``mean_logit_diff`` columns.

    Returns:
        pd.Series with ``steering_efficiency`` (slope of the degree-1
        least-squares fit) and ``residuals`` (Euclidean norm of the fit
        residuals, i.e. the square root of the sum of squared errors).
    """
    x = np.asarray(row.multiplier, dtype=float)
    y = np.asarray(row.mean_logit_diff, dtype=float)
    slope, intercept = np.polyfit(x, y, 1)
    # Compute the residual norm from the fitted line directly. The residual
    # array returned by ``np.polyfit(..., full=True)`` is empty when the group
    # has <= 2 points or the fit is rank-deficient, so calling ``.item()`` on it
    # raises for such groups.
    residual_norm = float(np.sqrt(np.sum((y - (slope * x + intercept)) ** 2)))
    return pd.Series({'steering_efficiency': slope, 'residuals': residual_norm})

steering_efficiency_df = grouped.apply(compute_steering_efficiency)
# Merge the per-group fit results back into the original df. Fit columns left
# over from a previous run of this cell are dropped first; otherwise re-running
# the cell would produce `_x`/`_y` suffixed duplicates and later cells that
# select `steering_efficiency` / `residuals` would fail.
df = (
    df.drop(columns=list(steering_efficiency_df.columns), errors='ignore')
    .merge(steering_efficiency_df, left_on=['train_dataset', 'layer'], right_index=True)
)
df.head()

In [None]:
# Reduce to one row per (dataset, layer) carrying the fitted steering efficiency
data_df = df[['train_dataset', 'layer', 'steering_efficiency']].drop_duplicates()
print(len(data_df))

# Horizontal bar chart of steering efficiency, one bar per training dataset
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")
plot = sns.catplot(
    x="steering_efficiency", y="train_dataset", hue="train_dataset",
    data=data_df, kind="bar",
    height=6, alpha=.6, palette="dark",
)
plt.title('Steering efficiency across datasets')


In [None]:
# Horizontal bar chart of the linear-fit residuals, one bar per training dataset
residuals_df = df[['train_dataset', 'layer', 'residuals']].drop_duplicates()
plot = sns.catplot(
    x="residuals", y="train_dataset", hue="train_dataset",
    data=residuals_df, kind="bar",
    height=6, alpha=.6, palette="dark",
)
plt.title('Residuals error of linear fit of steering efficiency')

In [None]:
# Ratio of steering efficiency to fit residuals per (dataset, layer), then plot it
efficiency_div_residuals = (
    data_df.merge(residuals_df, on=['train_dataset', 'layer'])
    .assign(efficiency_div_residuals=lambda merged: merged.steering_efficiency / merged.residuals)
    .drop(columns=['steering_efficiency', 'residuals'])
    .drop_duplicates()
)
plot = sns.catplot(
    x="efficiency_div_residuals", y="train_dataset", hue="train_dataset",
    data=efficiency_div_residuals, kind="bar",
    height=6, alpha=.6, palette="dark",
)

In [None]:
# # Visualize self-replication, which had highest efficiency divided by residuals
# self_replication_df = df[df.train_dataset == 'self-replication']
# self_replication_df = self_replication_df[['layer', 'multiplier', 'mean_logit_diff', 'std_logit_diff']]
# self_replication_df = self_replication_df.drop_duplicates()
# multipliers = self_replication_df.multiplier.unique()
# multipliers.sort()
# slope, intercept = np.polyfit(multipliers, self_replication_df.mean_logit_diff, 1)
# plt.figure()
# plt.title("Self-replication")
# plt.errorbar(self_replication_df.multiplier, self_replication_df.mean_logit_diff, yerr=self_replication_df.std_logit_diff, fmt='o', label='Self-replication')