Define what results we want to load

In [None]:
%env WORK_DIR = /home/daniel/ml_workspace/repepo/experiments

In [None]:
from repepo.steering.sweeps.evaluate_steering_efficiency_across_datasets import iter_configs
from repepo.steering.run_sweep import run_sweep, load_sweep_results

# Set RUN to True to execute the sweep before loading; when False the sweep
# step is skipped and results are only loaded.
RUN = False

# Materialise the configurations once so the same list drives both the
# optional run and the loading step.
configs = [*iter_configs()]
if RUN:
    run_sweep(configs)

results = load_sweep_results(configs)
    


In [None]:
from repepo.steering.plots.utils import make_results_df, get_config_fields

# Build the results frame from the loaded sweep results, report its row
# count, and preview the first rows.
df = make_results_df(results)
print(df.shape[0])
df.head()

In [None]:
from repepo.utils.stats import bernoulli_js_dist

# Compute the sample-wise Jensen-Shannon divergence between each multiplier and the zero multiplier

# Grouping keys: every config field plus the positive/negative test example,
# with "multiplier" taken out so that rows differing only in multiplier end
# up in the same group.
config_fields = get_config_fields()
fields = [*config_fields, "test_positive_example", "test_negative_example"]
fields.remove("multiplier")

def compute_js_div(group):
    """Compare every row's ``pos_prob`` with the group's zero-multiplier baseline.

    A group holds rows that share the same steering vector and evaluation
    example but differ in ``multiplier``.  The ``pos_prob`` of the first row
    with ``multiplier == 0`` is used as the reference, and each row's
    ``pos_prob`` is compared against it with ``bernoulli_js_dist``.

    Args:
        group: DataFrame with ``multiplier`` and ``pos_prob`` columns.

    Returns:
        The result of a row-wise ``DataFrame.apply``: one
        ``bernoulli_js_dist`` value per row of ``group``.

    Raises:
        IndexError: if the group contains no row with ``multiplier == 0``,
            so there is no baseline to compare against.
    """
    baseline_rows = group.loc[group["multiplier"] == 0, "pos_prob"]
    if baseline_rows.empty:
        # Same exception type the bare positional lookup would raise, but
        # with a message that names the actual problem.
        raise IndexError(
            "group has no row with multiplier == 0; "
            "cannot determine the baseline pos_prob"
        )
    baseline = baseline_rows.iloc[0]
    return group.apply(
        lambda row: bernoulli_js_dist(baseline, row["pos_prob"]),
        axis=1,
    )

# Group rows that agree on every key in `fields` (multiplier was removed from
# the keys), keeping only the two columns compute_js_div reads.
grouped = df.groupby(fields, as_index = False)[['multiplier', 'pos_prob']]
# compute_js_div yields one value per row of each group.
# NOTE(review): reset_index(level=0, drop=True) presumably strips the outer
# per-group index level added by apply(), leaving df's own row index so the
# assignment aligns row-for-row -- confirm against the installed pandas version.
df['js_div'] = grouped.apply(compute_js_div).reset_index(level=0, drop=True)
# Preview the frame including the new js_div column.
df.head()

In [None]:
# Narrow the analysis to a single training dataset, then report its size and
# preview it.
small_df = df.loc[df["train_dataset"] == "power-seeking-inclination"]
print(small_df.shape[0])
small_df.head()

In [None]:
k = 10

# Keep the first k *distinct* test_positive_example values (in order of first
# appearance; this is not a random sample). Slicing the raw column with [:k]
# would take the first k rows, which can repeat the same example and so yield
# fewer than k examples.
examples_to_keep = small_df.test_positive_example.drop_duplicates().head(k)
# .copy() makes the filtered frame independent of df, so the column added
# below does not trigger pandas' SettingWithCopyWarning.
small_df = small_df[small_df.test_positive_example.isin(examples_to_keep)].copy()
# Replace the long example text with a short categorical code (as a string so
# seaborn treats it as a discrete hue rather than a numeric scale).
small_df['example_index'] = small_df.test_positive_example.astype('category').cat.codes.astype(str)
print(len(small_df))

# Plot the JS divergence for each test_positive_example
import seaborn as sns
import matplotlib.pyplot as plt

sns.lineplot(data=small_df, x='multiplier', y='js_div', hue='example_index')

In [None]:
# Prepare data for plotting JS div against pos prob at multiplier zero:
# pull out the multiplier == 0 rows, keeping only the example id and pos_prob.
pos_prob_at_zero = small_df.loc[
    small_df["multiplier"] == 0,
    ["example_index", "pos_prob"],
]
print(len(pos_prob_at_zero))
# Join the baseline back onto the rows by example; the right-hand pos_prob
# column comes through as "pos_prob_at_zero".
small_df = small_df.merge(
    pos_prob_at_zero,
    on="example_index",
    suffixes=("", "_at_zero"),
)
small_df.head()

In [None]:

# JS divergence against the zero-multiplier pos_prob, one colour per example.
sns.scatterplot(
    data=small_df,
    x="pos_prob_at_zero",
    y="js_div",
    hue="example_index",
)

# 1. Plot steering efficiency


In [None]:
# Group by steering vectors and examples, but not multiplier
# Group by steering vector and example, but not by multiplier.
fields = [*config_fields, "test_positive_example"]
fields.remove("multiplier")

# Each group is one steering vector / example pair across all multipliers.
js_div_by_group = df.groupby(fields, as_index=False)["js_div"]
# Steering efficiency is the spread (max - min) of js_div within the group.
js_div_range = js_div_by_group.agg(lambda js: js.max() - js.min())
steering_efficiency_df = js_div_range.rename(
    columns={"js_div": "steering_efficiency"}
)

In [None]:
# Report the number of rows and preview the steering-efficiency frame.
print(steering_efficiency_df.shape[0])
steering_efficiency_df.head()

In [None]:
# For each dataset, take the mean of steering efficiency over all test_positive_examples
# Average steering efficiency per training dataset.
grouped = steering_efficiency_df.groupby("train_dataset", as_index=False)
# Distinct (train_dataset, multiplier, mean_pos_prob) rows, used to attach
# mean_pos_prob (and its multiplier) to the per-dataset averages.
pos_prob_lookup = df[["train_dataset", "multiplier", "mean_pos_prob"]].drop_duplicates()
mean_steering_efficiency = grouped["steering_efficiency"].mean().merge(
    pos_prob_lookup,
    on="train_dataset",
)
print(mean_steering_efficiency.columns)

# Smaller fonts for this plot.
sns.set_theme(font_scale=0.5)
sns.barplot(
    data=mean_steering_efficiency,
    x="steering_efficiency",
    y="train_dataset",
)

In [None]:
from repepo.steering.sweeps.constants import ALL_ABSTRACT_CONCEPT_DATASETS

# Call set_theme() with no arguments before drawing.
sns.set_theme()
# Same bar plot, limited to the datasets in ALL_ABSTRACT_CONCEPT_DATASETS.
abstract_only = mean_steering_efficiency.loc[
    mean_steering_efficiency["train_dataset"].isin(ALL_ABSTRACT_CONCEPT_DATASETS)
]
sns.barplot(
    data=abstract_only,
    x="steering_efficiency",
    y="train_dataset",
)

In [None]:
# Scatterplot of steering efficiency vs mean pos prob
limited_df = mean_steering_efficiency
# limited_df = mean_steering_efficiency[mean_steering_efficiency['train_dataset'].isin(ALL_ABSTRACT_CONCEPT_DATASETS)]

temp_df = limited_df[limited_df['multiplier'] == 0]

fig, ax = plt.subplots()
sns.scatterplot(data=temp_df, x='mean_pos_prob', y='steering_efficiency', hue='train_dataset', ax = ax)
# Plot linar regression line
import numpy as np
slope, intercept = np.polyfit(temp_df['mean_pos_prob'], temp_df['steering_efficiency'], 1)
x = np.linspace(0, 1, 100)
y = slope * x + intercept
sns.lineplot(x=x, y=y, ax = ax, color='orange')

ax.set_title("Steering efficiency vs mean pos prob")
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))