In [None]:
from pathlib import Path

# Root directory of experiment outputs. This is a relative path, so it is
# resolved against the kernel's working directory when it is used.
RESULTS_ROOT = Path("../../results")

In [None]:
import sys
sys.path.append("../..")

from experiments.aliases import REMEDI_EDITOR_LAYER, REMEDI_ENTITY_CLS_LAYER

# How Big Are Directions?

In [None]:
# Model / dataset pair analyzed in the norms section.
MODEL = "gptj"
DATASET = "biosbias"

# Name of the experiment directory under RESULTS_ROOT for this pair.
EXPERIMENT_NAME = f"post_icml_directions_{DATASET}_{MODEL}"
# Layer taken from the project's alias table for this model and dataset;
# it selects which per-layer dump is loaded below.
REMEDI_LAYER = REMEDI_EDITOR_LAYER[MODEL][DATASET]

In [None]:
import torch
from tqdm.auto import tqdm


def load_results(results_dir, layer):
    """Load the dump saved for a single layer.

    Args:
        results_dir: Path-like directory containing one sub-directory per layer.
        layer: Layer identifier; its string form is the sub-directory name.

    Returns:
        Whatever object was serialized to ``<results_dir>/<layer>/dump.pth``,
        with any tensors mapped onto the CPU.
    """
    dump_file = results_dir / str(layer) / "dump.pth"
    return torch.load(dump_file, map_location="cpu")


def load_results_by_layer(results_dir):
    """Load the dump of every layer found under ``results_dir``.

    Layer directories are discovered directly from the file system: every
    sub-directory whose name is a decimal integer is treated as a layer.
    (The previous version called a ``layer_dirs`` helper that is not defined
    in this notebook, so it raised ``NameError`` when invoked.)

    Args:
        results_dir: ``pathlib.Path`` containing one sub-directory per layer.

    Returns:
        Dict mapping the integer layer index to the object returned by
        ``load_results`` for that layer, in ascending layer order.
    """
    layer_paths = sorted(
        (
            path
            for path in results_dir.iterdir()
            if path.is_dir() and path.name.isdecimal()
        ),
        # Sort numerically so that "10" comes after "9".
        key=lambda path: int(path.name),
    )

    results_by_layer = {}
    for layer_dir in tqdm(layer_paths):
        layer = int(layer_dir.name)
        results_by_layer[layer] = load_results(results_dir, layer)
    return results_by_layer


# Dumps for this experiment live under the "linear" sub-directory; fail early
# if the directory is missing, then load the dump for the chosen layer.
results_dir = RESULTS_ROOT.joinpath(EXPERIMENT_NAME, "linear")
assert results_dir.exists()
results = load_results(results_dir, layer=REMEDI_LAYER)

In [None]:
from collections import defaultdict

# Per-layer lists of entity-representation norms, one entry per result, taken
# from the "h_entity_pre" and "h_entity_post" mappings of each result.
pre_entity_norms_by_layer = defaultdict(list)
post_entity_norms_by_layer = defaultdict(list)
norm_sinks = {
    "h_entity_pre": pre_entity_norms_by_layer,
    "h_entity_post": post_entity_norms_by_layer,
}
for example in results:
    for field, sink in norm_sinks.items():
        for layer_index, hidden in example[field].items():
            sink[layer_index].append(hidden.float().norm())

# Norm of each result's direction vector.
direction_norms = [
    example["direction"].float().detach().cpu().norm() for example in results
]

# From here on `post_entity_norms_by_layer` holds, per layer, the *difference*
# (post norm - pre norm) for each result rather than the raw post norm.
norm_change_by_layer = {}
for layer_index, post_norms in post_entity_norms_by_layer.items():
    norm_change_by_layer[layer_index] = [
        post_norm - pre_entity_norms_by_layer[layer_index][position]
        for position, post_norm in enumerate(post_norms)
    ]
post_entity_norms_by_layer = norm_change_by_layer

In [None]:
import numpy as np
import pandas as pd

# One row per layer: the mean pre-edit entity norm and the mean of the values
# stored in `post_entity_norms_by_layer` for that layer.
norm_rows = []
for layer_index in pre_entity_norms_by_layer:
    norm_rows.append(
        {
            "layer": layer_index,
            "Entity Rep": np.mean(pre_entity_norms_by_layer[layer_index]),
            "+REMEDI": np.mean(post_entity_norms_by_layer[layer_index]),
        }
    )
df = pd.DataFrame(norm_rows)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Display names used in figure text.
MODEL_PLOT_NAMES = {
    "gptj": "GPT-J",
    "gpt2": "GPT2",
    "gpt2-xl": "GPT2-XL"
}

DATASET_PLOT_NAMES = {
    "mcrae": "McRae",
    "counterfact": "CounterFact",
    "biosbias": "Bios",
}

sns.set()
sns.set_style({'font.family':'serif', 'font.serif':['Times New Roman']})

# Stacked bars per layer: the "Entity Rep" column with the "+REMEDI" column
# stacked on top of it.
df.plot(
    kind="bar",
    x="layer",
    stacked=True,
    color=['b', 'darkblue'],
    figsize=(8, 4),
    rot=0,
)

# Dotted horizontal reference line at the mean direction norm.
y0 = np.mean(direction_norms)
plt.plot(
    [0, len(pre_entity_norms_by_layer)],
    [y0, y0],
    linestyle='dotted',
    color='black',
)

plt.xlabel("Layer")
# Tick every fourth bar. The range is derived from the number of plotted
# layers instead of a hard-coded 28, so the axis stays correct for models
# with a different layer count.
plt.xticks(range(0, len(df), 4))
plt.ylabel("Norm")
# NOTE(review): labels are matched to artists positionally; this order
# presumably follows matplotlib's handle order (the line first, then the two
# bar series) - confirm against the rendered figure.
plt.legend(labels=["REMEDI Direction", "Entity", "+ Direction"])
plt.title(f"Rep. Norms ({DATASET_PLOT_NAMES[DATASET]})")
plt.tight_layout()
plt.savefig(f"norms_{MODEL}_{DATASET}.pdf")

# Visualize Directions

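Make a little 2-D projection plot of the McRae directions. The code below currently uses PCA; the t-SNE variant is kept as a commented-out alternative.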

In [None]:
# Switch to the McRae direction dumps. The experiment name is built from
# MODEL (mirroring how the norms section builds its name) so it cannot drift
# from the model used for the layer lookup on the next line.
EXPERIMENT_NAME = f"post_icml_directions_mcrae_{MODEL}"
REMEDI_LAYER = REMEDI_EDITOR_LAYER[MODEL]["mcrae"]
# Upper bound on the number of distinct labels collected for the projection.
LIMIT = 1000

In [None]:
# Here `results_dir` is the experiment root itself; the dumps are read from
# its "linear" sub-directory.
results_dir = RESULTS_ROOT.joinpath(EXPERIMENT_NAME)
results = load_results(results_dir.joinpath("linear"), layer=REMEDI_LAYER)

In [None]:
def mean_and_std(values):
    """Element-wise mean and standard deviation across a collection of tensors.

    Args:
        values: Iterable of torch tensors; each is detached, moved to the CPU
            and cast to float32 before being stacked into one numpy array.

    Returns:
        Tuple ``(mean, std)`` of numpy arrays reduced over the first
        (collection) axis.
    """
    stacked = np.array(
        [tensor.detach().cpu().float().numpy() for tensor in values]
    )
    return stacked.mean(axis=0), stacked.std(axis=0)

# Per-dimension statistics over all loaded results: one pair for the direction
# vectors and one for the pre-edit entity representations at REMEDI_LAYER.
d_mean, d_std = mean_and_std([record["direction"] for record in results])
e_mean, e_std = mean_and_std(
    [record["h_entity_pre"][REMEDI_LAYER] for record in results]
)

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE


def _collect_unique_standardized(records, label_of, tensor_of, mean, std, limit):
    """Gather standardized vectors from ``records``, one per distinct label.

    Args:
        records: Iterable of result entries.
        label_of: Callable returning the (hashable) label of an entry.
        tensor_of: Callable returning the torch tensor to project for an entry.
        mean: Per-dimension mean subtracted from every vector.
        std: Per-dimension standard deviation every vector is divided by.
        limit: Stop once this many labels have been collected. The check runs
            before each entry is inspected, so ``limit=0`` collects nothing.

    Returns:
        Tuple ``(labels, vectors)`` of parallel lists in first-seen order.
    """
    labels, vectors = [], []
    seen = set()
    for record in records:
        if len(labels) >= limit:
            break

        label = label_of(record)
        if label in seen:
            continue
        seen.add(label)

        # Same tensor -> numpy conversion that mean_and_std applies, so each
        # vector and its statistics agree on device and dtype.
        vector = tensor_of(record).detach().cpu().float().squeeze().numpy()
        labels.append(label)
        vectors.append((vector - mean) / std)
    return labels, vectors


# NOTE(review): the entity loop used to be guarded by `>= 0` with `LIMIT`
# commented out, which made it exit before collecting anything. That behavior
# is kept and made explicit here; set ENTITY_LIMIT = LIMIT to also project
# the entity representations.
ENTITY_LIMIT = 0

directions_labels, directions = _collect_unique_standardized(
    results[:100],
    label_of=lambda record: record["attribute"],
    tensor_of=lambda record: record["direction"],
    mean=d_mean,
    std=d_std,
    limit=LIMIT,
)
directions_seen = set(directions_labels)

entities_labels, entities = _collect_unique_standardized(
    results[:50],
    label_of=lambda record: record["entity"],
    tensor_of=lambda record: record["h_entity_pre"][REMEDI_LAYER],
    mean=e_mean,
    std=e_std,
    limit=ENTITY_LIMIT,
)
entities_seen = set(entities_labels)

# Directions first, then entities; the plotting cell relies on this order
# when it assigns colors.
labels = directions_labels + entities_labels
vectors = np.array(directions + entities)

xys = PCA(n_components=2).fit_transform(vectors)
# xys = TSNE(
#     n_components=2,
#     learning_rate='auto',
#     init='random',
#     perplexity=3,
# ).fit_transform(vectors)

In [None]:
# Split the projected points into coordinate lists; directions are drawn in
# blue and entities in green (directions come first in `xys`).
xs, ys = [], []
for x_coord, y_coord in xys:
    xs.append(x_coord)
    ys.append(y_coord)
cs = ['b'] * len(directions) + ['g'] * len(entities)

sns.set(font_scale=5)
sns.set_style({'font.family':'serif', 'font.serif':['Times New Roman']})

# Very large canvas so that the per-point text labels stay legible.
_, ax = plt.subplots(figsize=(100, 100))
ax.scatter(xs, ys, c=cs)
for feature, x_coord, y_coord in zip(labels, xs, ys):
    ax.annotate(feature, (x_coord, y_coord))
plt.tight_layout()
plt.savefig("mcrae_pca.pdf")