In [None]:
# List the contents of the results directory.
# NOTE(review): this lists ../results, while RESULTS_ROOT in the next code cell
# points at ../../results -- confirm which of the two paths is intended.
!ls ../results

# Generation Tasks: Performance by Layer

In [None]:
import json
from pathlib import Path

import pandas as pd
from scipy.stats import hmean

# Root directory of the sweep outputs. The path is relative, so it resolves
# against the notebook's current working directory.
# NOTE(review): the first cell lists ../results rather than ../../results --
# confirm which of the two paths is intended.
RESULTS_ROOT = Path("../../results")


def load_json(file):
    """Read and decode a JSON file.

    Args:
        file: Location of the JSON file, either a ``pathlib.Path`` or anything
            ``Path()`` accepts (for example a ``str``).

    Returns:
        The decoded JSON value (for the metrics files read in this notebook,
        a dict).

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is not UTF-8 everywhere.
    with Path(file).open("r", encoding="utf-8") as handle:
        return json.load(handle)


def layer_dirs(results_dir):
    """Return the per-layer subdirectories of ``results_dir`` in numeric order.

    Only directories whose name parses as an integer are returned. Anything
    else (stray files such as ``.DS_Store``, or non-numeric subdirectories) is
    skipped instead of aborting the whole listing with a ``ValueError``.

    Args:
        results_dir: ``pathlib.Path`` of the directory that contains one
            subdirectory per layer, each named by its layer number.

    Returns:
        A list of ``Path`` objects sorted by ascending integer name, so that
        e.g. "2" comes before "10".
    """
    numbered = []
    for entry in results_dir.iterdir():
        if not entry.is_dir():
            continue
        try:
            index = int(entry.name)
        except ValueError:
            # Not a layer directory; ignore it.
            continue
        numbered.append((index, entry))

    # Sort on the integer only; the sort is stable, so entries that parse to
    # the same number keep their iteration order.
    numbered.sort(key=lambda pair: pair[0])
    return [entry for _, entry in numbered]


def load_gen_metrics(model, dataset):
    """Build a per-layer table of generation metrics for one sweep.

    Metrics are read from
    ``RESULTS_ROOT / "post_icml_sweep_gen_{dataset}_{model}"``; for
    ``"biosbias"`` the ``contextual`` subdirectory of that folder is used.
    Each directory returned by ``layer_dirs(<sweep dir> / "linear")``
    contributes one row.

    Args:
        model: Model key embedded in the results directory name.
        dataset: Either ``"counterfact"`` or ``"biosbias"``.

    Returns:
        A ``pandas.DataFrame`` with one row per layer. Columns for
        counterfact: ``layer``, ``efficacy``, ``consistency``, ``fluency``,
        ``essence``, ``score``. Columns for biosbias: ``layer``,
        ``top1_accuracy``, ``consistency``, ``fluency``, ``score``. In both
        cases ``score`` is the harmonic mean of the second column and
        ``fluency``.

    Raises:
        AssertionError: If the sweep directory does not exist, or if
            ``dataset`` is neither of the two supported values (checked while
            reading the first layer).
    """
    sweep_dir = RESULTS_ROOT / f"post_icml_sweep_gen_{dataset}_{model}"
    assert sweep_dir.exists()

    if dataset == "biosbias":
        sweep_dir = sweep_dir / "contextual"

    records = []
    for layer_dir in layer_dirs(sweep_dir / "linear"):
        layer = int(layer_dir.name)
        if dataset == "counterfact":
            # All four files are read, in this order, for every layer.
            efficacy, paraphrase, generation, essence = [
                load_json(layer_dir / filename)
                for filename in (
                    "efficacy_metrics.json",
                    "paraphrase_metrics.json",
                    "generation_metrics.json",
                    "essence_metrics.json",
                )
            ]

            # NOTE(review): the "efficacy" column (and therefore "score") is
            # taken from the paraphrase metrics; the loaded `efficacy` payload
            # is never read. Confirm whether this is intentional.
            efficacy_mean = paraphrase["score"]["mean"]
            fluency_mean = generation["fluency"]["mean"]
            records.append({
                "layer": layer,
                "efficacy": efficacy_mean,
                "consistency": generation["consistency"]["mean"],
                "fluency": fluency_mean,
                "essence": essence["essence"]["mean"],
                "score": hmean([efficacy_mean, fluency_mean])
            })
        else:
            assert dataset == "biosbias"
            correction = load_json(layer_dir / "error_correction_metrics.json")
            accuracy = correction["top1_accuracy"]
            fluency_mean = correction["fluency"]["mean"]
            records.append({
                "layer": layer,
                "top1_accuracy": accuracy,
                "consistency": correction["consistency"]["mean"],
                "fluency": fluency_mean,
                "score": hmean([accuracy, fluency_mean])
            })
    return pd.DataFrame(records)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Display names for models, used in plot titles. Keys are the model
# identifiers passed to the plotting functions in this notebook.
MODEL_PLOT_NAMES = {
    "gptj": "GPT-J",
    "gpt2": "GPT2",
    "gpt2-xl": "GPT2-XL",
}

# Display names for datasets, used in plot titles. Keys are the dataset
# identifiers passed to the plotting functions in this notebook.
DATASET_PLOT_NAMES = {
    "counterfact": "CounterFact",
    "biosbias": "Bios",
}

def plot_gen_scores_by_layer(model, dataset):
    """Draw a bar chart of the generation ``score`` for every layer.

    The table comes from ``load_gen_metrics(model, dataset)``. Bars that reach
    the highest score are drawn in dark green, all others in green. The figure
    is (5, 2) for ``"gpt2"`` and (10, 4) for every other model, and for
    ``"gpt2-xl"`` the x ticks are placed at every fourth position from 0 to 44.

    Seaborn settings (``sns.set`` / ``sns.set_style``) are re-applied on every
    call.

    NOTE(review): the y-axis label reads "Fl. / Eff." for both datasets, while
    for biosbias ``load_gen_metrics`` builds the score from top-1 accuracy and
    fluency -- confirm the label is intended for that case.

    Args:
        model: Key into ``MODEL_PLOT_NAMES``.
        dataset: Key into ``DATASET_PLOT_NAMES``.
    """
    per_layer = load_gen_metrics(model, dataset)

    figure_size = (5, 2) if model == "gpt2" else (10, 4)
    sns.set(rc={"figure.figsize": figure_size})
    sns.set_style({'font.family':'serif', 'font.serif':['Times New Roman']})

    # One colour per bar, in row order.
    scores = list(per_layer.score)
    top_score = max(scores, default=None)
    bar_colors = []
    for value in scores:
        bar_colors.append("g" if value < top_score else "darkgreen")

    sns.barplot(x="layer", y="score", data=per_layer, palette=bar_colors)

    title = f"REMEDI Performance on {DATASET_PLOT_NAMES[dataset]}/{MODEL_PLOT_NAMES[model]}"
    plt.title(title)
    plt.ylabel("H-Mean of Fl. / Eff.")
    plt.xlabel("Layer")
    if model == "gpt2-xl":
        # Thin out the tick labels for this model.
        plt.xticks(range(0, 48, 4))
# Generation score by layer: GPT-J on CounterFact.
plot_gen_scores_by_layer("gptj", "counterfact")

In [None]:
# Generation score by layer: GPT-J on Bios.
plot_gen_scores_by_layer("gptj", "biosbias")

In [None]:
# Generation score by layer: GPT2-XL on CounterFact.
plot_gen_scores_by_layer("gpt2-xl", "counterfact")

In [None]:
# Generation score by layer: GPT2-XL on Bios.
plot_gen_scores_by_layer("gpt2-xl", "biosbias")

In [None]:
# Generation score by layer: GPT2 on CounterFact.
plot_gen_scores_by_layer("gpt2", "counterfact")

In [None]:
# Generation score by layer: GPT2 on Bios.
plot_gen_scores_by_layer("gpt2", "biosbias")

# Classification Tasks: Performance by Layer

In [None]:
import re

# Matches metrics file names of the form "<prefix>_layer_<N>_metrics.json",
# where <prefix> consists of ASCII letters and underscores, and captures N as
# group 1. The extension dot is escaped and the pattern is anchored at the end,
# so names like "x_layer_3_metrics.json.bak" are rejected when used with
# `.match()`.
METRICS_FILE_PATTERN = re.compile(r"[a-zA-Z_]+_layer_(\d+)_metrics\.json$")

def get_editor_layer_dir(results_dir):
    """Return the first layer directory under ``results_dir / "linear"``.

    "First" is the first entry returned by ``layer_dirs``, i.e. the directory
    with the smallest integer name.

    Args:
        results_dir: ``pathlib.Path`` of a sweep's results directory.

    Returns:
        The ``Path`` of the lowest-numbered layer directory.

    Raises:
        FileNotFoundError: If ``results_dir / "linear"`` contains no layer
            directories. A bare ``StopIteration`` would be harder to diagnose.
    """
    linear_dir = results_dir / "linear"
    first = next(iter(layer_dirs(linear_dir)), None)
    if first is None:
        raise FileNotFoundError(f"no layer directories found in {linear_dir}")
    return first

def load_cls_metrics(model, dataset):
    """Build a per-layer table of classification metrics for one sweep.

    Looks in ``RESULTS_ROOT / "post_icml_sweep_cls_{dataset}_{model}"``, takes
    the directory chosen by ``get_editor_layer_dir`` and reads every file in it
    whose name matches ``METRICS_FILE_PATTERN``. The number captured from the
    file name becomes the ``layer`` value of that file's row.

    Args:
        model: Model key embedded in the results directory name.
        dataset: Either ``"counterfact"`` or ``"biosbias"``. For counterfact
            the values are read from the ``"contextual"`` entry of each file;
            for biosbias they are read from the top level.

    Returns:
        A ``pandas.DataFrame`` with columns ``layer``, ``f1`` and ``mcc``,
        sorted by ascending ``layer``.

    Raises:
        AssertionError: If the sweep directory does not exist, or if
            ``dataset`` is neither of the two supported values (checked while
            building the first row).
    """
    sweep_dir = RESULTS_ROOT / f"post_icml_sweep_cls_{dataset}_{model}"
    assert sweep_dir.exists()

    # Index the parsed metrics files by the layer number in their file name.
    payload_by_layer = {}
    for path in get_editor_layer_dir(sweep_dir).iterdir():
        found = METRICS_FILE_PATTERN.match(path.name)
        if found:
            layer_number = int(found.group(1))
            payload_by_layer[layer_number] = load_json(path)

    records = []
    for layer_number in sorted(payload_by_layer):
        payload = payload_by_layer[layer_number]
        if dataset == "counterfact":
            values = payload["contextual"]
        else:
            assert dataset == "biosbias"
            values = payload
        records.append({
            "layer": layer_number,
            "f1": values["f1"],
            "mcc": values["mcc"],
        })
    return pd.DataFrame(records)

# Preview the classification metrics table for GPT-J on CounterFact; as the
# last expression in the cell, the DataFrame is shown as the cell output.
load_cls_metrics("gptj", "counterfact")

In [None]:
def plot_cls_scores_by_layer(model, dataset, metric="mcc"):
    """Draw a bar chart of one classification metric for every layer.

    The table comes from ``load_cls_metrics(model, dataset)``. Bars that reach
    the highest value of the plotted ``metric`` are drawn in dark green, all
    others in green. The figure is (5, 2) for ``"gpt2"`` and (10, 4) for every
    other model, and for ``"gpt2-xl"`` the x ticks are placed at every fourth
    position from 0 to 44.

    Seaborn settings (``sns.set`` / ``sns.set_style``) are re-applied on every
    call.

    Args:
        model: Key into ``MODEL_PLOT_NAMES``.
        dataset: Key into ``DATASET_PLOT_NAMES``.
        metric: Name of the column to plot, ``"f1"`` or ``"mcc"``. It is also
            used, upper-cased, as the y-axis label.
    """
    metrics = load_cls_metrics(model, dataset)

    sns.set(rc={"figure.figsize": (10, 4) if model != "gpt2" else (5, 2)})
    sns.set_style({'font.family':'serif', 'font.serif':['Times New Roman']})

    # Highlight the best layer according to the metric that is actually
    # plotted; always using F1 here would mark the wrong bar for metric="mcc".
    values = list(metrics[metric])
    best = max(values, default=None)
    colors = ["g" if value < best else "darkgreen" for value in values]
    sns.barplot(
        x="layer",
        y=metric,
        data=metrics,
        palette=colors,
    )
    plt.title(f"REMEDI Performance on {DATASET_PLOT_NAMES[dataset]}/{MODEL_PLOT_NAMES[model]}")
    plt.ylabel(metric.upper())
    plt.xlabel("Layer")
    if model == "gpt2-xl":
        # Thin out the tick labels for this model.
        plt.xticks(range(0, 48, 4))

# Classification F1 by layer: GPT-J on CounterFact.
plot_cls_scores_by_layer("gptj", "counterfact", metric="f1")

In [None]:
# Classification F1 by layer: GPT2-XL on CounterFact.
plot_cls_scores_by_layer("gpt2-xl", "counterfact", metric="f1")

In [None]:
# Classification F1 by layer: GPT2 on CounterFact.
plot_cls_scores_by_layer("gpt2", "counterfact", metric="f1")

In [None]:
# Classification F1 by layer: GPT-J on Bios.
plot_cls_scores_by_layer("gptj", "biosbias", metric="f1")

In [None]:
# Classification F1 by layer: GPT2-XL on Bios.
plot_cls_scores_by_layer("gpt2-xl", "biosbias", metric="f1")

In [None]:
# Classification F1 by layer: GPT2 on Bios.
plot_cls_scores_by_layer("gpt2", "biosbias", metric="f1")