# Figures

----------------

We generate all the figures and statistics that appear in the paper here.

----------------

## 1) Setup

In [1]:
import numpy as np
import pandas as pd
import json
import pickle
import sys

sys.path.append("../../model")
from model_params import *

sys.path.append("../../data")
from dataset_params import *
from prompt_params import *
from demo_params import *

In [2]:
# Model identifiers; each is used as a key into MODEL_PARAMS and as a
# directory name under ../../results/.
models = ["gpt2_xl", "gpt_j", "gpt_neox_20B"]

In [3]:
# all datasets (natural and synthetic), in the order used throughout the notebook
datasets = [
    "sst2", "agnews", "trec", "dbpedia",
    "rte", "mrpc", "tweet_eval_hate", "sick",
    "poem_sentiment", "ethos", "financial_phrasebank", "medical_questions_pairs",
    "tweet_eval_stance_feminist", "tweet_eval_stance_atheism",
    "unnatural", "sst2_ab",
]

# non-synthetic tasks
_synthetic_datasets = ("unnatural", "sst2_ab")
datasets_main = [name for name in datasets if name not in _synthetic_datasets]

# datasets with 3 or more labels
_fewer_than_3_labels = (
    "sst2",
    "rte",
    "mrpc",
    "tweet_eval_hate",
    "ethos",
    "medical_questions_pairs",
    "sst2_ab",
)
datasets_3_plus = [name for name in datasets if name not in _fewer_than_3_labels]

In [4]:
# Demonstration-label settings; each is a directory name under the results tree.
settings = ["permuted_incorrect_labels", "half_permuted_incorrect_labels", "random_labels"]

## 2) Load Results

In [5]:
# Nested lookup: logit_lens[model][setting][dataset] -> object stored in the
# corresponding .npy file (unwrapped from its 0-d array with .item()).
# NOTE: allow_pickle=True unpickles the file contents, so only load result
# files that were produced by this project.
logit_lens = {}
for model in models:
    per_setting = logit_lens[model] = {}
    for setting in settings:
        per_dataset = per_setting[setting] = {}
        for dataset in datasets:
            stored = np.load(
                f"../../results/logit_lens/{model}/{setting}/{dataset}.npy",
                allow_pickle=True,
            )
            per_dataset[dataset] = stored.item()

In [6]:
# Nested lookup: attention[model][setting][dataset] -> DataFrame read from
# ../../results/attention/{model}/{dataset}.csv.
# NOTE(review): the CSV path does not include the setting, so every setting of
# a model holds the same data. Confirm this matches the on-disk layout.
attention = {}
dataset = "unnatural"
for model in models:
    # Read the file once per model instead of once per (model, setting); each
    # setting still gets its own independent DataFrame.
    model_frame = pd.read_csv(f"../../results/attention/{model}/{dataset}.csv")
    attention[model] = {}
    for setting in settings:
        attention[model][setting] = {dataset: model_frame.copy()}

## 3) Layerwise

### 3.1) SST-2 Prompt Formats

In [7]:
def save_prompt_formats(models, settings, metrics, datasets):
    """Write one set of layerwise CSVs per prompt format.

    For every (model, setting, metric, dataset) combination this iterates over
    the entries of ``logit_lens[model][setting][dataset][metric]`` and, for
    entry ``j``, writes three two-column CSVs (``layer``, ``p``) under
    ``../../results/figures/layerwise/{model}/{setting}/{metric}/``:

    * ``..._true_prefix.csv``  <- ``entry[0][0, :, -1]``
    * ``..._false_prefix.csv`` <- ``entry[0][1, :, -1]``
    * ``..._zero_shot.csv``    <- ``entry[0][0, :, 0]``

    The ``layer`` column runs over ``num_layers + 1`` values. The axis meanings
    (prefix type, layer, context position) are presumed from the output file
    names -- confirm against the code that produced the results.
    """
    for model in models:
        layer_ids = list(range(MODEL_PARAMS[model]["num_layers"] + 1))
        for setting in settings:
            for metric in metrics:
                for dataset in datasets:
                    for j, prompt in enumerate(logit_lens[model][setting][dataset][metric]):
                        curves = prompt[0]
                        # Build all three frames before touching the disk.
                        pending = [
                            (
                                pd.DataFrame({"layer": layer_ids, "p": curves[0, :, -1]}),
                                f"../../results/figures/layerwise/{model}/{setting}/{metric}/{dataset}_prompt_format_id_{j}_true_prefix.csv",
                            ),
                            (
                                pd.DataFrame({"layer": layer_ids, "p": curves[1, :, -1]}),
                                f"../../results/figures/layerwise/{model}/{setting}/{metric}/{dataset}_prompt_format_id_{j}_false_prefix.csv",
                            ),
                            (
                                pd.DataFrame({"layer": layer_ids, "p": curves[0, :, 0]}),
                                f"../../results/figures/layerwise/{model}/{setting}/{metric}/{dataset}_prompt_format_id_{j}_zero_shot.csv",
                            ),
                        ]
                        for frame, csv_path in pending:
                            frame.to_csv(csv_path, index=False)

In [8]:
# Per-prompt-format curves for SST-2 on GPT-J only.
save_prompt_formats(
    models=["gpt_j"],
    settings=["permuted_incorrect_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=["sst2"],
)

### 3.2) Average over Prompt Formats

In [9]:
def save_avg_over_pf_baseline(models, settings, metrics, datasets):
    """Write a flat per-dataset baseline curve for the layerwise figures.

    The baseline value is ``1 / n_labels``, where ``n_labels`` is the number of
    labels of the dataset's first prompt format
    (``PROMPT_PARAMS[dataset][0]["labels"]``). It is repeated for each of the
    ``num_layers + 1`` layer indices and saved as
    ``{dataset}_baseline.csv`` with columns ``layer`` and ``p``.
    """
    for model in models:
        n_layers = MODEL_PARAMS[model]["num_layers"] + 1
        layer_ids = list(range(n_layers))
        for setting in settings:
            for metric in metrics:
                for dataset in datasets:
                    chance = 1 / len(PROMPT_PARAMS[dataset][0]["labels"])
                    flat_curve = pd.DataFrame({"layer": layer_ids, "p": [chance] * n_layers})
                    flat_curve.to_csv(
                        f"../../results/figures/layerwise/{model}/{setting}/{metric}/{dataset}_baseline.csv",
                        index=False,
                    )

In [10]:
# Calibrated metric for every model; uncalibrated metric for GPT-J only.
save_avg_over_pf_baseline(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets,
)
save_avg_over_pf_baseline(
    models=["gpt_j"],
    settings=["permuted_incorrect_labels"],
    metrics=["correct_over_incorrect"],
    datasets=datasets,
)

In [11]:
def save_avg_over_pf(models, settings, metrics, datasets, save_tp, save_fp, save_zero_shot):
    """Write per-dataset layerwise curves averaged over prompt formats.

    ``logit_lens[model][setting][dataset][metric]`` is averaged over axis 0 and
    element 0 of the result is sliced into three curves over
    ``num_layers + 1`` layers:

    * true prefix:  ``avg[0, :, -1]``
    * false prefix: ``avg[1, :, -1]``
    * zero shot:    ``avg[0, :, 0]``

    Each curve is written as a ``layer``/``p`` CSV only when the matching flag
    (``save_tp``, ``save_fp``, ``save_zero_shot``) is truthy.
    """
    slices = ((0, -1), (1, -1), (0, 0))
    for model in models:
        layer_ids = list(range(MODEL_PARAMS[model]["num_layers"] + 1))
        for setting in settings:
            for metric in metrics:
                for dataset in datasets:
                    avg = np.mean(logit_lens[model][setting][dataset][metric], axis=0)[0]
                    tp_df, fp_df, zero_shot = [
                        pd.DataFrame({"layer": layer_ids, "p": avg[row, :, col]})
                        for row, col in slices
                    ]
                    outputs = (
                        (
                            save_tp,
                            tp_df,
                            f"../../results/figures/layerwise/{model}/{setting}/{metric}/{dataset}_true_prefix.csv",
                        ),
                        (
                            save_fp,
                            fp_df,
                            f"../../results/figures/layerwise/{model}/{setting}/{metric}/{dataset}_false_prefix.csv",
                        ),
                        (
                            save_zero_shot,
                            zero_shot,
                            f"../../results/figures/layerwise/{model}/{setting}/{metric}/{dataset}_zero_shot.csv",
                        ),
                    )
                    for enabled, frame, csv_path in outputs:
                        if enabled:
                            frame.to_csv(csv_path, index=False)

In [12]:
# All three curves for the fully permuted setting.
save_avg_over_pf(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets,
    save_tp=True,
    save_fp=True,
    save_zero_shot=True,
)
# Only the false-prefix curve for the other two settings.
save_avg_over_pf(
    models=models,
    settings=["half_permuted_incorrect_labels", "random_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets,
    save_tp=False,
    save_fp=True,
    save_zero_shot=False,
)
# Uncalibrated metric, GPT-J only.
save_avg_over_pf(
    models=["gpt_j"],
    settings=["permuted_incorrect_labels"],
    metrics=["correct_over_incorrect"],
    datasets=datasets,
    save_tp=True,
    save_fp=True,
    save_zero_shot=True,
)

### 3.3) Average over Datasets

In [13]:
def save_avg_over_datasets_baseline(models, settings, metrics, datasets):
    """Write a flat dataset-averaged baseline curve per (model, setting, metric).

    The default baseline is the mean of ``1 / n_labels`` over ``datasets``
    (``n_labels`` from ``PROMPT_PARAMS[dataset][0]["labels"]``). Two groups of
    metrics replace it with a fixed value:

    * ``"top_1_acc"`` and ``"label_space_probs"`` use ``1 / 50400``
      (presumably one over the vocabulary size -- confirm).
    * ``"cal_permute"`` uses ``1``.

    The value is repeated over ``num_layers + 1`` layers and saved as
    ``average_baseline.csv`` with columns ``layer`` and ``p``.
    """
    fixed_baselines = {
        "top_1_acc": 1 / 50400,
        "label_space_probs": 1 / 50400,
        "cal_permute": 1,
    }
    for model in models:
        n_layers = MODEL_PARAMS[model]["num_layers"] + 1
        for setting in settings:
            for metric in metrics:
                # Mean chance level, accumulated in dataset order.
                chance_total = 0
                for dataset in datasets:
                    chance_total += 1 / len(PROMPT_PARAMS[dataset][0]["labels"])
                baseline = chance_total / len(datasets)

                # Metric-specific overrides take precedence over the mean.
                baseline = fixed_baselines.get(metric, baseline)

                flat_curve = pd.DataFrame(
                    {"layer": list(range(n_layers)), "p": [baseline] * n_layers}
                )
                flat_curve.to_csv(
                    f"../../results/figures/layerwise/{model}/{setting}/{metric}/average_baseline.csv",
                    index=False,
                )

In [14]:
# Baselines for the main (non-synthetic) datasets, one call per metric group.
save_avg_over_datasets_baseline(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets_main,
)
save_avg_over_datasets_baseline(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["correct_over_incorrect", "top_1_acc", "cal_permute", "label_space_probs"],
    datasets=datasets_main,
)

In [15]:
def save_avg_over_datasets(models, settings, metrics, datasets, save_tp, save_fp, save_zero_shot):
    """Write layerwise curves averaged over prompt formats and then over datasets.

    For each dataset, ``logit_lens[model][setting][dataset][metric]`` is
    averaged over axis 0 and element 0 of the result is sliced into:

    * true prefix:  ``avg[0, :, -1]``
    * false prefix: ``avg[1, :, -1]``
    * zero shot:    ``avg[0, :, 0]``

    For the ``"cal_permute"`` metric each curve is multiplied by the dataset's
    number of labels (``len(PROMPT_PARAMS[dataset][0]["labels"])``) before
    averaging. The curves are then averaged over ``datasets`` and written as
    ``average_{true_prefix,false_prefix,zero_shot}.csv`` (columns ``layer``,
    ``p``), each only when its flag is truthy.
    """
    for model in models:
        n_layers = MODEL_PARAMS[model]["num_layers"] + 1
        layer_ids = list(range(n_layers))
        for setting in settings:
            for metric in metrics:
                tp_avg = np.zeros(n_layers)
                fp_avg = np.zeros(n_layers)
                zero_shot_avg = np.zeros(n_layers)
                for dataset in datasets:
                    avg = np.mean(logit_lens[model][setting][dataset][metric], axis=0)[0]
                    tp = avg[0, :, -1]
                    fp = avg[1, :, -1]
                    zs = avg[0, :, 0]
                    if metric == "cal_permute":
                        n_labels = len(PROMPT_PARAMS[dataset][0]["labels"])
                        # Scale out of place: tp/fp/zs are views into `avg`, and
                        # when the last axis has length 1 `tp` and `zs` share
                        # memory, so an in-place `*=` would scale them twice.
                        tp = tp * n_labels
                        fp = fp * n_labels
                        zs = zs * n_labels
                    tp_avg += tp
                    fp_avg += fp
                    zero_shot_avg += zs
                tp_avg /= len(datasets)
                fp_avg /= len(datasets)
                zero_shot_avg /= len(datasets)
                tp_df = pd.DataFrame({"layer": layer_ids, "p": tp_avg})
                fp_df = pd.DataFrame({"layer": layer_ids, "p": fp_avg})
                zero_shot = pd.DataFrame({"layer": layer_ids, "p": zero_shot_avg})
                if save_tp:
                    tp_df.to_csv(
                        f"../../results/figures/layerwise/{model}/{setting}/{metric}/average_true_prefix.csv",
                        index=False,
                    )
                if save_fp:
                    fp_df.to_csv(
                        f"../../results/figures/layerwise/{model}/{setting}/{metric}/average_false_prefix.csv",
                        index=False,
                    )
                if save_zero_shot:
                    zero_shot.to_csv(
                        f"../../results/figures/layerwise/{model}/{setting}/{metric}/average_zero_shot.csv",
                        index=False,
                    )

In [16]:
# All three curves for the fully permuted setting.
save_avg_over_datasets(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets_main,
    save_tp=True,
    save_fp=True,
    save_zero_shot=True,
)
# Only the false-prefix curve for the other two settings.
save_avg_over_datasets(
    models=models,
    settings=["half_permuted_incorrect_labels", "random_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets_main,
    save_tp=False,
    save_fp=True,
    save_zero_shot=False,
)
# Remaining metrics for the fully permuted setting.
save_avg_over_datasets(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["correct_over_incorrect", "top_1_acc", "cal_permute", "label_space_probs"],
    datasets=datasets_main,
    save_tp=True,
    save_fp=True,
    save_zero_shot=True,
)

## 4) Contextwise

### 4.1) Accuracy Gap

#### 4.1.1) Average over Prompt Formats

In [17]:
def save_acc_gap_avg_over_pf(models, settings, metrics, datasets):
    """Write the per-dataset accuracy gap as a function of context position.

    ``logit_lens[model][setting][dataset][metric]`` is averaged over axis 0;
    from element 0 of the result the gap is ``avg[0, -1, :] - avg[1, -1, :]``
    (row 0 minus row 1 at the last index of the middle axis). The gap must have
    ``MODEL_PARAMS[model]["max_demos"]`` entries; it is saved with columns
    ``pic`` (``0 .. max_demos - 1``) and ``p``.

    NOTE: the output path does not contain ``metric``. If more than one metric
    is passed, each later metric overwrites the file written for the previous
    one.
    """
    for model in models:
        context_ids = list(range(MODEL_PARAMS[model]["max_demos"]))
        for setting in settings:
            for metric in metrics:
                for dataset in datasets:
                    avg = np.mean(logit_lens[model][setting][dataset][metric], axis=0)[0]
                    acc_gap = avg[0, -1, :] - avg[1, -1, :]
                    acc_gap_df = pd.DataFrame({"pic": context_ids, "p": acc_gap})
                    acc_gap_df.to_csv(
                        f"../../results/figures/contextwise/{model}/{setting}/acc_gap/{dataset}.csv",
                        index=False,
                    )

In [18]:
save_acc_gap_avg_over_pf(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets_main,
)

#### 4.1.2) Average over Datasets

In [19]:
def save_acc_gap_avg_over_datasets(models, settings, metrics, datasets):
    """Write the accuracy gap over context positions, averaged over datasets.

    For each dataset the gap is ``avg[0, -1, :] - avg[1, -1, :]``, where
    ``avg`` is element 0 of the axis-0 mean of
    ``logit_lens[model][setting][dataset][metric]``. Gaps are summed over
    ``datasets``, divided by ``len(datasets)`` and saved as ``average.csv``
    with columns ``pic`` (``0 .. max_demos - 1``) and ``p``.

    NOTE: the output path does not contain ``metric``. If more than one metric
    is passed, each later metric overwrites the file written for the previous
    one.
    """
    for model in models:
        n_demos = MODEL_PARAMS[model]["max_demos"]
        for setting in settings:
            for metric in metrics:
                acc_gap = np.zeros(n_demos)
                for dataset in datasets:
                    avg = np.mean(logit_lens[model][setting][dataset][metric], axis=0)[0]
                    acc_gap += avg[0, -1, :] - avg[1, -1, :]
                acc_gap /= len(datasets)
                acc_gap_df = pd.DataFrame({"pic": list(range(n_demos)), "p": acc_gap})
                acc_gap_df.to_csv(
                    f"../../results/figures/contextwise/{model}/{setting}/acc_gap/average.csv",
                    index=False,
                )

In [20]:
save_acc_gap_avg_over_datasets(
    models=models,
    settings=["permuted_incorrect_labels"],
    metrics=["cal_correct_over_incorrect"],
    datasets=datasets_main,
)

### 4.2) Permute Score

#### 4.2.1) Average over Prompt Formats

In [21]:
def save_permute_score_avg_over_pf(models, settings, datasets):
    """Write the per-dataset permute score as a function of context position.

    Uses the ``"cal_permute"`` metric only. ``avg`` is element 0 of the axis-0
    mean of ``logit_lens[model][setting][dataset]["cal_permute"]``; the score
    is ``avg[1, -1, :]`` multiplied by the dataset's number of labels
    (``len(PROMPT_PARAMS[dataset][0]["labels"])``). It is saved with columns
    ``pic`` (``0 .. max_demos - 1``) and ``p``.
    """
    metric = "cal_permute"
    for model in models:
        context_ids = list(range(MODEL_PARAMS[model]["max_demos"]))
        for setting in settings:
            for dataset in datasets:
                avg = np.mean(logit_lens[model][setting][dataset][metric], axis=0)[0]
                permute_score = avg[1, -1, :] * len(PROMPT_PARAMS[dataset][0]["labels"])
                permute_score_df = pd.DataFrame({"pic": context_ids, "p": permute_score})
                permute_score_df.to_csv(
                    f"../../results/figures/contextwise/{model}/{setting}/permute_score/{dataset}.csv",
                    index=False,
                )

In [22]:
# Permute score is only produced for GPT-J on datasets with 3 or more labels.
save_permute_score_avg_over_pf(
    models=["gpt_j"],
    settings=["permuted_incorrect_labels"],
    datasets=datasets_3_plus,
)

#### 4.2.2) Average over Datasets

In [23]:
def save_permute_score_avg_over_datasets(models, settings, datasets):
    """Write the permute score over context positions, averaged over datasets.

    Uses the ``"cal_permute"`` metric only. For each dataset the score is
    ``avg[1, -1, :]`` times the dataset's number of labels, where ``avg`` is
    element 0 of the axis-0 mean of
    ``logit_lens[model][setting][dataset]["cal_permute"]``. Scores are summed
    over ``datasets``, divided by ``len(datasets)`` and saved as
    ``average.csv`` with columns ``pic`` (``0 .. max_demos - 1``) and ``p``.
    """
    metric = "cal_permute"
    for model in models:
        n_demos = MODEL_PARAMS[model]["max_demos"]
        for setting in settings:
            permute_score = np.zeros(n_demos)
            for dataset in datasets:
                avg = np.mean(logit_lens[model][setting][dataset][metric], axis=0)[0]
                permute_score += avg[1, -1, :] * len(PROMPT_PARAMS[dataset][0]["labels"])
            permute_score /= len(datasets)
            permute_score_df = pd.DataFrame({"pic": list(range(n_demos)), "p": permute_score})
            permute_score_df.to_csv(
                f"../../results/figures/contextwise/{model}/{setting}/permute_score/average.csv",
                index=False,
            )

In [24]:
save_permute_score_avg_over_datasets(
    models=["gpt_j"],
    settings=["permuted_incorrect_labels"],
    datasets=datasets_3_plus,
)

## 5) Gap Appearance

see notebooks/analysis/logit_lens

**gpt2_xl** \
Layer: 22. Dataset: sst2. \
Layer: 22. Dataset: poem_sentiment. \
Layer: 23. Dataset: financial_phrasebank. \
Layer: 23. Dataset: ethos. \
Layer: 21. Dataset: tweet_eval_hate. \
Layer: 42. Dataset: tweet_eval_stance_atheism. \
Layer: 24. Dataset: tweet_eval_stance_feminist. \
Layer: 03. Dataset: medical_questions_pairs. \
Layer: 03. Dataset: mrpc. \
Layer: 21. Dataset: sick. \
Layer: 20. Dataset: rte. \
Layer: 23. Dataset: agnews. \
Layer: 22. Dataset: trec. \
Layer: 24. Dataset: dbpedia.

**gpt_j** \
Layer: 14. Dataset: sst2. \
Layer: 14. Dataset: poem_sentiment. \
Layer: 14. Dataset: financial_phrasebank. \
Layer: 14. Dataset: ethos. \
Layer: 14. Dataset: tweet_eval_hate. \
Layer: 14. Dataset: tweet_eval_stance_atheism. \
Layer: 14. Dataset: tweet_eval_stance_feminist. \
Layer: 13. Dataset: medical_questions_pairs. \
Layer: 13. Dataset: mrpc. \
Layer: 13. Dataset: sick. \
Layer: 14. Dataset: rte. \
Layer: 17. Dataset: agnews. \
Layer: 14. Dataset: trec. \
Layer: 11. Dataset: dbpedia.

**gpt_neox** \
Layer: 10. Dataset: sst2. \
Layer: 12. Dataset: poem_sentiment. \
Layer: 10. Dataset: financial_phrasebank. \
Layer: 10. Dataset: ethos. \
Layer: 10. Dataset: tweet_eval_hate. \
Layer: 10. Dataset: tweet_eval_stance_atheism. \
Layer: 10. Dataset: tweet_eval_stance_feminist. \
Layer: 13. Dataset: medical_questions_pairs. \
Layer: 04. Dataset: mrpc. \
Layer: 10. Dataset: sick. \
Layer: 02. Dataset: rte. \
Layer: 10. Dataset: agnews. \
Layer: 10. Dataset: trec. \
Layer: 12. Dataset: dbpedia.

## 6) Early Exiting

see notebooks/analysis/logit_lens

We identified the following critical layers:

**gpt2_xl** \
Layer 30

**gpt_j** \
Layer 16

**gpt_neox** \
Layer 32

### 6.1) Critical Layer Early Exiting Success

see notebooks/analysis/logit_lens

The number of datasets where early exiting is better than the final layer.

**gpt2_xl** \
\# of Datasets: 10

**gpt_j** \
\# of Datasets: 14

**gpt_neox** \
\# of Datasets: 10

### 6.2) Critical Layer Early Exiting Performance

see notebooks/analysis/logit_lens

The performance difference between early exiting and full evaluation.

Setting: **permuted_incorrect_labels** \
Model: **gpt2_xl** \
True Prefix Delta: -0.517% \
False Prefix Delta: 2.658% \
Model: **gpt_j** \
True Prefix Delta: -1.538% \
False Prefix Delta: 10.299% \
Model: **gpt_neox** \
True Prefix Delta: 0.477% \
False Prefix Delta: 0.869%

Setting: **half_permuted_incorrect_labels** \
Model: **gpt2_xl** \
True Prefix Delta: 0.070% \
False Prefix Delta: 1.573% \
Model: **gpt_j** \
True Prefix Delta: -2.544% \
False Prefix Delta: 2.607% \
Model: **gpt_neox** \
True Prefix Delta: 0.033% \
False Prefix Delta: 0.150%

Setting: **random_labels** \
Model: **gpt2_xl** \
True Prefix Delta: 0.209% \
False Prefix Delta: 2.329% \
Model: **gpt_j** \
True Prefix Delta: -1.931% \
False Prefix Delta: 4.604% \
Model: **gpt_neox** \
True Prefix Delta: 0.616% \
False Prefix Delta: 1.320%

## 7) Attention

### 7.1) Layerwise Percent of Final Gap

In [25]:
def save_perc_of_gap_avg_over_datasets(models, settings, datasets):
    """Write, per layer, the true/false-prefix gap as a fraction of the final-layer gap.

    Uses the ``"cal_correct_over_incorrect"`` metric only. For each dataset,
    ``avg`` is element 0 of the axis-0 mean of the stored results; the curves
    ``avg[0, :, -1]`` and ``avg[1, :, -1]`` are averaged over ``datasets``.
    The value written for layer ``i`` is ``gap[i] / gap[-1]``, where ``gap`` is
    the difference of the two averaged curves. Output columns are ``layer``
    (``0 .. num_layers``) and ``p``.
    """
    metric = "cal_correct_over_incorrect"
    for model in models:
        n_layers = MODEL_PARAMS[model]["num_layers"] + 1
        for setting in settings:
            tp_avg = np.zeros(n_layers)
            fp_avg = np.zeros(n_layers)
            for dataset in datasets:
                avg = np.mean(logit_lens[model][setting][dataset][metric], axis=0)[0]
                tp_avg += avg[0, :, -1]
                fp_avg += avg[1, :, -1]
            tp_avg /= len(datasets)
            fp_avg /= len(datasets)

            # Gap at every layer, then each layer relative to the last one.
            layer_gap = tp_avg - fp_avg
            percent_of_final_gap = [gap / layer_gap[-1] for gap in layer_gap]

            pofg = pd.DataFrame({"layer": list(range(n_layers)), "p": percent_of_final_gap})
            pofg.to_csv(
                f"../../results/figures/attention/{model}/{setting}/percent_of_final_gap.csv",
                index=False,
            )

In [26]:
save_perc_of_gap_avg_over_datasets(
    models=models,
    settings=["permuted_incorrect_labels"],
    datasets=datasets_main,
)

### 7.2) Sum of PM Score

In [27]:
def save_sum_of_pm_score_avg_over_pf(models, settings, datasets):
    """Write, per dataset, the per-layer sum of ``cfs_lab_prime`` over attention heads.

    Only rows with the maximum ``demo_indx`` in
    ``attention[model][setting][dataset]`` are used. For every layer index
    ``0 .. num_layers - 1`` and every head index ``0 .. max(head_indx)``, the
    ``cfs_lab_prime`` values of the first two matching rows are added to that
    layer's total (an ``IndexError`` is raised if a head has fewer than two
    rows). The totals are saved as ``{dataset}_sum_of_pm_scores.csv`` with
    columns ``layer`` and ``p``.

    The totals are reset for every dataset, so each CSV contains only that
    dataset's scores.
    """
    for model in models:
        n_layers = MODEL_PARAMS[model]["num_layers"]
        for setting in settings:
            for dataset in datasets:
                # Start from zero for each dataset; otherwise the file written
                # for a later dataset would also contain earlier datasets' sums.
                layer_scores = [0] * n_layers
                df = pd.DataFrame(attention[model][setting][dataset])
                df = df[df["demo_indx"] == df["demo_indx"].max()]
                # The head count is the same for every layer; compute it once.
                n_heads = df["head_indx"].max() + 1
                for i in range(n_layers):
                    df_layer = df[df["layer_indx"] == i]
                    for j in range(n_heads):
                        head_scores = df_layer.loc[df_layer["head_indx"] == j, "cfs_lab_prime"]
                        layer_scores[i] += head_scores.iloc[0]
                        layer_scores[i] += head_scores.iloc[1]
                pm_score = pd.DataFrame({"layer": list(range(n_layers)), "p": layer_scores})
                pm_score.to_csv(
                    f"../../results/figures/attention/{model}/{setting}/{dataset}_sum_of_pm_scores.csv",
                    index=False,
                )

In [28]:
# Attention results are only loaded for the "unnatural" dataset.
save_sum_of_pm_score_avg_over_pf(
    models=models,
    settings=["permuted_incorrect_labels"],
    datasets=["unnatural"],
)

## 8) Ablations

### 8.1) {Attention, MLP, Late Layer} Ablation

See section 2 in notebooks/analysis/ablations.

### 8.2) Head Ablation

See section 3 in notebooks/analysis/ablations.