# Get accuracy and consistency 
For GPT2-XL, LLaMA and Atlas-base.

In [4]:
import json
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
import math

from experiments.causal_trace import plot_trace_heatmap

## Load the data

In [55]:
# Relation identifiers used throughout the notebook. Each entry is interpolated
# into the prediction file names and must match a key of `atlas_base_files`
# (the original "740" entry lacked the "P" prefix and matched neither).
relations = ["P19", "P20", "P27", "P101", "P495", "P740", "P1376"]

### GPT-2 XL and LLaMA-7B

In [None]:
# Path templates of the per-relation prediction files, keyed by the value that
# is stored in the "model" column. Insertion order fixes the row order:
# all GPT-2 XL rows first, then all LLaMA-7B rows.
pred_path_templates = {
    "gpt2": "/cephyr/users/lovhag/Alvis/projects/rome/data/{relation}_gpt2_xl_preds.jsonl",
    "llama7b": "/cephyr/users/lovhag/Alvis/projects/rome/data/results/llama7B/{relation}_Llama_2_7b_hf_preds.jsonl",
}

# Collect one dict per JSONL line and build the frame once at the end.
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every call, which made the original row-by-row loading quadratic.
pred_records = []
for model_name, path_template in pred_path_templates.items():
    for relation in relations:
        with open(path_template.format(relation=relation), encoding="utf-8") as f:
            for line in f:
                tmp_line = json.loads(line)
                tmp_line["model"] = model_name
                tmp_line["relation"] = relation
                pred_records.append(tmp_line)

data = pd.DataFrame(pred_records)
data.head()

### Atlas-base
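Only read the results whose templates are compatible with autoregressive models (ARMs), i.e. templates in which the `[Y]` mask comes last (optionally followed by a period).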

> Comment: We could also load results with confidences given, but these are only across the options and not comparable to the confidences of GPT2-XL or LLaMA.

In [12]:
def template_ends_with_mask(template):
    """Return True if `template` ends with the "[Y]" mask.

    All periods are removed and surrounding whitespace is stripped before the
    check, so "... is [Y]." and "... is [Y] ." both count as ending with the mask.
    """
    without_periods = template.replace(".", "")
    trimmed = without_periods.strip()
    return trimmed.endswith("[Y]")

In [58]:
# Folder that holds one sub-folder per relation with the Atlas-base results.
atlas_base_results_folder = "/cephyr/users/lovhag/Alvis/projects/atlas/data/experiments/pararel-eval-zero-shot-base-no-space-likelihood-no-eos-with-3"

# Numeric suffix of each relation's result folder ("<relation>-base-2017-<suffix>").
atlas_base_run_ids = {
    "P19": "1115963",
    "P20": "1115971",
    "P27": "1115970",
    "P101": "1115964",
    "P495": "1115952",
    "P740": "1115956",
    "P1376": "1115967",
}

# Relation -> path of its "<relation>-step-0.jsonl" result file.
atlas_base_files = {
    rel: os.path.join(atlas_base_results_folder, f"{rel}-base-2017-{run_id}/{rel}-step-0.jsonl")
    for rel, run_id in atlas_base_run_ids.items()
}

In [59]:
# Collect the kept records in a list and build the frame once at the end.
# `DataFrame.append` was removed in pandas 2.0 and copied the whole frame on
# every call, which made the original row-by-row loading quadratic.
atlas_base_records = []

for relation in relations:
    # Explicit UTF-8: the generations contain non-ASCII text.
    with open(atlas_base_files[relation], encoding="utf-8") as f:
        for line in f:
            tmp_data = json.loads(line)
            # Keep only templates in which the [Y] mask comes last.
            if template_ends_with_mask(tmp_data["pattern"]):
                # These two fields are not used anywhere in this analysis.
                del tmp_data["passages"]
                del tmp_data["metadata"]
                tmp_data["relation"] = relation
                atlas_base_records.append(tmp_data)

atlas_base_data = pd.DataFrame(atlas_base_records)

# Keep only the first listed answer of each record.
atlas_base_data["answers"] = atlas_base_data.answers.apply(lambda val: val[0])
# Turn the pattern into a "{}"-style template without the trailing mask.
atlas_base_data["pattern"] = atlas_base_data.pattern.apply(lambda val: val.replace("[X]", "{}").replace(" [Y].","").replace(" [Y]",""))
# Remove the sentinel token from the query so that only the prompt text remains.
atlas_base_data["query"] = atlas_base_data["query"].apply(lambda val: val.replace("<extra_id_0>.","").replace("<extra_id_0>",""))

# Rename the Atlas columns to the names used for the other models' data.
atlas_base_data = atlas_base_data.rename(columns = {"answers": "attribute",
                                         "generation": "prediction",
                                         "generation_by_choice": "candidate_prediction",
                                         "pattern": "template",
                                         "query": "prompt",
                                         "sub_label": "subject"
                                         }
                                        )
atlas_base_data.head()

Unnamed: 0,attribute,prediction,candidate_prediction,template,prompt,subject
0,French,French French French. The French language -Jules,French,The native language of {} is,The native language of Louis Jules Trochu is,Louis Jules Trochu
1,French,French Italian French. She is Italian -French ...,French,The native language of {} is,The native language of Valeria Bruni Tedeschi is,Valeria Bruni Tedeschi
2,French,French French French. The French language Jacq...,French,The native language of {} is,The native language of Pierre Messmer is,Pierre Messmer
3,French,French French French d'Anjou. he was born Fren...,French,The native language of {} is,The native language of Roger Nimier is,Roger Nimier
4,Russian,Russian Russian Russian. The Russian language ...,Russian,The native language of {} is,The native language of Andrey Malakhov is,Andrey Malakhov


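The comparison below confirms that the GPT-2 XL and Atlas-base data contain the same queries: after aligning the two frames, only the `prediction` and `candidate_prediction` columns differ.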

In [60]:
# NOTE(review): `gpt2_data` is not assigned in any of the cells shown above;
# presumably it holds the GPT-2 XL predictions - confirm it is defined before
# running this cell on a fresh kernel.

# Columns that are dropped from the GPT-2 frame before the comparison.
gpt2_only_columns = ["candidate_p", "candidate_rank", "gold_p", "gold_rank", "known_id", "prediction_p", "relation_id"]
# Both frames are ordered by these keys so that rows line up one-to-one.
comparison_sort_keys = ["subject", "template"]

gpt2_data_to_comp = (
    gpt2_data
    .drop(columns=gpt2_only_columns)
    .sort_index(axis=1)
    .sort_values(by=comparison_sort_keys, ignore_index=True)
)
atlas_base_data_to_comp = (
    atlas_base_data
    .sort_index(axis=1)
    .sort_values(by=comparison_sort_keys, ignore_index=True)
)

# Show only the cells in which the two frames differ, side by side.
gpt2_data_to_comp.compare(atlas_base_data_to_comp, align_axis="columns")

Unnamed: 0_level_0,candidate_prediction,candidate_prediction,prediction,prediction
Unnamed: 0_level_1,self,other,self,other
0,Hindi,Hindi,Mar,"Hindi Hindi Hindi a former President of India,..."
1,Hindi,English,Mar,Bengali English Bengali he is the President.
2,Dutch,Dutch,the,Dutch Dutch Dutch a Dutch writer of poetry was...
3,Dutch,Dutch,the,"Dutch Dutch Dutch a Dutch writer, Aagje Deken"
4,Italian,Italian,Italian,"Italian Italian Italian a racecar driver, Varz..."
...,...,...,...,...
1829,German,Swedish,the,"Swedish Swedish Swedish, although it is not sp..."
1830,Russian,Russian,Russian,Russian Russian Russian. The Russian language ...
1831,Russian,Russian,Russian,Russian Russian Russian. The Russian language ...
1832,Russian,Russian,Uzbek,Chechen Chechen Chechen елим ан


## Get accuracy and consistency

In [61]:
# accuracy across all paraphrases

def get_accuracy(data, pred_type):
    """Return the fraction of rows whose stripped prediction equals the attribute.

    Args:
        data: DataFrame with an `attribute` column and a `pred_type` column of strings.
        pred_type: name of the prediction column to score.

    Returns:
        Number of exact matches divided by the number of rows in `data`.
    """
    stripped_predictions = data[pred_type].apply(lambda text: text.strip())
    is_correct = data.attribute == stripped_predictions
    num_correct = is_correct.sum()
    return num_correct / len(data)

def get_consistency(data, pred_type):
    """Return the share of same-subject prediction pairs that agree.

    For every subject, all unordered pairs of its rows are considered; a pair
    is consistent when both rows hold the same `pred_type` value. Values are
    compared as-is (no stripping or other normalisation). The counts are
    pooled over all subjects before dividing.

    Args:
        data: DataFrame with a `subject` column and a `pred_type` column.
        pred_type: name of the prediction column to compare.

    Returns:
        Consistent pairs divided by total pairs, or NaN when no subject has
        at least two rows (there are no pairs to score in that case).
    """
    num_cons = 0
    num_tot = 0
    # One pass over the groups instead of re-filtering the frame per subject.
    for _, subject_preds in data.groupby("subject", sort=False)[pred_type]:
        # A value occurring k times yields C(k, 2) agreeing pairs; this equals
        # the pairwise comparison count without the quadratic loop. Missing
        # values are excluded by value_counts and so never count as agreeing.
        value_counts = subject_preds.value_counts()
        num_cons += sum(math.comb(int(count), 2) for count in value_counts.tolist())
        num_tot += math.comb(len(subject_preds), 2)

    if num_tot == 0:
        # Avoid ZeroDivisionError when there is nothing to compare.
        return float("nan")
    return num_cons / num_tot

In [62]:
# Per-template accuracy of the GPT-2 `prediction` column.
gpt2_data.groupby("template").apply(get_accuracy, pred_type="prediction")

template
The mother tongue of {} is      0.724100
The native language of {} is    0.728462
dtype: float64

In [63]:
# Per-template accuracy of the Atlas-base `prediction` column.
atlas_base_data.groupby("template").apply(get_accuracy, pred_type="prediction")

template
The mother tongue of {} is      0.0
The native language of {} is    0.0
dtype: float64

In [64]:
# Per-template accuracy of the GPT-2 `candidate_prediction` column.
gpt2_data.groupby("template").apply(get_accuracy, pred_type="candidate_prediction")

template
The mother tongue of {} is      0.763359
The native language of {} is    0.765540
dtype: float64

In [65]:
# Per-template accuracy of the Atlas-base `candidate_prediction` column.
atlas_base_data.groupby("template").apply(get_accuracy, pred_type="candidate_prediction")

template
The mother tongue of {} is      0.931298
The native language of {} is    0.947655
dtype: float64

In [66]:
# Consistency of the candidate predictions, pooled over all subjects.
gpt2_candidate_consistency = get_consistency(gpt2_data, 'candidate_prediction')
print(f"GPT2 candidate consistency: {gpt2_candidate_consistency}")
atlas_base_candidate_consistency = get_consistency(atlas_base_data, 'candidate_prediction')
print(f"Atlas-base candidate consistency: {atlas_base_candidate_consistency}")

GPT2 candidate consistency: 0.95092693565976
Atlas-base candidate consistency: 0.960741548527808
