# Evaluation
Evaluate the TE-based fact recall labels against the fact recall dataset.

In [1]:
import pandas as pd
from sklearn import metrics

## Load the fact recall datasets

In [2]:
# Read the two final fact recall splits (JSON lines) and tag every row with the
# split it came from via the boolean `confident` column.
conf_fact_recall_data = pd.read_json(
    "/cephyr/users/lovhag/Alvis/projects/fact-recall-detection/data/data_creation/final_splits/confident_fact_recall_preds.jsonl",
    lines=True,
).assign(confident=True)

unconf_fact_recall_data = pd.read_json(
    "/cephyr/users/lovhag/Alvis/projects/fact-recall-detection/data/data_creation/final_splits/unconfident_fact_recall_preds.jsonl",
    lines=True,
).assign(confident=False)

# Stack both splits under a fresh 0..n-1 index, then keep only the first copy of
# every fully identical row. NOTE(review): the origin of the duplicates is unknown.
fact_recall_data = pd.concat(
    [conf_fact_recall_data, unconf_fact_recall_data],
    ignore_index=True,
).drop_duplicates()

fact_recall_data.head()

Unnamed: 0,obj_label,sub_label,predicate_id,source,sub_view_rates,obj_view_rates,string_match,person_name,prompt,template,answers,p_answers,pred_rank,prompt_bias,correct,surface_pred,trivial_pred,consistency_counts,confident
0,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham died in,[X] died in [Y],Jerusalem,0.048551,1,False,True,False,False,6,True
1,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham died at,[X] died at [Y],Jerusalem,0.075843,1,False,True,False,False,6,True
2,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham passed away in,[X] passed away in [Y],Jerusalem,0.064481,1,False,True,False,False,6,True
3,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham passed away at,[X] passed away at [Y],Jerusalem,0.026302,2,False,True,False,False,6,True
4,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham lost their life at,[X] lost their life at [Y],Jerusalem,0.056528,1,False,True,False,False,6,True


In [14]:
# Number of rows in the combined, de-duplicated fact recall frame.
fact_recall_data.shape[0]

2685

## Load the TE based data

In [34]:
# Read the TE results for both splits and tag every row with the split it came
# from via the boolean `confident` column.
conf_data = pd.read_csv(
    "/cephyr/users/lovhag/Alvis/projects/rome/data/confident_fact_recall_detection/gpt2_xl_final.csv"
).assign(confident=True)

unconf_data = pd.read_csv(
    "/cephyr/users/lovhag/Alvis/projects/rome/data/unconfident_fact_recall_detection/gpt2_xl_final.csv"
).assign(confident=False)

# Stack both splits under a fresh 0..n-1 index.
data = pd.concat([conf_data, unconf_data], ignore_index=True)
data.head()

Unnamed: 0,subject,template,pred,pred_rank,correct_answer,te,confident
0,Obadiah ben Abraham,{} died in,the,0,Jerusalem,0.030592,True
1,Obadiah ben Abraham,{} died in,Jerusalem,1,Jerusalem,0.031775,True
2,Obadiah ben Abraham,{} died in,5,2,Jerusalem,0.018109,True
3,Obadiah ben Abraham,{} died in,6,3,Jerusalem,0.01232,True
4,Obadiah ben Abraham,{} died in,12,4,Jerusalem,0.016043,True


Apply the same filtering here as for the fact recall data: only keep the top 3 model predictions (`pred_rank` 0, 1 and 2).

In [35]:
# Ranks start at 0, so "strictly below 3" keeps the three highest-ranked predictions.
data = data.loc[data["pred_rank"].lt(3)]
data.shape[0]

8058

In [36]:
# Predictions listed here are flagged as trivial and can never count as fact recall.
forbidden_predictions = ["a", "the", "collaboration", "response", "public", '"', "order", "partnership", "honor", "AD", "open", "H", "age", "creating", "disgrace", "her", "his", "in", "left", "not", "providing", "tragedy", "which", "whom"]
forbidden_mask = data.pred.isin(forbidden_predictions)

# A sample is labelled as fact recall when its `te` value is strictly above this
# threshold and its prediction is not trivial.
te_thresh = 0.1

# `data` is the result of a boolean filter at this point, so build a new frame with
# `assign` instead of writing columns into it (avoids pandas' chained-assignment
# / SettingWithCopy warning). Column order is unchanged: trivial_pred, te_fact_recall.
data = data.assign(
    trivial_pred=forbidden_mask,
    te_fact_recall=(data.te > te_thresh) & ~forbidden_mask,
)

# Report the threshold that was actually used rather than a hard-coded literal.
# The count only includes non-trivial predictions.
print(f"{data.te_fact_recall.sum()} data samples have a TE above {te_thresh}")

1304 data samples have a TE above 0.1


Reformat the TE dataset to make it compatible with the gold-label fact recall dataset.

In [37]:
# Bring the TE frame onto the conventions of the fact recall frame so the two can
# be matched on (sub_label, template, answers):
#   * templates: "{} died in" -> "[X] died in [Y]"
#   * predictions: cast to str and prefix a single space
#     (presumably the gold `answers` carry a leading space -- TODO confirm)
#   * columns: subject -> sub_label, pred -> answers
#   * rows identical in every column except `correct_answer` are collapsed to the
#     first occurrence. NOTE(review): the origin of these duplicates is unknown.
#
# Everything is evaluated on the incoming frame and rebound in one expression, so
# an accidental re-run fails on the missing `pred` column *before* anything is
# modified, instead of first appending a second " [Y]" to every template.
data = (
    data.assign(
        template=data.template.str.replace("{}", "[X]", regex=False) + " [Y]",
        pred=" " + data.pred.astype("str"),
    )
    .rename(columns={"subject": "sub_label", "pred": "answers"})
    .loc[lambda frame: ~frame.drop(columns="correct_answer").duplicated()]
)

data

Unnamed: 0,sub_label,template,answers,pred_rank,correct_answer,te,confident,trivial_pred,te_fact_recall
0,Obadiah ben Abraham,[X] died in [Y],the,0,Jerusalem,0.030592,True,True,False
1,Obadiah ben Abraham,[X] died in [Y],Jerusalem,1,Jerusalem,0.031775,True,False,False
2,Obadiah ben Abraham,[X] died in [Y],5,2,Jerusalem,0.018109,True,False,False
10,Obadiah ben Abraham,[X] died at [Y],the,0,Jerusalem,-0.003685,True,True,False
11,Obadiah ben Abraham,[X] died at [Y],Jerusalem,1,Jerusalem,0.058086,True,False,False
...,...,...,...,...,...,...,...,...,...
26841,"Katherine FitzGerald, Viscountess Grandison",[X] passed away in [Y],London,1,London,0.065550,False,False,False
26842,"Katherine FitzGerald, Viscountess Grandison",[X] passed away in [Y],the,2,London,-0.045233,False,True,False
26850,Abdur Rab Nishtar,[X] passed away in [Y],a,0,Karachi,0.016864,False,True,False
26851,Abdur Rab Nishtar,[X] passed away in [Y],the,1,Karachi,-0.020943,False,True,False


## Compare the sets

In [49]:
# Attach the TE-based label to every fact recall sample. Samples are matched on
# subject, template and predicted answer; samples without a match keep None.
fact_recall_data["te_fact_recall"] = None
for ix, row in fact_recall_data.iterrows():
    match_mask = (
        (data.sub_label == row.sub_label)
        & (data.template == row.template)
        & (data.answers == row.answers)
    )
    te_labels = data.loc[match_mask, "te_fact_recall"].values
    # At most one TE row may correspond to a given fact recall sample.
    assert len(te_labels) < 2
    fact_recall_data.loc[ix, "te_fact_recall"] = te_labels[0] if len(te_labels) else None

fact_recall_data.head()

Unnamed: 0,obj_label,sub_label,predicate_id,source,sub_view_rates,obj_view_rates,string_match,person_name,prompt,template,answers,p_answers,pred_rank,prompt_bias,correct,surface_pred,trivial_pred,consistency_counts,confident,te_fact_recall
0,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham died in,[X] died in [Y],Jerusalem,0.048551,1,False,True,False,False,6,True,False
1,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham died at,[X] died at [Y],Jerusalem,0.075843,1,False,True,False,False,6,True,False
2,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham passed away in,[X] passed away in [Y],Jerusalem,0.064481,1,False,True,False,False,6,True,False
3,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham passed away at,[X] passed away at [Y],Jerusalem,0.026302,2,False,True,False,False,6,True,False
4,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.916667,False,False,Obadiah ben Abraham lost their life at,[X] lost their life at [Y],Jerusalem,0.056528,1,False,True,False,False,6,True,False


Some values are missing due to tokenizer limitations.

In [55]:
# Fact recall samples for which no TE label was found.
fact_recall_data.loc[fact_recall_data["te_fact_recall"].isnull()]

Unnamed: 0,obj_label,sub_label,predicate_id,source,sub_view_rates,obj_view_rates,string_match,person_name,prompt,template,answers,p_answers,pred_rank,prompt_bias,correct,surface_pred,trivial_pred,consistency_counts,confident,te_fact_recall
2562,Prague,Rudolf Křesťan,P19,Google_RE_UHN,97.166667,130805.0,False,False,Rudolf Křesťan was born in,[X] was born in [Y],Prague,0.056105,2,False,True,False,False,2,False,
2563,Prague,Rudolf Křesťan,P19,Google_RE_UHN,97.166667,130805.0,False,False,Rudolf Křesťan is originally from,[X] is originally from [Y],Prague,0.066008,2,False,True,False,False,2,False,
2685,Prague,Jan Čulík,P19,Google_RE_UHN,232.0,130805.0,False,False,Jan Čulík was born in,[X] was born in [Y],Prague,0.054137,1,False,True,False,False,3,False,
2686,Prague,Jan Čulík,P19,Google_RE_UHN,232.0,130805.0,False,False,Jan Čulík is originally from,[X] is originally from [Y],Prague,0.090887,1,False,True,False,False,3,False,
2687,Prague,Jan Čulík,P19,Google_RE_UHN,232.0,130805.0,False,False,Jan Čulík was originally from,[X] was originally from [Y],Prague,0.044552,1,False,True,False,False,3,False,


We drop these.

In [63]:
# Drop only the samples that have no TE label. A bare `dropna()` would also remove
# rows with a missing value in any *other* column, silently shrinking the
# evaluation set for reasons unrelated to the TE match.
fact_recall_data = fact_recall_data.dropna(subset=["te_fact_recall"])

## Compute recall

In [65]:
# Sample counts for every (split, TE label) combination, left in group-key order
# rather than sorted by frequency.
fact_recall_data.value_counts(subset=["confident", "te_fact_recall"], sort=False)

confident  te_fact_recall
False      False             934
           True              144
True       False             671
           True              931
dtype: int64

In [66]:
# Recall of the TE-based label within each split. Only TP and FN are computed:
# a sample labelled True by TE counts as a true positive, a sample labelled False
# as a false negative.
recall_splits = (
    ("Confident", fact_recall_data.confident),
    ("Unconfident", ~(fact_recall_data.confident)),
)
for split_no, (split_name, in_split) in enumerate(recall_splits):
    if split_no:
        print()  # blank line between the two reports
    print(f"{split_name} split:")

    # `te_fact_recall` was created with None placeholders, so it is an object
    # column. Cast to a real boolean dtype so that `~` is a logical negation
    # regardless of how the individual elements are stored. Rows without a label
    # are dropped in an earlier cell, so the cast does not invent values.
    te_labels = fact_recall_data.loc[in_split, "te_fact_recall"].astype(bool)
    TP = te_labels.sum()
    FN = (~te_labels).sum()

    # Guard against an empty split instead of dividing by zero.
    n_samples = TP + FN
    recall = TP / n_samples if n_samples else float("nan")

    print(f"TP: {TP}")
    print(f"FN: {FN}")
    print(f"Recall: {recall}")

Confident split:
TP: 931
FN: 671
Recall: 0.5811485642946317

Unconfident split:
TP: 144
FN: 934
Recall: 0.13358070500927643


In [68]:
# Raise the row display cap (a notebook-wide setting) so long tables are shown
# without truncation.
pd.set_option("display.max_rows", 1000)

# Confident-split samples that the TE-based label does not mark as fact recall.
fact_recall_data.loc[
    fact_recall_data["confident"] & ~fact_recall_data["te_fact_recall"]
]

Unnamed: 0,obj_label,sub_label,predicate_id,source,sub_view_rates,obj_view_rates,string_match,person_name,prompt,template,answers,p_answers,pred_rank,prompt_bias,correct,surface_pred,trivial_pred,consistency_counts,confident,te_fact_recall
0,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.9,False,False,Obadiah ben Abraham died in,[X] died in [Y],Jerusalem,0.048551,1,False,True,False,False,6,True,False
1,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.9,False,False,Obadiah ben Abraham died at,[X] died at [Y],Jerusalem,0.075843,1,False,True,False,False,6,True,False
2,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.9,False,False,Obadiah ben Abraham passed away in,[X] passed away in [Y],Jerusalem,0.064481,1,False,True,False,False,6,True,False
3,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.9,False,False,Obadiah ben Abraham passed away at,[X] passed away at [Y],Jerusalem,0.026302,2,False,True,False,False,6,True,False
4,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.9,False,False,Obadiah ben Abraham lost their life at,[X] lost their life at [Y],Jerusalem,0.056528,1,False,True,False,False,6,True,False
5,Jerusalem,Obadiah ben Abraham,P20,TREx_UHN,261.166667,165070.9,False,False,Obadiah ben Abraham's life ended in,[X]'s life ended in [Y],Jerusalem,0.030897,2,False,True,False,False,6,True,False
6,Norway,Embrik Strand,P27,TREx_UHN,144.583333,273060.5,False,False,Embrik Strand is a citizen of,[X] is a citizen of [Y],Norway,0.055363,1,False,True,False,False,6,True,False
7,Norway,Embrik Strand,P27,TREx_UHN,144.583333,273060.5,False,False,"Embrik Strand, a citizen of","[X], a citizen of [Y]",Norway,0.090866,2,False,True,False,False,6,True,False
8,Norway,Embrik Strand,P27,TREx_UHN,144.583333,273060.5,False,False,"Embrik Strand, who is a citizen of","[X], who is a citizen of [Y]",Norway,0.091338,2,False,True,False,False,6,True,False
9,Norway,Embrik Strand,P27,TREx_UHN,144.583333,273060.5,False,False,Embrik Strand holds a citizenship of,[X] holds a citizenship of [Y],Norway,0.098135,1,False,True,False,False,6,True,False


In [69]:
# Unconfident-split samples that the TE-based label does not mark as fact recall.
fact_recall_data.loc[
    ~fact_recall_data["confident"] & ~fact_recall_data["te_fact_recall"]
]

Unnamed: 0,obj_label,sub_label,predicate_id,source,sub_view_rates,obj_view_rates,string_match,person_name,prompt,template,answers,p_answers,pred_rank,prompt_bias,correct,surface_pred,trivial_pred,consistency_counts,confident,te_fact_recall
1608,Paris,Maurice de Vlaminck,P19,TREx_UHN,3246.916667,227647.1,False,False,Maurice de Vlaminck was born in,[X] was born in [Y],Paris,0.083087,1,False,True,False,False,2,False,False
1609,Paris,Maurice de Vlaminck,P19,TREx_UHN,3246.916667,227647.1,False,False,Maurice de Vlaminck was originally from,[X] was originally from [Y],Paris,0.038279,2,False,True,False,False,2,False,False
1610,Milan,Maximilian Sforza,P19,TREx_UHN,763.0,99240.0,False,False,Maximilian Sforza was born in,[X] was born in [Y],Milan,0.04717,1,False,True,False,False,2,False,False
1611,Milan,Maximilian Sforza,P19,TREx_UHN,763.0,99240.0,False,False,Maximilian Sforza was originally from,[X] was originally from [Y],Milan,0.037021,2,False,True,False,False,2,False,False
1612,Scotland,David Coulthard,P19,TREx_UHN,25383.5,259042.1,False,False,David Coulthard is originally from,[X] is originally from [Y],Scotland,0.035982,2,False,True,False,False,2,False,False
1613,Scotland,David Coulthard,P19,TREx_UHN,25383.5,259042.1,False,False,David Coulthard was originally from,[X] was originally from [Y],Scotland,0.029149,1,False,True,False,False,2,False,False
1614,Atlanta,Killer Mike,P19,TREx_UHN,53289.166667,201929.3,False,False,Killer Mike was born in,[X] was born in [Y],Atlanta,0.091597,1,False,True,False,False,5,False,False
1619,Karachi,Asad Malik,P19,TREx_UHN,1496.25,79275.5,False,False,Asad Malik was born in,[X] was born in [Y],Karachi,0.049052,1,False,True,False,False,1,False,False
1620,London,Chris Stringer,P19,TREx_UHN,977.833333,416228.7,False,False,Chris Stringer was originally from,[X] was originally from [Y],London,0.023558,2,False,True,False,False,1,False,False
1621,Mecca,Bilal ibn Ribah,P19,TREx_UHN,910.583333,100708.6,False,False,Bilal ibn Ribah was born in,[X] was born in [Y],Mecca,0.031516,2,False,True,False,False,2,False,False
