# Final Processing
For the creation of PrISM datasets with samples corresponding to exact fact recall and guesswork.

## Set up environment

Make sure to stand in the root folder of the repo.

In [None]:
import os

# Step up one directory level so the relative paths used below resolve.
# NOTE: the move is relative to the current working directory, so re-running
# this cell moves up one more level each time.
os.chdir(os.pardir)
os.getcwd()

In [None]:
import pandas as pd

## Define paths and folder for storing results

We create datasets for each of the analysed LMs. Make sure to specify the desired LM in the cell below. The notebook will then create the datasets with the exact fact recall and guesswork samples for that model.

In [None]:
# Model to build the datasets for; alternatives: "llama2_7B", "gpt2_xl".
MODEL_NAME = "llama2_13B"
# Per-model folder: the predictions are read from it and the sets are saved to it.
SAVEFOLDER = f"data/data_creation/{MODEL_NAME}"
# JSON-lines file with the model predictions.
DATA_PATH = f"{SAVEFOLDER}/lama_data_preds_wiki_nb_pb.jsonl"

## Load the data

In [3]:
# Read the JSON-lines predictions, drop the identifier columns, then remove
# rows that are exact duplicates on the remaining columns.
data = (
    pd.read_json(DATA_PATH, lines=True)
    .drop(columns=["uuid", "obj_uri", "sub_uri"])
    .drop_duplicates()
)
data

Unnamed: 0,obj_label,sub_label,predicate_id,source,prompt,template,sub_view_rates,obj_view_rates,answers,p_answers,rank_answers,string_match,person_name,used_template,prompt_bias
0,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper was born in,[X] was born in [Y],558.416667,6956.750000,,0.274003,0,True,True,[X] was born in,True
1,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper was born in,[X] was born in [Y],558.416667,6956.750000,New,0.044721,1,False,False,[X] was born in,True
2,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper was born in,[X] was born in [Y],558.416667,6956.750000,the,0.037551,2,False,False,[X] was born in,True
3,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper is originally from,[X] is originally from [Y],558.416667,6956.750000,the,0.092337,0,False,False,[X] is originally from,True
4,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper is originally from,[X] is originally from [Y],558.416667,6956.750000,New,0.045463,1,False,False,[X] is originally from,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227788,Newark,Joshua Mathiot,P20,Google_RE_UHN,Joshua Mathiot's life ended in,[X]'s life ended in [Y],124.416667,2572.166667,the,0.115215,1,False,False,[X]'s life ended in,True
227789,Newark,Joshua Mathiot,P20,Google_RE_UHN,Joshua Mathiot's life ended in,[X]'s life ended in [Y],124.416667,2572.166667,,0.083683,2,True,True,[X]'s life ended in,True
227790,Newark,Joshua Mathiot,P20,Google_RE_UHN,Joshua Mathiot succumbed at,[X] succumbed at [Y],124.416667,2572.166667,the,0.378701,0,False,False,[X] succumbed at,True
227791,Newark,Joshua Mathiot,P20,Google_RE_UHN,Joshua Mathiot succumbed at,[X] succumbed at [Y],124.416667,2572.166667,,0.220430,1,True,True,[X] succumbed at,True


## Add labels on correctness

In [4]:
# Strictly correct: the whitespace-stripped prediction equals the gold object label.
data["strict_correct"] = data["answers"].str.strip().eq(data["obj_label"])

In [5]:
# First-token correct: the stripped prediction has at least 3 characters and
# the gold object label starts with it.
data["first_token_correct"] = [
    len(prediction) >= 3 and gold.startswith(prediction)
    for prediction, gold in zip(data["answers"].str.strip(), data["obj_label"])
]

In [6]:
# Rows that are first-token correct without being strictly correct.
data.loc[data["first_token_correct"] & ~data["strict_correct"]]

Unnamed: 0,obj_label,sub_label,predicate_id,source,prompt,template,sub_view_rates,obj_view_rates,answers,p_answers,rank_answers,string_match,person_name,used_template,prompt_bias,strict_correct,first_token_correct
105,Antwerp,Frans Floris I,P19,TREx_UHN,Frans Floris I was born in,[X] was born in [Y],,50520.750000,Ant,0.427150,0,False,False,[X] was born in,False,False,True
109,Antwerp,Frans Floris I,P19,TREx_UHN,Frans Floris I is originally from,[X] is originally from [Y],,50520.750000,Ant,0.199198,1,False,False,[X] is originally from,False,False,True
111,Antwerp,Frans Floris I,P19,TREx_UHN,Frans Floris I was originally from,[X] was originally from [Y],,50520.750000,Ant,0.282292,0,False,False,[X] was originally from,False,False,True
115,Antwerp,Frans Floris I,P19,TREx_UHN,Frans Floris I originated from,[X] originated from [Y],,50520.750000,Ant,0.218820,1,False,False,[X] originated from,False,False,True
119,Antwerp,Frans Floris I,P19,TREx_UHN,Frans Floris I originates from,[X] originates from [Y],,50520.750000,Ant,0.117836,2,False,False,[X] originates from,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214786,Bucharest,George Georgescu,P20,Google_RE_UHN,George Georgescu died in,[X] died in [Y],222.000000,61623.833333,Buch,0.085521,1,False,True,[X] died in,False,False,True
214792,Bucharest,George Georgescu,P20,Google_RE_UHN,George Georgescu passed away in,[X] passed away in [Y],222.000000,61623.833333,Buch,0.084553,1,False,True,[X] passed away in,False,False,True
220906,Antwerp,Frans de Momper,P20,Google_RE_UHN,Frans de Momper died in,[X] died in [Y],189.916667,50520.750000,Ant,0.056253,1,False,False,[X] died in,False,False,True
220912,Antwerp,Frans de Momper,P20,Google_RE_UHN,Frans de Momper passed away in,[X] passed away in [Y],189.916667,50520.750000,Ant,0.076051,1,False,False,[X] passed away in,False,False,True


In [None]:
# The first-token criterion serves as the correctness label in the cells below.
data["correct"] = data.first_token_correct

## Make data compatible with CT code

The CT code expects some specific columns to exist in the dataset.

In [None]:
# Columns added for compatibility with the CT code; only done for the
# Llama 2 models.
if MODEL_NAME in ("llama2_7B", "llama2_13B"):
    data["candidate_prediction"] = data.answers
    data["subject"] = data.sub_label
    # The row index (kept from loading, after de-duplication) serves as the id.
    data["known_id"] = data.index

## Create data splits

Before we can identify the exact fact recall and guesswork samples, we need to add some necessary metadata on biases and consistency.

### Add necessary metadata

In [8]:
# A prediction is a potential surface-level prediction when at least one of
# the three bias indicators is set on its row.
data["surface_pred"] = data[["string_match", "person_name", "prompt_bias"]].apply(
    lambda flags: any(flags), axis=1
)
print(f"{sum(data['surface_pred'])} model predictions out of {len(data)} are potential surface level predictions")

192978 model predictions out of 227751 are potential surface level predictions


We use model confidence, proxied by the consistency of the predictions across paraphrased prompts, to identify exact fact recall samples.

In [9]:
# A prediction is "confident" when its consistency count exceeds this value.
CONSISTENCY_THRESH = 5

# Count how often each answer is predicted for a given fact (predicate,
# subject, object). Surface-level and non-surface-level predictions are
# counted separately because `surface_pred` is part of the group key.
# NOTE(review): `source` and `prompt` are not part of the key, so a prompt
# that is listed under two sources contributes twice to the count - confirm
# that this is intended.
grouped_p_answers = data.groupby(
    ["predicate_id", "sub_label", "obj_label", "answers", "surface_pred"]
).p_answers
consistency_counts = grouped_p_answers.count().rename("cons_counts")

# `transform` broadcasts each group's count back onto its rows in a single
# vectorised pass. This replaces a Python-level MultiIndex lookup per row,
# which is slow and raises KeyError for rows whose group key contains NaN.
data["consistency_counts"] = grouped_p_answers.transform("count")
data.head()

Unnamed: 0,obj_label,sub_label,predicate_id,source,prompt,template,sub_view_rates,obj_view_rates,answers,p_answers,rank_answers,string_match,person_name,used_template,prompt_bias,strict_correct,first_token_correct,correct,surface_pred,consistency_counts
0,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper was born in,[X] was born in [Y],558.416667,6956.75,,0.274003,0,True,True,[X] was born in,True,False,False,False,True,1
1,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper was born in,[X] was born in [Y],558.416667,6956.75,New,0.044721,1,False,False,[X] was born in,True,False,False,False,True,3
2,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper was born in,[X] was born in [Y],558.416667,6956.75,the,0.037551,2,False,False,[X] was born in,True,False,False,False,True,5
3,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper is originally from,[X] is originally from [Y],558.416667,6956.75,the,0.092337,0,False,False,[X] is originally from,True,False,False,False,True,5
4,Alexandra,Allan Peiper,P19,TREx_UHN,Allan Peiper is originally from,[X] is originally from [Y],558.416667,6956.75,New,0.045463,1,False,False,[X] is originally from,True,False,False,False,True,3


In [10]:
# Confident: the answer's consistency count is strictly above the threshold.
data["confident"] = data["consistency_counts"].gt(CONSISTENCY_THRESH)

# Keep the predictions that are confident, correct and not flagged by any
# bias indicator; copy so the selection is independent of `data`.
confident_fact_recall_preds = data.loc[
    data["confident"] & ~data["surface_pred"] & data["correct"]
].copy()

confident_fact_recall_preds

Unnamed: 0,obj_label,sub_label,predicate_id,source,prompt,template,sub_view_rates,obj_view_rates,answers,p_answers,...,string_match,person_name,used_template,prompt_bias,strict_correct,first_token_correct,correct,surface_pred,consistency_counts,confident
12391,Melbourne,Edward Duyker,P19,TREx_UHN,Edward Duyker was born in,[X] was born in [Y],320.416667,124325.083333,Melbourne,0.176117,...,False,False,[X] was born in,False,True,True,True,False,10,True
12393,Melbourne,Edward Duyker,P19,TREx_UHN,Edward Duyker is originally from,[X] is originally from [Y],320.416667,124325.083333,Melbourne,0.137194,...,False,False,[X] is originally from,False,True,True,True,False,10,True
12397,Melbourne,Edward Duyker,P19,TREx_UHN,Edward Duyker was originally from,[X] was originally from [Y],320.416667,124325.083333,Melbourne,0.072961,...,False,False,[X] was originally from,False,True,True,True,False,10,True
12401,Melbourne,Edward Duyker,P19,TREx_UHN,Edward Duyker originated from,[X] originated from [Y],320.416667,124325.083333,Melbourne,0.035770,...,False,False,[X] originated from,False,True,True,True,False,10,True
12404,Melbourne,Edward Duyker,P19,TREx_UHN,Edward Duyker originates from,[X] originates from [Y],320.416667,124325.083333,Melbourne,0.077432,...,False,False,[X] originates from,False,True,True,True,False,10,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191350,Melbourne,Edward Duyker,P19,Google_RE_UHN,Edward Duyker was born in,[X] was born in [Y],320.416667,124325.083333,Melbourne,0.176117,...,False,False,[X] was born in,False,True,True,True,False,10,True
191352,Melbourne,Edward Duyker,P19,Google_RE_UHN,Edward Duyker is originally from,[X] is originally from [Y],320.416667,124325.083333,Melbourne,0.137194,...,False,False,[X] is originally from,False,True,True,True,False,10,True
191356,Melbourne,Edward Duyker,P19,Google_RE_UHN,Edward Duyker was originally from,[X] was originally from [Y],320.416667,124325.083333,Melbourne,0.072961,...,False,False,[X] was originally from,False,True,True,True,False,10,True
191360,Melbourne,Edward Duyker,P19,Google_RE_UHN,Edward Duyker originated from,[X] originated from [Y],320.416667,124325.083333,Melbourne,0.035770,...,False,False,[X] originated from,False,True,True,True,False,10,True


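Our bias filter may not be perfect: some confident, correct predictions that pass the filter concern unpopular subjects (fewer than 1000 average monthly Wikipedia views). We inspect them below.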

In [12]:
# Average the numeric and boolean columns per (object, subject, predicate,
# answer) for confident fact recall predictions with unpopular subjects.
# `numeric_only=True` is needed because the frame also holds string columns
# (e.g. `source`, `prompt`, `template`), on which `mean` raises a TypeError
# in pandas >= 2.0 instead of silently dropping them.
(
    confident_fact_recall_preds[confident_fact_recall_preds.sub_view_rates < 1000]
    .groupby(["obj_label", "sub_label", "predicate_id", "answers"])
    .mean(numeric_only=True)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sub_view_rates,obj_view_rates,p_answers,rank_answers,string_match,person_name,prompt_bias,strict_correct,first_token_correct,correct,surface_pred,consistency_counts,confident
obj_label,sub_label,predicate_id,answers,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Australia,Sydney Twelve,P495,Australia,285.75,576541.583333,0.065553,1.5,False,False,False,True,True,True,False,6.0,True
Australia,The Early Bird Show,P495,Australia,542.75,576541.583333,0.064941,1.0,False,False,False,True,True,True,False,6.0,True
Barcelona,Institut d'Estudis Catalans,P740,Barcelona,775.416667,156304.833333,0.0495,1.0,False,False,False,True,True,True,False,11.0,True
Brazil,Bicho de Sete Cabeças,P495,Brazil,235.5,326610.333333,0.04311,1.5,False,False,False,True,True,True,False,6.0,True
Brazil,Desejos de Mulher,P495,Brazil,287.833333,326610.333333,0.077602,1.0,False,False,False,True,True,True,False,9.0,True
Brazil,Mulheres de Areia,P495,Brazil,766.083333,326610.333333,0.060944,1.333333,False,False,False,True,True,True,False,6.0,True
Brazil,O Quatrilho,P495,Brazil,623.416667,326610.333333,0.053229,1.571429,False,False,False,True,True,True,False,7.0,True
Cuba,Suite Habana,P495,Cuba,278.166667,193325.916667,0.054528,1.5,False,False,False,True,True,True,False,8.0,True
Denmark,Lise Ringheim,P27,Denmark,88.416667,259110.416667,0.113276,1.833333,False,False,False,True,True,True,False,6.0,True
Egypt,Ayyam El Sadat,P495,Egypt,154.583333,252156.833333,0.085481,1.0625,False,False,False,True,True,True,False,16.0,True


### Exact fact recall set
Here we save the exact fact recall samples.

We only keep entries in the exact fact recall set for which the popularity score (average monthly Wikipedia views) is above 1000.

In [11]:
# Restrict to subjects whose popularity score is strictly above 1000.
pop_confident_fact_recall_preds = confident_fact_recall_preds.loc[
    confident_fact_recall_preds["sub_view_rates"].gt(1000)
]
pop_confident_fact_recall_preds

Unnamed: 0,obj_label,sub_label,predicate_id,source,prompt,template,sub_view_rates,obj_view_rates,answers,p_answers,...,string_match,person_name,used_template,prompt_bias,strict_correct,first_token_correct,correct,surface_pred,consistency_counts,confident
17306,Constantinople,Procopius,P20,TREx_UHN,Procopius died in,[X] died in [Y],9059.583333,109764.416667,Constantin,0.044890,...,False,False,[X] died in,False,False,True,True,False,6,True
17309,Constantinople,Procopius,P20,TREx_UHN,Procopius died at,[X] died at [Y],9059.583333,109764.416667,Constantin,0.128613,...,False,False,[X] died at,False,False,True,True,False,6,True
17312,Constantinople,Procopius,P20,TREx_UHN,Procopius passed away in,[X] passed away in [Y],9059.583333,109764.416667,Constantin,0.070805,...,False,False,[X] passed away in,False,False,True,True,False,6,True
17317,Constantinople,Procopius,P20,TREx_UHN,Procopius expired at,[X] expired at [Y],9059.583333,109764.416667,Constantin,0.193499,...,False,False,[X] expired at,False,False,True,True,False,6,True
17324,Constantinople,Procopius,P20,TREx_UHN,Procopius's life ended in,[X]'s life ended in [Y],9059.583333,109764.416667,Constantin,0.069759,...,False,False,[X]'s life ended in,False,False,True,True,False,6,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165341,Galicia,Santiago de Compostela,P1376,TREx_UHN,Santiago de Compostela is the capital city of,[X] is the capital city of [Y],28000.000000,2412.250000,Gal,0.139605,...,False,False,[X] is the capital city of,False,False,True,True,False,6,True
165342,Galicia,Santiago de Compostela,P1376,TREx_UHN,"Santiago de Compostela, the capital of","[X], the capital of [Y]",28000.000000,2412.250000,Gal,0.501965,...,False,False,"[X], the capital of",False,False,True,True,False,6,True
165345,Galicia,Santiago de Compostela,P1376,TREx_UHN,"Santiago de Compostela, the capital city of","[X], the capital city of [Y]",28000.000000,2412.250000,Gal,0.540636,...,False,False,"[X], the capital city of",False,False,True,True,False,6,True
165349,Galicia,Santiago de Compostela,P1376,TREx_UHN,"Santiago de Compostela, that is the capital of","[X], that is the capital of [Y]",28000.000000,2412.250000,Gal,0.342240,...,False,False,"[X], that is the capital of",False,False,True,True,False,6,True


The predictions are generally quite confident.

In [12]:
# Mean probability the model assigns to the answers in the exact fact recall set.
pop_confident_fact_recall_preds["p_answers"].mean()

0.27611644265738855

In [13]:
# Number of exact fact recall samples per answer rank.
pop_confident_fact_recall_preds["rank_answers"].value_counts()

1    1049
2    1033
0     990
Name: rank_answers, dtype: int64

Save the data

In [None]:
# Write the set as JSON lines (one record per line) into the per-model folder.
pop_confident_fact_recall_preds.to_json(
    os.path.join(SAVEFOLDER, "exact_fact_recall_set.jsonl"),
    orient="records",
    lines=True,
)

### Random guesswork set

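Here we save the random guesswork samples. These are predictions that are inconsistent across paraphrases (a consistency count below 2), that are plausible answers (the stripped answer occurs among the gold labels of the dataset), and whose subject is unpopular (popularity score below 1000). Note that they are not required to be correct.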

In [15]:
# All distinct gold object labels in the dataset.
gold_label_set = data.obj_label.unique()

# Flag predictions whose stripped text is one of the gold labels. `isin` does
# a hash-based lookup, whereas testing `in` against the NumPy array would
# scan every label once per row.
answer_is_in_gold_label_set = data.answers.str.strip().isin(gold_label_set)

# Random guesswork: answer predicted fewer than two times for its fact, found
# among the gold labels, and with a subject popularity score below 1000.
random_guesswork_set = data[
    (data.consistency_counts < 2)
    & answer_is_in_gold_label_set
    & (data.sub_view_rates < 1000)
].copy()
random_guesswork_set

Unnamed: 0,obj_label,sub_label,predicate_id,source,prompt,template,sub_view_rates,obj_view_rates,answers,p_answers,...,string_match,person_name,used_template,prompt_bias,strict_correct,first_token_correct,correct,surface_pred,consistency_counts,confident
32,Scotland,Paul Mounsey,P19,TREx_UHN,Paul Mounsey was born in,[X] was born in [Y],313.750000,259042.083333,London,0.038695,...,False,True,[X] was born in,True,False,False,False,True,1,False
77,Paris,Claude Arrieu,P19,TREx_UHN,Claude Arrieu was born in,[X] was born in [Y],342.750000,227647.083333,Paris,0.057988,...,False,True,[X] was born in,True,True,True,True,True,1,False
79,Paris,Claude Arrieu,P19,TREx_UHN,Claude Arrieu is originally from,[X] is originally from [Y],342.750000,227647.083333,France,0.074929,...,False,True,[X] is originally from,False,False,False,False,True,1,False
125,Barcelona,Henry Heras,P19,TREx_UHN,Henry Heras is originally from,[X] is originally from [Y],306.500000,156304.833333,Mexico,0.039514,...,False,False,[X] is originally from,False,False,False,False,False,1,False
136,Rome,Daniele Franceschini,P19,TREx_UHN,Daniele Franceschini was born in,[X] was born in [Y],205.250000,166365.083333,Florence,0.072317,...,False,False,[X] was born in,False,False,False,False,False,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
226456,Moscow,Nikolay Strunnikov,P20,Google_RE_UHN,Nikolay Strunnikov passed away in,[X] passed away in [Y],82.916667,130953.583333,Moscow,0.127888,...,False,False,[X] passed away in,False,True,True,True,False,1,False
226480,Quebec,Claude Aveneau,P20,Google_RE_UHN,Claude Aveneau passed away in,[X] passed away in [Y],63.416667,120671.000000,Paris,0.139089,...,False,True,[X] passed away in,False,False,False,False,True,1,False
226505,Venice,Raffaele Venusti,P20,Google_RE_UHN,Raffaele Venusti passed away in,[X] passed away in [Y],40.083333,136078.083333,Rome,0.053666,...,False,True,[X] passed away in,False,False,False,False,True,1,False
226935,Beverly,Arne Brun Lie,P20,Google_RE_UHN,Arne Brun Lie passed away in,[X] passed away in [Y],72.500000,1624.416667,Oslo,0.277760,...,False,False,[X] passed away in,False,False,False,False,False,1,False


In [19]:
# Number of random guesswork samples per answer rank.
random_guesswork_set["rank_answers"].value_counts()

2    2854
1     923
0     593
Name: rank_answers, dtype: int64

In [20]:
# Mean probability the model assigns to the answers in the random guesswork set.
random_guesswork_set["p_answers"].mean()

0.04057559630709405

In [21]:
# Answer-rank distribution of the random guesswork set (same check as in an earlier cell).
random_guesswork_set["rank_answers"].value_counts()

2    2854
1     923
0     593
Name: rank_answers, dtype: int64

In [22]:
# How many random guesswork samples are labelled correct vs. incorrect.
random_guesswork_set["correct"].value_counts()

False    3846
True      524
Name: correct, dtype: int64

Save the data

In [16]:
# Write the set as JSON lines (one record per line) into the per-model folder.
random_guesswork_set.to_json(
    os.path.join(SAVEFOLDER, "random_guesswork_set.jsonl"),
    orient="records",
    lines=True,
)