In [60]:
import glob
from pathlib import Path

import datasets
import numpy as np
import pandas as pd

In [61]:
# Load the Wikidata property list and keep only the four columns selected below.
wiki_columns = ["Title", "ID", "Datatype", "Description"]
wiki_data = pd.read_csv("../../data/wikidata/wikidata-property-list.csv").loc[
    :, wiki_columns
]
wiki_data

Unnamed: 0,Title,ID,Datatype,Description
0,head of government,6,item,"head of the executive power of a town, city, m..."
1,brother,7,item,subject has the object as their brother
2,sister,9,item,subject has the object as their sister (female...
3,video,10,Commons file,relevant video
4,highway marker,14,Commons file,graphic representing the highway
...,...,...,...,...
1193,catholic.ru ID,1453,string,identifier on the site catholic.ru
1194,legal form,1454,item,legal form of an organization
1195,list of works,1455,item,link to the article with the works of a person
1196,list of monuments,1456,item,link to the list of heritage monuments in the ...


## Accuracy analysis

In [62]:
# Accuracy per (language, relation) pair, based on the llama-30b error-analysis logs.
# The rows of each CSV are counted as wrong answers for one language; the dataset
# split with the same (capitalized) name supplies the total number of facts.
results_dict = {"language": [], "relation": [], "accuracy": [], "num facts": []}

hf_df = datasets.load_dataset("CalibraGPT/Fact-Completion")
# glob returns matches in a filesystem-dependent order; sort them so the row order
# (and the order of tied accuracies in the saved CSV) is reproducible.
file_names = sorted(
    glob.glob("../../data/result_logs/llama-30b/error-analysis/*.csv")
)

# calculate the accuracy per relation per language
for file in file_names:
    # The language is the last "-"-separated token of the file name. Only the stem
    # is parsed because the directory part of the path contains hyphens of its own.
    language = Path(file).stem.split("-")[-1].capitalize()
    error_df = pd.read_csv(file)
    full_hf_df = hf_df[language].to_pandas()

    # Count rows per relation once instead of re-filtering both frames per relation.
    # sort_index keeps the relations in sorted order.
    total_per_relation = full_hf_df["relation"].value_counts().sort_index()
    wrong_per_relation = error_df["relation"].value_counts()

    for relation, num_total in total_per_relation.items():
        num_total = int(num_total)
        # A relation with no rows in the error log has zero wrong answers.
        num_wrong = int(wrong_per_relation.get(relation, 0))
        results_dict["relation"].append(relation)
        results_dict["language"].append(language)
        results_dict["accuracy"].append(100 * (num_total - num_wrong) / num_total)
        results_dict["num facts"].append(num_total)

# add wiki data in and save
df = pd.DataFrame.from_dict(results_dict)
quant_error_analysis = df.sort_values(by=["accuracy"], ascending=False).reset_index(
    drop=True
)

# The text after the last "P" of a relation (e.g. "P264" -> 264) is matched
# against the "ID" column of wiki_data.
property_ids = (
    quant_error_analysis["relation"].str.split("P").str[-1].astype(int).to_numpy()
)
# Keep the first row per ID and index by ID so every relation is resolved in a single
# lookup; .loc raises KeyError if a relation is missing from the property list.
wiki_by_id = wiki_data.drop_duplicates(subset="ID").set_index("ID")
quant_error_analysis["Title"] = wiki_by_id.loc[property_ids, "Title"].to_numpy()
quant_error_analysis["Description"] = wiki_by_id.loc[
    property_ids, "Description"
].to_numpy()

quant_error_analysis.to_csv("../../data/error_analysis/relation_accuracy_canonical.csv")
quant_error_analysis

Found cached dataset parquet (/Users/danielfurman/.cache/huggingface/datasets/CalibraGPT___parquet/CalibraGPT--Fact-Completion-24a24a1e4bf6e4a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 20/20 [00:00<00:00, 661.07it/s]


Unnamed: 0,language,relation,accuracy,num facts,Title,Description
0,Croatian,P30,100.000000,3,continent,continent of which the subject is a part. Use ...
1,Hungarian,P176,100.000000,11,manufacturer,main manufacturer of this product (excluding s...
2,Hungarian,P20,100.000000,7,place of death,the most specific known (e.g. city instead of ...
3,Russian,P103,100.000000,2,native language,language that a person learned natively
4,Hungarian,P108,100.000000,2,employer,organization for which the subject works or wo...
...,...,...,...,...,...,...
632,Ukrainian,P1303,42.857143,28,instrument,instrument that a person plays
633,Russian,P264,42.635659,258,record label,brand and trademark associated with the market...
634,Ukrainian,P413,40.000000,155,position played on team,"position the player plays, e.g. Small Forward"
635,Serbian,P740,33.333333,6,formation location,location where a group or organization was formed


In [63]:
# Collapse the per-language table to one row per relation: the mean of the
# per-language accuracies and the summed fact counts, lowest mean accuracy first.
grouped_quant_error_analysis = (
    quant_error_analysis.groupby("relation")
    .agg({"accuracy": "mean", "num facts": "sum"})
    .reset_index()
    .rename(
        columns={
            "accuracy": "mean accuracy across langs",
            "num facts": "num facts across langs",
        }
    )
    .sort_values("mean accuracy across langs", ascending=True)
    .reset_index(drop=True)
)

# Attach the Wikidata title and description of each relation. The integer after
# the last "P" of the relation is looked up in the "ID" column; the first matching
# row wins and a missing ID raises KeyError.
wiki_first_rows = wiki_data.drop_duplicates(subset="ID").set_index("ID")
relation_ids = [
    int(relation.split("P")[-1])
    for relation in grouped_quant_error_analysis["relation"]
]
grouped_quant_error_analysis["Title"] = wiki_first_rows.loc[
    relation_ids, "Title"
].tolist()
grouped_quant_error_analysis["Description"] = wiki_first_rows.loc[
    relation_ids, "Description"
].tolist()

grouped_quant_error_analysis.to_csv(
    "../../data/error_analysis/relation_accuracy_aggregated_by_relation.csv"
)
grouped_quant_error_analysis

Unnamed: 0,relation,mean accuracy across langs,num facts across langs,Title,Description
0,P264,58.177942,9729,record label,brand and trademark associated with the market...
1,P101,65.509065,11031,field of work,"fields of work related to this item (physics, ..."
2,P39,69.351905,8364,position held,subject currently or formerly holds the object...
3,P449,69.964848,18147,original network,network(s) the radio or television show was or...
4,P1303,70.667677,5390,instrument,instrument that a person plays
5,P136,71.109177,15273,genre,genre of a creative work or genre in which an ...
6,P106,73.490891,13934,occupation,"occupation of a person; see also ""field of wor..."
7,P413,73.570715,8700,position played on team,"position the player plays, e.g. Small Forward"
8,P407,74.474544,6163,language,language of a work (not necessarily original l...
9,P364,75.664172,12982,original language,language in which a work was originally created


## Percentage change analysis

In [64]:
# For every (language, relation) pair present in the error logs, compare the
# relation's share of the error rows ("new ratio") with its share of the full
# dataset split ("old ratio") and record the relative change in percent.
results_dict = {
    "language": [],
    "relation": [],
    "num facts": [],
    "percentage change": [],
    "new ratio of rows": [],
    "old ratio of rows": [],
}

hf_df = datasets.load_dataset("CalibraGPT/Fact-Completion")
# glob returns matches in a filesystem-dependent order; sort them so the row order
# (and the order of tied values in the saved CSV) is reproducible.
file_names = sorted(
    glob.glob("../../data/result_logs/llama-30b/error-analysis/*.csv")
)

# calculate the distribution shift by language by relation present in the errors
for file in file_names:
    # The language is the last "-"-separated token of the file name. Only the stem
    # is parsed because the directory part of the path contains hyphens of its own.
    language = Path(file).stem.split("-")[-1].capitalize()
    error_df = pd.read_csv(file)
    full_hf_df = hf_df[language].to_pandas()

    num_error_rows = len(error_df)
    num_dataset_rows = len(full_hf_df)
    # Count rows per relation once instead of re-filtering both frames per relation.
    # sort_index keeps the relations in sorted order.
    wrong_per_relation = error_df["relation"].value_counts().sort_index()
    total_per_relation = full_hf_df["relation"].value_counts()

    for relation, num_wrong in wrong_per_relation.items():
        # Plain ints keep the ratios as Python floats, so a relation missing from
        # the dataset split still fails with ZeroDivisionError below.
        num_total = int(total_per_relation.get(relation, 0))
        new_ratio = int(num_wrong) / num_error_rows
        old_ratio = num_total / num_dataset_rows
        results_dict["relation"].append(relation)
        results_dict["language"].append(language)
        results_dict["percentage change"].append(
            np.round(100 * (new_ratio - old_ratio) / old_ratio, 3)
        )
        results_dict["new ratio of rows"].append(np.round(new_ratio, 3))
        results_dict["old ratio of rows"].append(np.round(old_ratio, 3))
        results_dict["num facts"].append(num_total)

# add wiki data in and save
df = pd.DataFrame.from_dict(results_dict)
quant_error_analysis = df.sort_values(
    by=["percentage change"], ascending=False
).reset_index(drop=True)

# The text after the last "P" of a relation (e.g. "P264" -> 264) is matched
# against the "ID" column of wiki_data.
property_ids = (
    quant_error_analysis["relation"].str.split("P").str[-1].astype(int).to_numpy()
)
# Keep the first row per ID and index by ID so every relation is resolved in a single
# lookup; .loc raises KeyError if a relation is missing from the property list.
wiki_by_id = wiki_data.drop_duplicates(subset="ID").set_index("ID")
quant_error_analysis["Title"] = wiki_by_id.loc[property_ids, "Title"].to_numpy()
quant_error_analysis["Description"] = wiki_by_id.loc[
    property_ids, "Description"
].to_numpy()

quant_error_analysis.to_csv(
    "../../data/error_analysis/relation_percent_change_canonical.csv"
)
quant_error_analysis

Found cached dataset parquet (/Users/danielfurman/.cache/huggingface/datasets/CalibraGPT___parquet/CalibraGPT--Fact-Completion-24a24a1e4bf6e4a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 20/20 [00:00<00:00, 652.48it/s]


Unnamed: 0,language,relation,num facts,percentage change,new ratio of rows,old ratio of rows,Title,Description
0,German,P39,124,244.905,0.026,0.008,position held,subject currently or formerly holds the object...
1,English,P264,535,221.038,0.065,0.020,record label,brand and trademark associated with the market...
2,Danish,P264,532,201.396,0.069,0.023,record label,brand and trademark associated with the market...
3,Slovenian,P1303,76,179.534,0.027,0.010,instrument,instrument that a person plays
4,German,P264,389,172.157,0.065,0.024,record label,brand and trademark associated with the market...
...,...,...,...,...,...,...,...,...
612,Czech,P103,208,-86.926,0.003,0.022,native language,language that a person learned natively
613,Slovenian,P1412,65,-87.183,0.001,0.008,languages spoken,language(s) that a person speaks. Primarily us...
614,English,P17,835,-87.568,0.004,0.032,country,sovereign state of this item
615,English,P103,915,-90.718,0.003,0.035,native language,language that a person learned natively


In [65]:
# Collapse the per-language table to one row per relation: the mean of the
# per-language percentage changes and the summed fact counts, largest mean first.
grouped_quant_error_analysis = (
    quant_error_analysis.groupby("relation")
    .agg({"percentage change": "mean", "num facts": "sum"})
    .reset_index()
    .rename(
        columns={
            "percentage change": "mean percentage change across langs",
            "num facts": "num facts across langs",
        }
    )
    .sort_values("mean percentage change across langs", ascending=False)
    .reset_index(drop=True)
)

# Attach the Wikidata title and description of each relation. The integer after
# the last "P" of the relation is looked up in the "ID" column; the first matching
# row wins and a missing ID raises KeyError.
wiki_first_rows = wiki_data.drop_duplicates(subset="ID").set_index("ID")
relation_ids = [
    int(relation.split("P")[-1])
    for relation in grouped_quant_error_analysis["relation"]
]
grouped_quant_error_analysis["Title"] = wiki_first_rows.loc[
    relation_ids, "Title"
].tolist()
grouped_quant_error_analysis["Description"] = wiki_first_rows.loc[
    relation_ids, "Description"
].tolist()

grouped_quant_error_analysis.to_csv(
    "../../data/error_analysis/relation_percent_change_aggregated_by_relation.csv"
)
grouped_quant_error_analysis

Unnamed: 0,relation,mean percentage change across langs,num facts across langs,Title,Description
0,P264,113.51705,9729,record label,brand and trademark associated with the market...
1,P101,76.44105,11031,field of work,"fields of work related to this item (physics, ..."
2,P39,55.84905,8364,position held,subject currently or formerly holds the object...
3,P449,49.0586,18147,original network,network(s) the radio or television show was or...
4,P136,44.37375,15273,genre,genre of a creative work or genre in which an ...
5,P1303,36.10595,5390,instrument,instrument that a person plays
6,P106,30.5725,13934,occupation,"occupation of a person; see also ""field of wor..."
7,P407,25.76575,6163,language,language of a work (not necessarily original l...
8,P138,21.891312,2987,named after,entity or event that inspired the subject's na...
9,P364,20.3578,12982,original language,language in which a work was originally created


In [66]:
# Keep the rows with a strictly positive percentage change, reduced to the
# language, relation title and percentage change columns.
columns_to_keep = ["language", "Title", "percentage change"]
has_positive_change = quant_error_analysis["percentage change"] > 0
filtered_df = quant_error_analysis.loc[has_positive_change, columns_to_keep]

In [67]:
# Define the minimum percentage change threshold
min_pct_change = 0

# Keep only the rows of filtered_df whose percentage change is greater than min_pct_change
filtered_df_lang = filtered_df[filtered_df["percentage change"] > min_pct_change]

# Group by relation title and language. Only the resulting (Title, language) index is
# used below; the mean is just a convenient aggregation to obtain it.
grouped_df = filtered_df_lang.groupby(["Title", "language"])["percentage change"].mean()

# For each relation title, print the languages whose percentage change is above the threshold
for relation in grouped_df.index.levels[0]:
    languages = list(grouped_df.loc[relation].index)
    # Quote the title on both sides (the original format string dropped the opening quote).
    print(f"'{relation}': {languages}")

continent': ['Bulgarian', 'Dutch', 'French', 'German', 'Portuguese', 'Romanian', 'Serbian', 'Swedish', 'Ukrainian']
country of origin': ['Catalan', 'Croatian', 'Czech', 'Danish', 'English', 'German', 'Italian', 'Romanian', 'Spanish', 'Ukrainian']
developer': ['Serbian']
employer': ['Bulgarian', 'Russian', 'Ukrainian']
field of work': ['Bulgarian', 'Catalan', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'French', 'German', 'Hungarian', 'Italian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Serbian', 'Slovenian', 'Spanish', 'Swedish', 'Ukrainian']
formation location': ['Czech', 'Romanian', 'Serbian', 'Spanish']
genre': ['Bulgarian', 'Catalan', 'Croatian', 'Czech', 'Danish', 'Dutch', 'English', 'French', 'German', 'Hungarian', 'Italian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Serbian', 'Spanish', 'Swedish', 'Ukrainian']
headquarters location': ['Croatian', 'Czech', 'Polish', 'Russian', 'Slovenian', 'Spanish', 'Ukrainian']
instrument': ['Bulgarian', 'Croatian', 'Czech', 

In [68]:
# Order the rows by language and then by percentage change, both descending,
# renumber them from zero, and write the result to CSV.
sort_keys = ["language", "percentage change"]
filtered_df = filtered_df.sort_values(by=sort_keys, ascending=False)
filtered_df = filtered_df.reset_index(drop=True)

output_path = (
    "../../data/error_analysis/relation_percent_change_canonical_above_0_percent.csv"
)
filtered_df.to_csv(output_path)
filtered_df

Unnamed: 0,language,Title,percentage change
0,Ukrainian,position played on team,122.208
1,Ukrainian,instrument,111.626
2,Ukrainian,record label,57.504
3,Ukrainian,sport,52.917
4,Ukrainian,field of work,46.763
...,...,...,...
270,Bulgarian,place of death,10.035
271,Bulgarian,position held,6.664
272,Bulgarian,original network,4.944
273,Bulgarian,named after,2.535
