## Imports

In [1]:
import pandas as pd
import glob
import datasets
import numpy as np
import matplotlib.pyplot as plt
import os
import json

from scipy.stats import chi2_contingency
from scipy.stats import chi2

import time

from geopy.geocoders import Nominatim
from pycountry_convert import country_alpha2_to_continent_code

## Load Data

In [2]:
wiki_data = pd.read_csv("../../data/wikidata/wikidata-property-list.csv")
wiki_data = wiki_data[["Title", "ID", "Datatype", "Description"]]

In [3]:
code_to_lang_dict = {
    "bg": "Bulgarian",
    "ca": "Catalan",
    "cs": "Czech",
    "da": "Danish",
    "de": "German",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "hr": "Croatian",
    "hu": "Hungarian",
    "it": "Italian",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sl": "Slovenian",
    "sr": "Serbian",
    "sv": "Swedish",
    "uk": "Ukrainian",
}

In [4]:
lang_to_code_dict = {v: k for k, v in code_to_lang_dict.items()}

In [5]:
results_dict = {}
results_dict["language"] = []
results_dict["relation"] = []
results_dict["percentage change"] = []
results_dict["new ratio of rows"] = []
results_dict["old ratio of rows"] = []

hf_df = datasets.load_dataset("CalibraGPT/Fact-Completion")
file_names = glob.glob("../../data/result_logs/llama-30b/error-analysis/*.csv")

# confirm grabbing data correctly against LLaMa figure
# uncomment print statement at end of for loop to see
results_dfs = []
count = 0
for file in file_names:
    language = file.split(".csv")[0].split("-")[-1].capitalize()
    error_df = pd.read_csv(file)
    full_hf_df = hf_df[file.split(".csv")[0].split("-")[-1].capitalize()]
    full_hf_df = full_hf_df.to_pandas()

    # stem is in both
    # dataset id is in both
    # to see if the model got something wrong, see if the dataset id in the full df is in the error
    error_ids = list(error_df["dataset_id"])
    correct = []
    counts = []
    relation_names = []
    for row in full_hf_df.iterrows():
        # track counts
        count += 1
        counts.append(count)
        # track errors
        correct.append(False) if row[1]["dataset_id"] in error_ids else correct.append(
            True
        )
        # track relation titles
        relation_id = int(row[1].relation[1:])
        relation_title = list(wiki_data[wiki_data["ID"] == relation_id]["Title"])[0]
        relation_names.append(relation_title)

    # append result to full df
    full_hf_df["correct"] = correct
    # append language to full df
    full_hf_df["language"] = [language] * full_hf_df.shape[0]
    # append language code to full df
    lang_code = lang_to_code_dict[language]
    full_hf_df["lang_code"] = [lang_code] * full_hf_df.shape[0]
    # append relation title to full df
    full_hf_df["relation_title"] = relation_names
    # also append an arbitrary id to have unique val for each row
    full_hf_df["analysis_id"] = counts

    results_dfs.append(full_hf_df)

results_df = pd.concat(results_dfs)
assert results_df.shape[0] == count

Found cached dataset parquet (/Users/tim/.cache/huggingface/datasets/CalibraGPT___parquet/CalibraGPT--Fact-Completion-24a24a1e4bf6e4a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/20 [00:00<?, ?it/s]

## More Cleanup to Ensure that we have access to Subjects across langs

In [6]:
# mapping between dataset id and the english form of a subject
dataset_id_to_eng_subject = {}
for row in results_df.iterrows():
    if row[1].language == "English":
        if row[1].dataset_id not in dataset_id_to_eng_subject:
            dataset_id_to_eng_subject[row[1].dataset_id] = row[1].subject

In [7]:
entities = {}
for row in results_df.iterrows():
    # gather helpful row level data
    # the subject
    subject = row[1].subject
    # whether the model got it right
    val = row[1].correct
    # the dataset id
    dataset_id = row[1].dataset_id
    # the english version of the subject
    english_subject = dataset_id_to_eng_subject[dataset_id]

    # commit it to our tracking dict
    if english_subject not in entities:
        entities[english_subject] = {
            "correct": 0,
            "incorrect": 0,
            "langs": {},
            "alternate_forms": {},
            "dataset_ids": set(),
        }

    # counter of correct/incorrect for that subject
    if val:
        entities[english_subject]["correct"] += 1
    else:
        entities[english_subject]["incorrect"] += 1

    # track language
    lang = row[1].lang_code

    if lang not in entities[english_subject]["langs"]:
        entities[english_subject]["langs"][lang] = 1

    else:
        entities[english_subject]["langs"][lang] += 1

    # track any alternate forms
    entities[english_subject]["alternate_forms"][lang] = subject

    entities[english_subject]["dataset_ids"].add(dataset_id)

In [8]:
entity_names = []
correct = []
incorrect = []
total = []
pct = []
langs = []
num_langs = []
alternate_forms = []
dataset_ids = []
for k, v in entities.items():
    entity_names.append(k)
    # track # of times entity is used in a correct statement, incorrect, and pct accuracy
    correct.append(v["correct"])
    incorrect.append(v["incorrect"])
    total.append(int(v["correct"]) + int(v["incorrect"]))
    pct.append(int(v["correct"]) / (int(v["correct"]) + int(v["incorrect"])))
    # track # of languages the entity is used in
    langs.append(v["langs"])
    num_langs.append(len(v["langs"]))
    alternate_forms.append(v["alternate_forms"])
    # track dataset ids its used in
    dataset_ids.append(list(v["dataset_ids"]))
    # sanity check
    assert int(v["correct"]) + int(v["incorrect"]) == sum(v["langs"].values())

In [9]:
# the average entity appears in ~12 langs
# (remember that this will max out at 20.)
np.mean(num_langs)

11.867738745323988

In [10]:
entity_analysis_df = pd.DataFrame(
    {
        "entity": entity_names,
        "num_correct": correct,
        "num_incorrect": incorrect,
        "total_usages": total,
        "percent_accuracy": pct,
        "languages": langs,
        "num_languages": num_langs,
        "alternate_forms": alternate_forms,
        "dataset_ids": dataset_ids,
    }
)

## Get Geo entities

In [11]:
geo_relations = {
'capital': 'P36',
'country': 'P17',
'continent': 'P30',
'capital of': 'P1376',
'is in the administrative territorial entity': 'P131',
'shares border with': 'P47'}

In [12]:
geo_df = results_df[results_df['relation'].isin(list(geo_relations.values()))]

In [13]:
geo_df.shape

(44297, 12)

In [14]:
geo_entities = {}
for row in entity_analysis_df.iterrows():
    dataset_ids = list(row[1].dataset_ids)
    for d in dataset_ids:
        if d in list(geo_df['dataset_id']):
            entity = row[1].entity
            if entity not in geo_entities:
                geo_entities[row[1].entity] = [row[1].num_correct, row[1].num_incorrect, [d]]
            else:
                geo_entities[row[1].entity][2].append(d)

In [78]:
len(geo_entities)

3247

In [19]:
with open("../../data/wikidata/full_geo_entities.json", "w") as outfile:
    json.dump(geo_entities, outfile)

## Get people entities

In [27]:
results_df.head()

Unnamed: 0,dataset_id,stem,true,false,relation,subject,object,correct,language,lang_code,relation_title,analysis_id
0,calinet_8922,Приус производи,Тоиота,Хонда,P176,Приус,Тоиота,True,Serbian,sr,manufacturer,1
1,rome_5025,Сундар Пицхаи ради за,Гоогле,Аппле,P108,Сундар Пицхаи,Гоогле,False,Serbian,sr,employer,2
2,rome_21333,"Главни град Народне Републике Кине,",Пекинг,Кабул,P36,Народна Република Кина,Пекинг,True,Serbian,sr,capital,3
3,rome_8738,У Синт Мартену разумеју,холандски,дански,P37,Синт Маартен,холандски,True,Serbian,sr,official language,4
4,rome_8783,Хаас Хоусе се налази у месту,Беч,Алберта,P131,Хаас Хоусе,Беч,True,Serbian,sr,is in the administrative territorial entity,5


In [29]:
wiki_data.head()

Unnamed: 0,Title,ID,Datatype,Description
0,head of government,6,item,"head of the executive power of a town, city, m..."
1,brother,7,item,subject has the object as their brother
2,sister,9,item,subject has the object as their sister (female...
3,video,10,Commons file,relevant video
4,highway marker,14,Commons file,graphic representing the highway


In [38]:
wiki_data[wiki_data['ID'] == 740]['Title']

546    formation location
Name: Title, dtype: object

In [43]:
ids = set(list(results_df['relation']))
dataset_ids_to_titles = {}

for i in ids:
    formatted_i = int(i[1:])
    
    wiki_data_title = list(wiki_data[wiki_data['ID'] == formatted_i]['Title'])[0]
    dataset_ids_to_titles[i] = wiki_data_title

print(dataset_ids_to_titles)

{'P740': 'formation location', 'P20': 'place of death', 'P17': 'country', 'P131': 'is in the administrative territorial entity', 'P1303': 'instrument', 'P276': 'located in', 'P159': 'headquarters location', 'P19': 'place of birth', 'P127': 'owned by', 'P47': 'shares border with', 'P1376': 'capital of', 'P108': 'employer', 'P264': 'record label', 'P103': 'native language', 'P136': 'genre', 'P37': 'official language', 'P364': 'original language', 'P449': 'original network', 'P1412': 'languages spoken', 'P138': 'named after', 'P39': 'position held', 'P413': 'position played on team', 'P30': 'continent', 'P937': 'work location', 'P641': 'sport', 'P36': 'capital', 'P176': 'manufacturer', 'P178': 'developer', 'P106': 'occupation', 'P495': 'country of origin', 'P101': 'field of work', 'P407': 'language'}


In [73]:
people_relations = {'P20': 'place of death',
'P1303': 'instrument',
'P108': 'employer',
'P103': 'native language',
'P39': 'position held',
'P413': 'position played on team',
'P937': 'work location',
'P641': 'sport',
'P106': 'occupation',
'P101': 'field of work'}

In [74]:
people_df = results_df[results_df['relation'].isin(list(people_relations.keys()))]

In [75]:
people_df.shape

(90968, 12)

In [79]:
len(people_entities)

7905

In [80]:
with open("../../data/wikidata/full_people_entities.json", "w") as outfile:
    json.dump(people_entities, outfile)