## Imports

In [1]:
import pandas as pd
import glob
import datasets
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import chi2_contingency
from scipy.stats import chi2

## Load Data

In [2]:
# Wikidata property list; only the columns used by this analysis are kept.
wikidata_columns = ["Title", "ID", "Datatype", "Description"]
wiki_data = pd.read_csv("../../data/wikidata/wikidata-property-list.csv").loc[
    :, wikidata_columns
]

In [3]:
# Two-letter language code -> English language name for the 20 languages
# covered by this analysis. The inverse mapping is derived from this dict,
# so language names must stay unique.
code_to_lang_dict = {
    "bg": "Bulgarian",
    "ca": "Catalan",
    "cs": "Czech",
    "da": "Danish",
    "de": "German",
    "en": "English",
    "es": "Spanish",
    "fr": "French",
    "hr": "Croatian",
    "hu": "Hungarian",
    "it": "Italian",
    "nl": "Dutch",
    "pl": "Polish",
    "pt": "Portuguese",
    "ro": "Romanian",
    "ru": "Russian",
    "sl": "Slovenian",
    "sr": "Serbian",
    "sv": "Swedish",
    "uk": "Ukrainian",
}

In [4]:
# Inverse lookup: English language name -> two-letter code.
lang_to_code_dict = dict(zip(code_to_lang_dict.values(), code_to_lang_dict.keys()))

In [5]:
# Accumulator for per-language / per-relation summary figures (not filled in this cell).
results_dict = {
    "language": [],
    "relation": [],
    "percentage change": [],
    "new ratio of rows": [],
    "old ratio of rows": [],
}

hf_df = datasets.load_dataset("CalibraGPT/Fact-Completion")
# NOTE: glob returns files in filesystem order, so the analysis_id values
# assigned below depend on that order.
file_names = glob.glob("../../data/result_logs/llama-30b/error-analysis/*.csv")

# Wikidata numeric property id -> title. Keeping the first title per id mirrors
# "take the first matching row" and makes the per-row lookup a single map().
relation_title_by_id = wiki_data.drop_duplicates(subset="ID").set_index("ID")["Title"]

# One frame per language: the full HF split, annotated with whether the model
# answered each row correctly (i.e. the row's dataset_id is absent from that
# language's error log).
results_dfs = []
count = 0
for file in file_names:
    # The language name is the last "-"-separated token of the file stem.
    language = file.split(".csv")[0].split("-")[-1].capitalize()
    error_df = pd.read_csv(file)
    full_hf_df = hf_df[language].to_pandas()
    n_rows = full_hf_df.shape[0]

    # dataset_id is present in both frames; a row is wrong iff its id is in the error log.
    error_ids = set(error_df["dataset_id"])
    is_correct = ~full_hf_df["dataset_id"].isin(error_ids)

    # relation looks like "P36"; strip the leading letter to get the numeric Wikidata id.
    relation_ids = full_hf_df["relation"].str[1:].astype(int)
    unknown = ~relation_ids.isin(relation_title_by_id.index)
    if unknown.any():
        # Same exception type the row-by-row lookup produced, with a usable message.
        raise IndexError(
            f"relation ids missing from wiki_data: {sorted(set(relation_ids[unknown]))}"
        )
    relation_titles = relation_ids.map(relation_title_by_id)

    # append result to full df
    full_hf_df["correct"] = is_correct
    # append language to full df
    full_hf_df["language"] = language
    # append language code to full df
    lang_code = lang_to_code_dict[language]
    full_hf_df["lang_code"] = lang_code
    # append relation title to full df
    full_hf_df["relation_title"] = relation_titles
    # also append an arbitrary id to have unique val for each row
    # (1-based and continuous across all languages)
    full_hf_df["analysis_id"] = np.arange(count + 1, count + n_rows + 1)
    count += n_rows

    results_dfs.append(full_hf_df)

results_df = pd.concat(results_dfs)
assert results_df.shape[0] == count

Found cached dataset parquet (/Users/tim/.cache/huggingface/datasets/CalibraGPT___parquet/CalibraGPT--Fact-Completion-24a24a1e4bf6e4a8/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/20 [00:00<?, ?it/s]

## More Cleanup to Ensure that we have access to Subjects across langs

In [7]:
# mapping between dataset id and the english form of a subject
# mapping between dataset id and the english form of a subject
english_rows = results_df.loc[
    results_df["language"] == "English", ["dataset_id", "subject"]
]
# keep="first": an id that appears more than once keeps the subject of its first row
english_rows = english_rows.drop_duplicates(subset="dataset_id", keep="first")
dataset_id_to_eng_subject = dict(
    zip(english_rows["dataset_id"], english_rows["subject"])
)

In [8]:
# put in an id and get the english subject back
dataset_id_to_eng_subject["rome_21844"]

'Megan Rapinoe'

In [9]:
# Per-entity tallies, keyed by the English form of the subject:
#   correct / incorrect -> number of rows answered right / wrong
#   langs               -> lang_code -> number of rows in that language
#   alternate_forms     -> lang_code -> subject string as written in that language
#   dataset_ids         -> set of dataset ids in which the entity is the subject
entities = {}
tracked_columns = results_df[["subject", "correct", "dataset_id", "lang_code"]]
for subject, val, dataset_id, lang in tracked_columns.itertuples(index=False, name=None):
    # the english version of the subject is the grouping key
    english_subject = dataset_id_to_eng_subject[dataset_id]

    # first sighting of this entity: start an empty record
    record = entities.setdefault(
        english_subject,
        {
            "correct": 0,
            "incorrect": 0,
            "langs": {},
            "alternate_forms": {},
            "dataset_ids": set(),
        },
    )

    # counter of correct/incorrect for that subject
    record["correct" if val else "incorrect"] += 1

    # track language usage
    record["langs"][lang] = record["langs"].get(lang, 0) + 1

    # track any alternate forms (the last row seen for a language wins)
    record["alternate_forms"][lang] = subject

    record["dataset_ids"].add(dataset_id)

In [10]:
# Flatten the per-entity records into parallel lists, one list per output column.
entity_records = list(entities.values())

entity_names = list(entities.keys())
# number of times the entity is used in a correct / incorrect statement, and accuracy
correct = [record["correct"] for record in entity_records]
incorrect = [record["incorrect"] for record in entity_records]
total = [int(record["correct"]) + int(record["incorrect"]) for record in entity_records]
pct = [
    int(record["correct"]) / (int(record["correct"]) + int(record["incorrect"]))
    for record in entity_records
]
# languages the entity is used in (per-language counts, and how many languages)
langs = [record["langs"] for record in entity_records]
num_langs = [len(record["langs"]) for record in entity_records]
alternate_forms = [record["alternate_forms"] for record in entity_records]
# dataset ids the entity is used in
dataset_ids = [list(record["dataset_ids"]) for record in entity_records]

# sanity check: per-language counts must add up to correct + incorrect
for record in entity_records:
    assert int(record["correct"]) + int(record["incorrect"]) == sum(
        record["langs"].values()
    )

In [11]:
# the average entity appears in ~12 langs
# (remember that this will max out at 20.)
np.mean(num_langs)

11.867738745323988

In [12]:
# One row per entity (keyed by the English form of the subject).
# "languages" and "alternate_forms" hold dicts keyed by language code and
# "dataset_ids" holds a list, so these are object columns.
# "percent_accuracy" is a fraction in [0, 1] (num_correct / total_usages), not 0-100.
entity_analysis_df = pd.DataFrame(
    {
        "entity": entity_names,
        "num_correct": correct,
        "num_incorrect": incorrect,
        "total_usages": total,
        "percent_accuracy": pct,
        "languages": langs,
        "num_languages": num_langs,
        "alternate_forms": alternate_forms,
        "dataset_ids": dataset_ids,
    }
)

In [22]:
# Persist the per-entity table. orient="index" writes one JSON object per row,
# keyed by the DataFrame index label.
entity_analysis_df.to_json(
    "../../data/error_analysis/entity_analysis_language_and_accuracy_by_entity.json",
    orient="index"
)

## Usage Reports

In [14]:
def get_lang_usage_report_for_entity(entity_val):
    """Return per-language correct/incorrect counts for one entity.

    Parameters
    ----------
    entity_val : str
        Entity name as stored in ``entity_analysis_df["entity"]`` (the English
        form of the subject).

    Returns
    -------
    dict
        ``{language name: {"correct": int, "incorrect": int}}`` with one key per
        language the entity appears in.

    Raises
    ------
    IndexError
        If ``entity_val`` is not present in ``entity_analysis_df``.
    """
    matches = entity_analysis_df[entity_analysis_df["entity"] == entity_val]
    if matches.empty:
        raise IndexError(f"entity {entity_val!r} not found in entity_analysis_df")
    entity_row = matches.iloc[0]

    # Start every language the entity appears in at zero.
    usage_dict = {
        code_to_lang_dict[code]: {"correct": 0, "incorrect": 0}
        for code in entity_row["languages"]
    }

    # ok, how many of those usages are correct vs not?
    # One pass over the rows of all the entity's dataset ids, rather than one
    # full scan of results_df per id.
    entity_rows = results_df[results_df["dataset_id"].isin(entity_row["dataset_ids"])]
    for lang_used, was_correct in zip(entity_rows["language"], entity_rows["correct"]):
        usage_dict[lang_used]["correct" if was_correct else "incorrect"] += 1

    return usage_dict

In [15]:
def get_percent_correct_from_usage_report(usage_report):
    """Overall accuracy, as a percentage rounded to two decimals, for a usage report.

    ``usage_report`` maps a language name to a dict with ``"correct"`` and
    ``"incorrect"`` counts; the counts are pooled over all languages.
    """
    tallies = list(usage_report.values())
    n_correct = sum(tally["correct"] for tally in tallies)
    n_incorrect = sum(tally["incorrect"] for tally in tallies)
    fraction_correct = n_correct / (n_correct + n_incorrect)
    return np.round(fraction_correct * 100, 2)

In [16]:
kerala_usage = get_lang_usage_report_for_entity("Kerala")
kerala_usage

{'Serbian': {'correct': 1, 'incorrect': 0},
 'Ukrainian': {'correct': 3, 'incorrect': 0},
 'Dutch': {'correct': 5, 'incorrect': 1},
 'Swedish': {'correct': 4, 'incorrect': 0},
 'Catalan': {'correct': 5, 'incorrect': 1},
 'Polish': {'correct': 3, 'incorrect': 0},
 'Bulgarian': {'correct': 5, 'incorrect': 0},
 'Slovenian': {'correct': 2, 'incorrect': 0},
 'English': {'correct': 6, 'incorrect': 0},
 'German': {'correct': 6, 'incorrect': 0},
 'Portuguese': {'correct': 6, 'incorrect': 0},
 'Czech': {'correct': 3, 'incorrect': 0},
 'Spanish': {'correct': 3, 'incorrect': 1},
 'Croatian': {'correct': 1, 'incorrect': 0},
 'Danish': {'correct': 6, 'incorrect': 0},
 'French': {'correct': 5, 'incorrect': 0},
 'Italian': {'correct': 5, 'incorrect': 0},
 'Romanian': {'correct': 5, 'incorrect': 0}}

In [17]:
get_percent_correct_from_usage_report(kerala_usage)

96.1

In [18]:
def get_rows_by_entity(entity_val, outcome=False):
    """Rows of ``results_df`` about ``entity_val`` whose ``correct`` flag equals ``outcome``.

    With the default ``outcome=False`` this returns the statements about the
    entity that the model got wrong.
    """
    is_entity = entity_analysis_df["entity"] == entity_val
    ids = entity_analysis_df.loc[is_entity, "dataset_ids"].tolist()[0]

    has_entity_id = results_df["dataset_id"].isin(ids)
    has_outcome = results_df["correct"] == outcome
    return results_df[has_entity_id & has_outcome]

In [19]:
get_rows_by_entity("Kerala")

Unnamed: 0,dataset_id,stem,true,false,relation,subject,object,correct,language,lang_code,relation_title,analysis_id
11211,rome_16815,In Kerala verstaan ​​ze <br> In Kerala begrijp...,Engels,Fins,P37,Kerala,Engels,False,Dutch,nl,official language,24556
9395,rome_16815,"A Kerala, entenen <br> A Kerala, entenen el",Anglès,finès,P37,Kerala,Anglès,False,Catalan,ca,official language,71556
9320,rome_16815,En Kerala entienden,Inglés,finlandés,P37,Kerala,Inglés,False,Spanish,es,official language,206544


## Significance Testing

* use chi squared to see: 
 
    * if the difference in the number of correctly and incorrectly classified statements between language scripts is statistically significant
       * yes
    * if the number of correctly and incorrectly classified statements for each language group is statistically significant
        * yes
        
    * if the difference in the number of correctly and incorrectly classified locations between western and eastern locales is statistically significant
    * if the difference in the number of correctly and incorrectly classified entities between women and men is statistically significant

### language groups and script

Romance languages: Catalan, French, Italian, Portuguese, Romanian, Spanish
Germanic languages: Danish, Dutch, English, German, Swedish
Slavic languages: Bulgarian, Czech, Croatian, Polish, Russian, Serbian, Slovenian, Ukrainian
Hungarian: a Uralic language, not related to any of the other languages in the list.

In [20]:
# 2 x 2
lang_to_script_dict = {
    "bg": "Cyrillic",
    "ca": "Latin",
    "cs": "Latin",
    "da": "Latin",
    "de": "Latin",
    "en": "Latin",
    "es": "Latin",
    "fr": "Latin",
    "hr": "Latin",
    "hu": "Latin",
    "it": "Latin",
    "nl": "Latin",
    "pl": "Latin",
    "pt": "Latin",
    "ro": "Latin",
    "ru": "Cyrillic",
    "sl": "Latin",
    "sr": "Cyrillic",
    "sv": "Latin",
    "uk": "Cyrillic",
}

# 2 x 4
lang_to_group_dict = {
    "bg": "Slavic",
    "ca": "Romance",
    "cs": "Slavic",
    "da": "Germanic",
    "de": "Germanic",
    "en": "Germanic",
    "es": "Romance",
    "fr": "Romance",
    "hr": "Slavic",
    "hu": "Uralic",
    "it": "Romance",
    "nl": "Germanic",
    "pl": "Slavic",
    "pt": "Romance",
    "ro": "Romance",
    "ru": "Slavic",
    "sl": "Slavic",
    "sr": "Slavic",
    "sv": "Germanic",
    "uk": "Slavic",
}

In [21]:
# now, for each of these levels, we need:
# number correct
# number incorrect
# total..
results_df.head()

Unnamed: 0,dataset_id,stem,true,false,relation,subject,object,correct,language,lang_code,relation_title,analysis_id
0,calinet_8922,Приус производи,Тоиота,Хонда,P176,Приус,Тоиота,True,Serbian,sr,manufacturer,1
1,rome_5025,Сундар Пицхаи ради за,Гоогле,Аппле,P108,Сундар Пицхаи,Гоогле,False,Serbian,sr,employer,2
2,rome_21333,"Главни град Народне Републике Кине,",Пекинг,Кабул,P36,Народна Република Кина,Пекинг,True,Serbian,sr,capital,3
3,rome_8738,У Синт Мартену разумеју,холандски,дански,P37,Синт Маартен,холандски,True,Serbian,sr,official language,4
4,rome_8783,Хаас Хоусе се налази у месту,Беч,Алберта,P131,Хаас Хоусе,Беч,True,Serbian,sr,is in the administrative territorial entity,5


In [28]:
# scripts
cyrillic = {'correct': 0, 'incorrect': 0}
latin = {'correct': 0, 'incorrect': 0}

# language groups
germanic = {'correct': 0, 'incorrect': 0}
romance = {'correct': 0, 'incorrect': 0}
slavic = {'correct': 0, 'incorrect': 0}
uralic = {'correct': 0, 'incorrect': 0}

for row in results_df.iterrows():
        
    lang_code = row[1].lang_code
    result = row[1].correct
    mapping = ''
    
    script = lang_to_script_dict[lang_code]
    group = lang_to_group_dict[lang_code]
    
    if result:
        mapping = 'correct'
    else:
        mapping = 'incorrect'
    
    # language scripts
    if script == 'Cyrillic':
        cyrillic[mapping] +=1
    elif script == 'Latin':
        latin[mapping] +=1
        
    # language groups
    if group == 'Germanic':
        germanic[mapping] +=1
    elif group == 'Romance':
        romance[mapping] +=1
    elif group == 'Slavic':
        slavic[mapping] +=1
    elif group == 'Uralic':
        uralic[mapping] +=1
    
print(f"cyrllic: {cyrillic}")
print(f"latin: {latin}")

print(f"germanic: {germanic}")
print(f"romance: {romance}")
print(f"slavic: {slavic}")
print(f"uralic: {uralic}") # sanity check -> this is 75.7% correct which is the same as the llama graph result for HU.

# sanity check identical output sizes
assert(sum(cyrillic.values()) + sum(latin.values()) == results_df.shape[0])
assert(sum(germanic.values()) + sum(romance.values()) + sum(slavic.values()) + sum(uralic.values()) == results_df.shape[0])

cyrllic: {'correct': 26248, 'incorrect': 10962}
latin: {'correct': 221567, 'incorrect': 44366}
germanic: {'correct': 93963, 'incorrect': 16109}
romance: {'correct': 97762, 'incorrect': 19307}
slavic: {'correct': 52568, 'incorrect': 18784}
uralic: {'correct': 3522, 'incorrect': 1128}


In [109]:
def chi_squared(category_dicts, flag_one, flag_two, category_explainer, alpha=.05):
    """Run a chi-squared test of independence and print the verdict.

    Builds a 2 x k contingency table (row 0 = ``flag_one`` counts, row 1 =
    ``flag_two`` counts, one column per entry of ``category_dicts``) and passes
    it to ``scipy.stats.chi2_contingency`` with its default settings.

    Parameters
    ----------
    category_dicts : list of dict
        One dict of counts per category level, e.g. ``[cyrillic, latin]``.
    flag_one, flag_two : str
        Keys to read from each dict, e.g. ``'correct'`` and ``'incorrect'``.
    category_explainer : str
        Human-readable name of the category, used in the printed report.
    alpha : float, optional
        Significance level; the null hypothesis is rejected when ``p <= alpha``.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    top_vals = [counts[flag_one] for counts in category_dicts]
    bottom_vals = [counts[flag_two] for counts in category_dicts]
    table = np.array([top_vals, bottom_vals])

    stat, p, dof, expected = chi2_contingency(table)

    # A non-significant result does not establish independence, so the null
    # hypothesis is never "accepted": we only fail to reject it.
    verdict = 'can REJECT' if p <= alpha else 'FAIL TO REJECT'

    # Very small p-values are reported as a bound instead of a long float.
    p_display = "< .001" if p < .001 else p

    print(f"For {category_explainer}, we see a chi-squared value of {stat} and a p-value of {p_display}.")
    print(f"We {verdict} the null hypothesis that {category_explainer} is independent from performance on the CKA assessment.")

In [110]:
chi_squared([cyrillic, latin], 'correct', 'incorrect', 'language script')

For language script, we see a chi-squared value of 3570.576274105528 and a p-value of < .001.
We can REJECT the null hypothesis that language script is independent from performance on the CKA assessment.


In [111]:
chi_squared([germanic, romance, slavic, uralic], 'correct', 'incorrect', 'language family')

For language family, we see a chi-squared value of 4438.005771880503 and a p-value of < .001.
We can REJECT the null hypothesis that language family is independent from performance on the CKA assessment.


### Western vs Eastern locations

In [166]:
# results_df[(results_df['relation'] == 'P127') & (results_df['lang_code'] == 'en')].tail()

In [230]:
# Relation title -> Wikidata property id for the relations treated as geographic.
geo_relations = dict(
    [
        ('capital', 'P36'),
        ('country', 'P17'),
        ('continent', 'P30'),
        ('capital of', 'P1376'),
        ('is in the administrative territorial entity', 'P131'),
        ('shares border with', 'P47'),
    ]
)

In [231]:
geo_df = results_df[results_df['relation'].isin(list(geo_relations.values()))]

In [232]:
geo_df.shape

(44297, 12)

In [233]:
# Entities that are the subject of at least one geographic statement.
# geo_entities maps entity -> [num_correct, num_incorrect, [geo dataset ids]].
# NOTE: num_correct / num_incorrect are the entity's totals over ALL of its
# statements (taken from entity_analysis_df), not only the geographic ones.
c = 0  # number of entity rows scanned
geo_entities = {}
# Build the membership set once; converting the column to a list inside the
# loop made every single id check a full scan of geo_df.
geo_dataset_ids = set(geo_df['dataset_id'])
for _, entity_row in entity_analysis_df.iterrows():
    # Local name chosen so the module-level `dataset_ids` list is not overwritten.
    for d in entity_row.dataset_ids:
        if d in geo_dataset_ids:
            entity = entity_row.entity
            if entity not in geo_entities:
                geo_entities[entity] = [entity_row.num_correct, entity_row.num_incorrect, [d]]
            else:
                geo_entities[entity][2].append(d)

    c += 1

In [234]:
len(geo_entities)

3247

In [241]:
for k, v in geo_entities.items():
    print(k, v)
    break

People's Republic of China [17, 0, ['rome_21333']]


In [242]:
# work with gazetteer