# Evaluation results for DBpedia

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
def plot_macro_f1(languages: set, f1_scores: dict, x_distance, figsize):
    """Draw a grouped bar chart with one group of bars per language.

    Args:
        languages: Language codes used as x-axis tick labels. They are
            consumed in their own iteration order, so every score list in
            ``f1_scores`` has to follow that same order.
        f1_scores: Maps a series name (shown in the legend) to its bar
            heights, one value per language.
        x_distance: Distance between the first bars of neighbouring groups.
        figsize: Figure size forwarded to ``plt.subplots``.
    """
    bar_width = 0.4
    group_starts = np.arange(len(languages)) * x_distance

    fig, ax = plt.subplots(layout='constrained', figsize=figsize)

    # Each series is shifted right by one bar width per preceding series.
    for series_idx, (series_name, heights) in enumerate(f1_scores.items()):
        bars = ax.bar(group_starts + bar_width * series_idx, heights,
                      bar_width, label=series_name)
        ax.bar_label(bars, fmt='%.2f')

    # Put each tick in the middle of its group of bars.
    tick_positions = group_starts + bar_width * (len(f1_scores) - 1) / 2
    ax.set_xticks(tick_positions, languages)
    ax.set_ylabel('F1 Score')
    ax.set_title('F1 Scores by languages')
    ax.legend(loc='upper left', ncols=3)
    plt.show()


def get_lang_set(results):
    """Return the set of all values found in the "Language" column of any result.

    Args:
        results: Mapping from a result name to a DataFrame with a
            "Language" column.

    Returns:
        A set containing every language that occurs in at least one table;
        an empty set when ``results`` is empty.
    """
    all_languages = set()
    for frame in results.values():
        all_languages.update(frame["Language"].tolist())
    return all_languages


def get_f1_scores(results, languages):
    """Collect the "Macro F1" value of every language for each result table.

    Args:
        results: Mapping from a result name to a DataFrame that has
            "Language" and "Macro F1" columns.
        languages: Iterable of language codes; its iteration order
            determines the order of the returned scores.

    Returns:
        dict mapping each result name to a list of scores aligned with
        ``languages``. A language that is absent from a table gets 0; if a
        language occurs in several rows, the first row is used.

    Raises:
        KeyError: If a table lacks the "Language" or "Macro F1" column.
    """
    f1_scores = {}
    for name, result_df in results.items():
        scores = []
        for lang in languages:
            matches = result_df.loc[result_df["Language"] == lang, "Macro F1"]
            # Only "language not evaluated in this run" falls back to 0;
            # any other problem (e.g. a missing column) is allowed to
            # surface instead of silently being plotted as a zero bar.
            scores.append(matches.iloc[0] if not matches.empty else 0)
        f1_scores[name] = scores
    return f1_scores

def plot(results, x_distance=1, figsize=(8, 8)):
    """Plot the Macro F1 score of every result table, grouped by language.

    Args:
        results: Mapping from a result name to a DataFrame with "Language"
            and "Macro F1" columns.
        x_distance: Spacing factor between language groups on the x-axis.
        figsize: Figure size passed through to the plotting helper.
    """
    # Iterating a set of strings yields a different order from one
    # interpreter run to the next (hash randomisation), so the figure would
    # change after a kernel restart. Sorting fixes the x-axis order;
    # key=str keeps the sort working if a non-string value (e.g. NaN)
    # ends up in the set.
    languages = sorted(get_lang_set(results), key=str)
    f1_scores = get_f1_scores(results, languages)
    plot_macro_f1(languages, f1_scores, x_distance, figsize)

## Baseline

First, as our baseline, we fine-tuned only the mT5-base model on qald-9-plus with DBpedia SPARQL queries for 100 epochs.

In [4]:
# GERBIL experiment behind this file:
# https://gerbil-qa.aksw.org/gerbil/experiment?id=202306270001
BASELINE_CSV = "../gerbil_results/mt5-base-qald9-dbpedia.csv"
baseline = pd.read_csv(BASELINE_CSV)
baseline

Unnamed: 0,Language,Micro F1,Micro Precision,Micro Recall,Macro F1,Macro Precision,Macro Recall,Macro F1 QALD
0,ba,0.0633,0.1472,0.0404,0.2901,0.293,0.2997,0.4436
1,be,0.1366,0.3255,0.0864,0.367,0.3652,0.3813,0.5045
2,de,0.0439,0.0288,0.0922,0.3727,0.3709,0.3887,0.5183
3,en,0.0313,0.0204,0.068,0.3352,0.3333,0.3534,0.4864
4,fr,0.0345,0.4423,0.018,0.2558,0.2564,0.2553,0.4036
5,lt,0.039,0.0255,0.0831,0.3435,0.3419,0.36,0.5007
6,ru,0.0336,0.0221,0.07,0.3484,0.3465,0.3697,0.5043
7,uk,0.0695,0.0641,0.076,0.3748,0.3725,0.3935,0.5244


## Experiment 10

Based on the experience from our Wikidata experiments, we first pre-trained on LC-QuAD 1.0 and then fine-tuned on qald-9-plus with DBpedia SPARQL queries.

Since our entity linking tool for DBpedia only works for `en`, `de`, and `fr`, we only evaluated our model on these languages. 

In [None]:
# GERBIL experiment behind this file:
# https://gerbil-qa.aksw.org/gerbil/experiment?id=202306290000
# NOTE(review): the section heading says "Experiment 10" but the variable is
# named exp9 — confirm which number is intended.
EXP9_CSV = "../gerbil_results/mt5-lcquad-ling-qald9.csv"
exp9 = pd.read_csv(EXP9_CSV)
exp9