# Result Analysis
Use this notebook to analyse the results of a knowledge graph (KG) evaluation with KGrEaT. The tables below give an overview of the performance metrics per KG as well as the mapping coverage per dataset.

## Load and Merge Results

In [None]:
# Specify where the results of each KG live.
# Each key becomes the `kg_name` shown in the tables below; the numeric prefix
# determines the column order because the tables are sorted by `kg_name`.
_KG_RESULT_ROOTS = {
    '0_DBpedia2016': './kg/dbpedia2016/result',
    '1_DBpedia2022': './kg/dbpedia2022/result',
    '2_YAGO': './kg/yago3/result',
    '3_Wikidata': './kg/wikidata/result',
    '4_CaLiGraph': './kg/caligraph/result',
    '5_DbkWik': './kg/dbkwik/result',
}

# Results of the `run_HR` runs, one directory per KG.
PATH_TO_HR_RESULTS = {
    kg_label: f'{result_root}/run_HR'
    for kg_label, result_root in _KG_RESULT_ROOTS.items()
}

# Results of the `run_HP` runs, one directory per KG.
PATH_TO_HP_RESULTS = {
    kg_label: f'{result_root}/run_HP'
    for kg_label, result_root in _KG_RESULT_ROOTS.items()
}

In [None]:
import pandas as pd
from pathlib import Path
pd.set_option('display.max_rows', 100)

def load_kg_results(kg_name: str, path_to_results: str) -> pd.DataFrame:
    """Load all result files of a single KG into one DataFrame.

    Every ``*.tsv`` file found recursively below ``path_to_results`` is read
    as a tab-separated table; the tables are stacked row-wise and tagged with
    the name of the KG.

    Args:
        kg_name: Label stored in the ``kg_name`` column of every row.
        path_to_results: Directory that is searched recursively for TSV files.

    Returns:
        The concatenated results with a fresh 0..n-1 index and an additional
        ``kg_name`` column.

    Raises:
        ValueError: If no ``*.tsv`` file exists below ``path_to_results``.
    """
    result_dir = Path(path_to_results)
    # rglob yields files in filesystem order; sort so the row order is reproducible.
    result_files = sorted(result_dir.rglob('*.tsv'))
    if not result_files:
        # pd.concat would fail with an opaque "No objects to concatenate" here.
        raise ValueError(f"No .tsv result files found for KG '{kg_name}' in '{result_dir}'")
    df = pd.concat([pd.read_csv(f, sep='\t') for f in result_files], ignore_index=True)
    df['kg_name'] = kg_name
    return df

def merge_kg_results(path_to_results) -> pd.DataFrame:
    """Load and stack the results of all KGs whose result path exists.

    Args:
        path_to_results: Mapping from KG name to the directory holding the
            results of that KG. Entries whose directory does not exist are
            skipped, so only the KGs that have been evaluated are merged.

    Returns:
        One DataFrame with the rows of all loaded KGs and a fresh 0..n-1 index.

    Raises:
        ValueError: If none of the configured result paths exists.
    """
    frames = [
        load_kg_results(kg_name, result_path)
        for kg_name, result_path in path_to_results.items()
        if Path(result_path).exists()
    ]
    if not frames:
        # pd.concat would fail with an opaque "No objects to concatenate" here.
        raise ValueError(
            f'None of the configured result paths exist: {list(path_to_results.values())}'
        )
    # ignore_index avoids duplicate row labels across the per-KG frames.
    return pd.concat(frames, ignore_index=True)

# HP runs: keep both entity modes and relabel them. The numeric prefixes define
# the column order in the tables below, which sort by `entity_mode`.
# Note: any entity_mode other than 'ALL'/'KNOWN' is mapped to NaN by `.map`.
df_hp = merge_kg_results(PATH_TO_HP_RESULTS)
df_hp['entity_mode'] = df_hp['entity_mode'].map({'ALL': '1_PA', 'KNOWN': '0_PK'})

# HR runs: keep only the 'ALL' entity mode and label it '2_R'.
df_hr = merge_kg_results(PATH_TO_HR_RESULTS)
# .copy() makes the filtered frame independent of the merged one, so the
# assignment below does not trigger pandas' SettingWithCopyWarning.
df_hr = df_hr[df_hr['entity_mode'] == 'ALL'].copy()
df_hr['entity_mode'] = '2_R'

# Combine both runs; ignore_index gives every row a unique label.
df = pd.concat([df_hr, df_hp], ignore_index=True)
# Fraction of entities that are not missing (NaN/inf where entities_total is 0).
df['entity_frac_known'] = 1 - (df['entities_missing'] / df['entities_total'])

## Raw Result Data

In [None]:
# Show the combined HR + HP results; pandas truncates the rendering according
# to the `display.max_rows` option.
df

## Results Aggregated on Task Level

In [None]:
# Pre-sort the rows: with sort=False the pivot keeps the order of appearance,
# so this sort fixes the column order (KG first, then entity mode).
dfs = df.sort_values(by=['kg_name', 'entity_mode'])
# Aggregated score (pivot_table's default aggregation) per task mode and metric.
pt = pd.pivot_table(
    dfs,
    values='score',
    index=['task_mode', 'metric'],
    columns=['kg_name', 'entity_mode'],
    sort=False,
)
pt.round(decimals=3)

## Coverage of Datasets

In [None]:
# Pre-sort the rows: with sort=False the pivot keeps the order of appearance,
# so this sort fixes the column order (KG first, then entity mode).
dfs = df.sort_values(by=['kg_name', 'entity_mode'])
# Aggregated fraction of known entities (pivot_table's default aggregation)
# per task mode and dataset.
pt = pd.pivot_table(
    dfs,
    values='entity_frac_known',
    index=['task_mode', 'dataset'],
    columns=['kg_name', 'entity_mode'],
    sort=False,
)
pt.round(decimals=2)