## This notebook generates annotation sheets for Top-10 in Wikidata (EN), Wikidata (NL), AAT (EN), AAT (NL), PWN, and ODWN

* annotation_sheet_wikidata_en.csv
* annotation_sheet_wikidata_nl.csv
* annotation_sheet_aat_en.csv
* annotation_sheet_aat_nl.csv
* annotation_sheet_pwn_en.csv
* annotation_sheet_odwn_nl.csv

In [None]:
import json
import pandas as pd

In [None]:
def _wikidata_texts(hit):
    """Return the five text fields copied from a Wikidata search hit."""
    return [hit["prefLabel"], hit["aliases"], hit["description"],
            hit["instance_of"], hit["subclass_of"]]


def _aat_texts(hit):
    """Return the five text fields copied from an AAT search hit."""
    return [hit["prefLabel"], hit["altLabel"], hit["scopeNote"],
            hit["prefLabel_comment"], hit["altLabel_comment"]]


def _pwn_texts(hit):
    """Return the three text fields copied from a PWN search hit."""
    return [hit["lemmata"], hit["definition"], hit["examples"]]


def _odwn_texts(hit):
    """Return the five text fields of an ODWN hit (None for absent keys)."""
    return [hit.get("le_written_form"), hit.get("sense_definition"),
            hit.get("sense_examples"), hit.get("synonyms"),
            hit.get("synset_definitions")]


def _odwn_hit_id(hit):
    """Return the synset id of an ODWN hit, or its 'le_id' if that is empty."""
    synset_id = hit["synset_id"]
    return synset_id if synset_id != "" else hit["le_id"]


def _same_id(hit_id):
    """Return the Top-10 hit id unchanged."""
    return hit_id


def _aat_top_10_id(hit_id):
    """Return an AAT hit id as a string without a trailing '.0'."""
    # Ids can arrive as floats (e.g. 300037772.0); only the trailing ".0"
    # is removed, so a ".0" elsewhere in the id is left intact.
    text = str(hit_id)
    return text[:-2] if text.endswith(".0") else text


# resource -> (default search-results path template,
#              function giving the id of a search hit,
#              function giving the text fields of a search hit,
#              number of text columns,
#              function normalising a Top-10 'hit_id' before comparison)
_RESOURCE_SPECS = {
    "wikidata": ("/Users/anesterov/wd/jan31/results_clean_{lang}.json",
                 lambda hit: hit["QID"], _wikidata_texts, 5, _same_id),
    "aat": ("/Users/anesterov/reps/LODlit/AAT/aat_query_results_{lang}.json",
            lambda hit: hit["aat_uri"], _aat_texts, 5, _aat_top_10_id),
    "pwn": ("/Users/anesterov/reps/LODlit/PWN/pwn31_query_results.json",
            lambda hit: hit["synset_id"], _pwn_texts, 3, _same_id),
    "odwn": ("/Users/anesterov/reps/LODlit/ODWN/odwn_query_results.json",
             _odwn_hit_id, _odwn_texts, 5, _same_id),
}


def generate_annotation_sheet(resource: str, lang: str, filename: str, score_type=None, *,
                              search_results_path=None, top_10_path=None):
    """Build an annotation sheet for the entities listed in a Top-10 CSV.

    Every (term, hit_id) pair of the Top-10 file is looked up in the JSON
    search results of the chosen resource; each matching hit contributes one
    row with the term, the entity id and the hit's text fields.

    resource: str, one of 'wikidata', 'aat', 'pwn', 'odwn'
    lang: str, inserted into the default search-results path for 'wikidata'
        and 'aat'; not used for 'pwn' and 'odwn'
    filename: str, corresponding csv file (top-10 or additional entities) in
        the 'cs' directory, without '.csv' extension, for example
        'top_10_rm_wikidata_en'
    score_type: str, 'cs_rm', 'cs_wm', 'cs_rm_wm' or None, default None;
        when given, a term is kept only if the value of that column in the
        term's first row is greater than 0
    search_results_path: optional path of the JSON search results; defaults
        to the location hard-coded for the resource
    top_10_path: optional path of the Top-10 csv; defaults to
        '/Users/anesterov/reps/LODlit/cs/{filename}.csv'

    Returns pandas df with columns 'term', 'entity_id', 'text_1'..'text_N'
    (N is 3 for 'pwn' and 5 otherwise), de-duplicated on (term, entity_id).
    Raises ValueError for an unsupported resource and KeyError when a term
    of the Top-10 file is missing from the search results.
    """
    spec = _RESOURCE_SPECS.get(resource)
    if spec is None:
        # Fail early and clearly instead of crashing on an unbound variable.
        raise ValueError(
            f"Unknown resource {resource!r}; expected one of {sorted(_RESOURCE_SPECS)}"
        )
    results_template, id_of_hit, texts_of_hit, n_texts, normalise_id = spec
    columns = ["term", "entity_id"] + [f"text_{i}" for i in range(1, n_texts + 1)]

    if search_results_path is None:
        search_results_path = results_template.format(lang=lang)
    if top_10_path is None:
        top_10_path = f"/Users/anesterov/reps/LODlit/cs/{filename}.csv"

    # import search results
    with open(search_results_path, "r", encoding="utf-8") as jf:
        search_results = json.load(jf)
    # import top-10
    top_10 = pd.read_csv(top_10_path)

    rows = []
    for term, term_df in top_10.groupby("term"):
        # If a score column is requested, skip terms whose first score is not
        # positive. Presumably rows are sorted so the first one is the highest
        # score - verify against the code producing the csv. Written as
        # `not (x > 0)` so that a NaN score is skipped as well.
        if score_type is not None and not (term_df[score_type].iloc[0] > 0):
            continue
        hits = search_results[term]
        for raw_id in term_df["hit_id"]:
            hit_id = normalise_id(raw_id)
            for hit in hits:
                if id_of_hit(hit) == hit_id:
                    rows.append([term, hit_id, *texts_of_hit(hit)])

    # Build the frame once instead of growing it row by row.
    annotation_sheet_df = pd.DataFrame(rows, columns=columns)
    annotation_sheet_df.drop_duplicates(subset=["term", "entity_id"], inplace=True)

    return annotation_sheet_df

In [None]:
# PWN (EN): build the sheet from the RM-based Top-10, filtering on 'cs_rm', and save it as csv.
generate_annotation_sheet(
    resource="pwn",
    lang="en",
    filename="top_10_rm_pwn",
    score_type="cs_rm",
).to_csv("annotation_sheet_pwn_en.csv")

### Generating additional annotation sheets
* difference between Top-10 based on RM and Top-10 based on WM

In [None]:
# Wikidata (EN): sheet for the additional WM-based entities, saved as csv.
generate_annotation_sheet(
    resource="wikidata",
    lang="en",
    filename="wm_additional_wd_en",
).to_csv("additional_annotation_wd_en.csv")

In [None]:
# Wikidata (NL): sheet for the additional WM-based entities, saved as csv.
generate_annotation_sheet(
    resource="wikidata",
    lang="nl",
    filename="wm_additional_wd_nl",
).to_csv("additional_annotation_wd_nl.csv")

In [None]:
# AAT (EN): sheet for the additional WM-based entities, saved as csv.
generate_annotation_sheet(
    resource="aat",
    lang="en",
    filename="wm_additional_aat_en",
).to_csv("additional_annotation_aat_en.csv")

In [None]:
# AAT (NL): sheet for the additional WM-based entities, saved as csv.
generate_annotation_sheet(
    resource="aat",
    lang="nl",
    filename="wm_additional_aat_nl",
).to_csv("additional_annotation_aat_nl.csv")

In [None]:
# PWN (EN): sheet for the additional WM-based entities, saved as csv.
generate_annotation_sheet(
    resource="pwn",
    lang="en",
    filename="wm_additional_pwn",
).to_csv("additional_annotation_pwn.csv")

In [None]:
# ODWN (NL): sheet for the additional WM-based entities, saved as csv.
generate_annotation_sheet(
    resource="odwn",
    lang="nl",
    filename="wm_additional_odwn",
).to_csv("additional_annotation_odwn.csv")