### Getting N hits in resources' properties grouped by query terms and lemmas
This notebook generates 36 CSV files (6 datasets × 2 types of grouping × 3 dataset subsets) in the 'n_hits' directory; the number of files per subset is given in parentheses:
- related matches (Set 3) (12) 
- subset of relevant entities (Set 2) (12)
- all search results (Set 1) (12)

In [None]:
import json
import pandas as pd
from collections import Counter

In [None]:
def get_lemma_by_term(query_term:str, lang:str, path_to_query_terms:str='/query_terms.json') -> str:
    '''
    Getting a lemma of a query term
    query_term: str, a query term to look up
    lang: str, 'en' or 'nl'
    path_to_query_terms: str, path to the json file with query terms and their lemmas,
                         default '/query_terms.json' (placeholder, change it to the path in GitHub)
    Returns str, 'not found' if lemma was not found
    '''

    # importing query terms with lemmas;
    # the file is indexed as language -> lemma -> query terms of this lemma
    # explicit encoding, so that non-ASCII terms are read the same way on every platform
    with open(path_to_query_terms,'r',encoding='utf-8') as jf:
        query_terms = json.load(jf)

    return_lemma = 'not found'

    for lemma, qt in query_terms[lang].items():
        if query_term in qt:
            # no early exit on purpose: if a term is listed under several lemmas,
            # the last matching lemma is returned
            return_lemma = lemma

    return return_lemma

In [None]:
# Supported resources and their count columns:
# resource -> ((column name, value of "found_in" in a hit), ...), name of the total column
_PROPERTIES_BY_RESOURCE = {
    "wikidata": ((("wd_pref","prefLabel"),
                  ("wd_aliases","aliases"),
                  ("wd_descr","description")),
                 "wd_total"),
    "aat": ((("aat_pref","prefLabel"),
             ("aat_alt","altLabel"),
             ("aat_scopeNote","scopeNote"),
             ("aat_pref_comment","prefLabel_comment"),
             ("aat_alt_comment","altLabel_comment")),
            "aat_total"),
    "pwn": ((("pwn_le","lemmata"),
             ("pwn_def","definition"),
             ("pwn_ex","examples")),
            "pwn_total"),
    "odwn": ((("odwn_le","le"),
              ("odwn_sense_def","sense_definition"),
              ("odwn_synset_def","synset_definitions"),
              ("odwn_sense_ex","sense_examples")),
             "odwn_total"),
}


def get_n_hits(path_to_dataset:str,resource:str,lang:str,groupby_lemma=False):
    '''
    Getting N of hits of query terms by properties in resources
    with optional grouping by lemmas
    path_to_dataset: str, path to a json file with results (related matches, subset, all results);
                     the file maps every query term to a list of hits, every hit has a "found_in" key
    resource: str, "wikidata", "aat", "pwn", "odwn"
    lang: str, language of results, "en" or "nl"
    groupby_lemma: bool, if group hits by lemma, default False (count by query term)
    Uses the function 'get_lemma_by_term' (only if groupby_lemma is True);
    Returns a pandas dataframe: one row per query term (or per lemma) with N of hits
    per property and the total N of hits (the total counts all hits of a term,
    also the ones found in properties that have no column of their own)
    Raises ValueError if the resource is not supported
    '''

    # check resource before reading anything
    if resource not in _PROPERTIES_BY_RESOURCE:
        supported = ", ".join(sorted(_PROPERTIES_BY_RESOURCE))
        raise ValueError(f"Unknown resource '{resource}', expected one of: {supported}")

    property_columns, total_column = _PROPERTIES_BY_RESOURCE[resource]
    count_columns = [column for column, _ in property_columns] + [total_column]

    # import a dataset
    with open(path_to_dataset,'r',encoding='utf-8') as jf:
        dataset = json.load(jf)

    # count hits by property for every query term
    rows = []
    for query_term, hits in dataset.items():
        hits_by_property = Counter(hit["found_in"] for hit in hits)

        row = [query_term,lang]
        # a Counter gives 0 for the properties without hits
        row.extend(hits_by_property[found_in] for _, found_in in property_columns)
        row.append(sum(hits_by_property.values()))
        rows.append(row)

    # build the dataframe in one go instead of adding rows one by one
    df_per_resource = pd.DataFrame(rows,columns=["query_term","lang"] + count_columns)

    if not groupby_lemma:
        return df_per_resource

    # add lemma column to the df
    lemmas = [get_lemma_by_term(query_term,lang) for query_term in df_per_resource["query_term"]]
    df_per_resource.insert(0,"lemma",lemmas)

    # grouping by lemma (groups come sorted by lemma) and summing N of hits by properties
    lemma_rows = []
    for lemma, lemma_group in df_per_resource.groupby("lemma"):
        lemma_rows.append([lemma,lang] + [lemma_group[column].sum() for column in count_columns])

    return pd.DataFrame(lemma_rows,columns=["lemma","lang"] + count_columns)

#### Generating csv files

Related matches (Set 3)

In [None]:
# Related matches, Wikidata (English): N hits per query term
rm_wd_en_terms = get_n_hits(
    path_to_dataset="/rm/rm_wd_en.json",
    resource="wikidata",
    lang="en",
)
rm_wd_en_terms.to_csv("/n_hits/rm_wd_en_terms.csv")

# same dataset, N hits grouped by lemma
rm_wd_en_lemmas = get_n_hits(
    path_to_dataset="/rm/rm_wd_en.json",
    resource="wikidata",
    lang="en",
    groupby_lemma=True,
)
rm_wd_en_lemmas.to_csv("/n_hits/rm_wd_en_lemmas.csv")

In [None]:
# Related matches, Wikidata (Dutch): N hits per query term
rm_wd_nl_terms = get_n_hits(
    path_to_dataset="/rm/rm_wd_nl.json",
    resource="wikidata",
    lang="nl",
)
rm_wd_nl_terms.to_csv("/n_hits/rm_wd_nl_terms.csv")

# same dataset, N hits grouped by lemma
rm_wd_nl_lemmas = get_n_hits(
    path_to_dataset="/rm/rm_wd_nl.json",
    resource="wikidata",
    lang="nl",
    groupby_lemma=True,
)
rm_wd_nl_lemmas.to_csv("/n_hits/rm_wd_nl_lemmas.csv")

In [None]:
# Related matches, AAT (English): N hits per query term
rm_aat_en_terms = get_n_hits(
    path_to_dataset="/rm/rm_aat_en.json",
    resource="aat",
    lang="en",
)
rm_aat_en_terms.to_csv("/n_hits/rm_aat_en_terms.csv")

# same dataset, N hits grouped by lemma
rm_aat_en_lemmas = get_n_hits(
    path_to_dataset="/rm/rm_aat_en.json",
    resource="aat",
    lang="en",
    groupby_lemma=True,
)
rm_aat_en_lemmas.to_csv("/n_hits/rm_aat_en_lemmas.csv")

In [None]:
# Related matches, AAT (Dutch): N hits per query term
rm_aat_nl_terms = get_n_hits(
    path_to_dataset="/rm/rm_aat_nl.json",
    resource="aat",
    lang="nl",
)
rm_aat_nl_terms.to_csv("/n_hits/rm_aat_nl_terms.csv")

# same dataset, N hits grouped by lemma
rm_aat_nl_lemmas = get_n_hits(
    path_to_dataset="/rm/rm_aat_nl.json",
    resource="aat",
    lang="nl",
    groupby_lemma=True,
)
rm_aat_nl_lemmas.to_csv("/n_hits/rm_aat_nl_lemmas.csv")

In [None]:
# Related matches, PWN (English): N hits per query term
rm_pwn_terms = get_n_hits(
    path_to_dataset="/rm/rm_pwn.json",
    resource="pwn",
    lang="en",
)
rm_pwn_terms.to_csv("/n_hits/rm_pwn_terms.csv")

# same dataset, N hits grouped by lemma
rm_pwn_lemmas = get_n_hits(
    path_to_dataset="/rm/rm_pwn.json",
    resource="pwn",
    lang="en",
    groupby_lemma=True,
)
rm_pwn_lemmas.to_csv("/n_hits/rm_pwn_lemmas.csv")

In [None]:
# Related matches, ODWN (Dutch): N hits per query term
rm_odwn_terms = get_n_hits(
    path_to_dataset="/rm/rm_odwn.json",
    resource="odwn",
    lang="nl",
)
rm_odwn_terms.to_csv("/n_hits/rm_odwn_terms.csv")

# same dataset, N hits grouped by lemma
rm_odwn_lemmas = get_n_hits(
    path_to_dataset="/rm/rm_odwn.json",
    resource="odwn",
    lang="nl",
    groupby_lemma=True,
)
rm_odwn_lemmas.to_csv("/n_hits/rm_odwn_lemmas.csv")

Subset (Set 2)

In [None]:
# Subset, Wikidata (English): N hits per query term
subset_wd_en_terms = get_n_hits(
    path_to_dataset="/Wikidata/wd_en_subset.json",
    resource="wikidata",
    lang="en",
)
subset_wd_en_terms.to_csv("/n_hits/subset_wd_en_terms.csv")

# same dataset, N hits grouped by lemma
subset_wd_en_lemmas = get_n_hits(
    path_to_dataset="/Wikidata/wd_en_subset.json",
    resource="wikidata",
    lang="en",
    groupby_lemma=True,
)
subset_wd_en_lemmas.to_csv("/n_hits/subset_wd_en_lemmas.csv")

In [None]:
# Subset, Wikidata (Dutch): N hits per query term
subset_wd_nl_terms = get_n_hits(
    path_to_dataset="/Wikidata/wd_nl_subset.json",
    resource="wikidata",
    lang="nl",
)
subset_wd_nl_terms.to_csv("/n_hits/subset_wd_nl_terms.csv")

# same dataset, N hits grouped by lemma
subset_wd_nl_lemmas = get_n_hits(
    path_to_dataset="/Wikidata/wd_nl_subset.json",
    resource="wikidata",
    lang="nl",
    groupby_lemma=True,
)
subset_wd_nl_lemmas.to_csv("/n_hits/subset_wd_nl_lemmas.csv")

In [None]:
# Subset, AAT (English): N hits per query term
subset_aat_en_terms = get_n_hits(
    path_to_dataset="/AAT/aat_en_subset.json",
    resource="aat",
    lang="en",
)
subset_aat_en_terms.to_csv("/n_hits/subset_aat_en_terms.csv")

# same dataset, N hits grouped by lemma
subset_aat_en_lemmas = get_n_hits(
    path_to_dataset="/AAT/aat_en_subset.json",
    resource="aat",
    lang="en",
    groupby_lemma=True,
)
subset_aat_en_lemmas.to_csv("/n_hits/subset_aat_en_lemmas.csv")

In [None]:
# Subset, AAT (Dutch): N hits per query term
subset_aat_nl_terms = get_n_hits(
    path_to_dataset="/AAT/aat_nl_subset.json",
    resource="aat",
    lang="nl",
)
subset_aat_nl_terms.to_csv("/n_hits/subset_aat_nl_terms.csv")

# same dataset, N hits grouped by lemma
subset_aat_nl_lemmas = get_n_hits(
    path_to_dataset="/AAT/aat_nl_subset.json",
    resource="aat",
    lang="nl",
    groupby_lemma=True,
)
subset_aat_nl_lemmas.to_csv("/n_hits/subset_aat_nl_lemmas.csv")

In [None]:
# Subset, PWN (English): N hits per query term
subset_pwn_terms = get_n_hits(
    path_to_dataset="/PWN/pwn_subset.json",
    resource="pwn",
    lang="en",
)
subset_pwn_terms.to_csv("/n_hits/subset_pwn_terms.csv")

# same dataset, N hits grouped by lemma
subset_pwn_lemmas = get_n_hits(
    path_to_dataset="/PWN/pwn_subset.json",
    resource="pwn",
    lang="en",
    groupby_lemma=True,
)
subset_pwn_lemmas.to_csv("/n_hits/subset_pwn_lemmas.csv")

In [None]:
# Subset, ODWN (Dutch): N hits per query term
subset_odwn_terms = get_n_hits(
    path_to_dataset="/ODWN/odwn_subset.json",
    resource="odwn",
    lang="nl",
)
subset_odwn_terms.to_csv("/n_hits/subset_odwn_terms.csv")

# same dataset, N hits grouped by lemma
subset_odwn_lemmas = get_n_hits(
    path_to_dataset="/ODWN/odwn_subset.json",
    resource="odwn",
    lang="nl",
    groupby_lemma=True,
)
subset_odwn_lemmas.to_csv("/n_hits/subset_odwn_lemmas.csv")

All search results (Set 1)

In [None]:
# All search results, Wikidata (English): N hits per query term
all_wd_en_terms = get_n_hits(
    path_to_dataset="/Wikidata/results_clean_en.json",
    resource="wikidata",
    lang="en",
)
all_wd_en_terms.to_csv("/n_hits/all_wd_en_terms.csv")

# same dataset, N hits grouped by lemma
all_wd_en_lemmas = get_n_hits(
    path_to_dataset="/Wikidata/results_clean_en.json",
    resource="wikidata",
    lang="en",
    groupby_lemma=True,
)
all_wd_en_lemmas.to_csv("/n_hits/all_wd_en_lemmas.csv")

In [None]:
# All search results, Wikidata (Dutch): N hits per query term
all_wd_nl_terms = get_n_hits(
    path_to_dataset="/Wikidata/results_clean_nl.json",
    resource="wikidata",
    lang="nl",
)
all_wd_nl_terms.to_csv("/n_hits/all_wd_nl_terms.csv")

# same dataset, N hits grouped by lemma
all_wd_nl_lemmas = get_n_hits(
    path_to_dataset="/Wikidata/results_clean_nl.json",
    resource="wikidata",
    lang="nl",
    groupby_lemma=True,
)
all_wd_nl_lemmas.to_csv("/n_hits/all_wd_nl_lemmas.csv")

In [None]:
# All search results, AAT (English): N hits per query term
all_aat_en_terms = get_n_hits(
    path_to_dataset="/AAT/aat_query_results_en.json",
    resource="aat",
    lang="en",
)
all_aat_en_terms.to_csv("/n_hits/all_aat_en_terms.csv")

# same dataset, N hits grouped by lemma
all_aat_en_lemmas = get_n_hits(
    path_to_dataset="/AAT/aat_query_results_en.json",
    resource="aat",
    lang="en",
    groupby_lemma=True,
)
all_aat_en_lemmas.to_csv("/n_hits/all_aat_en_lemmas.csv")

In [None]:
# All search results, AAT (Dutch): N hits per query term
all_aat_nl_terms = get_n_hits(
    path_to_dataset="/AAT/aat_query_results_nl.json",
    resource="aat",
    lang="nl",
)
all_aat_nl_terms.to_csv("/n_hits/all_aat_nl_terms.csv")

# same dataset, N hits grouped by lemma
all_aat_nl_lemmas = get_n_hits(
    path_to_dataset="/AAT/aat_query_results_nl.json",
    resource="aat",
    lang="nl",
    groupby_lemma=True,
)
all_aat_nl_lemmas.to_csv("/n_hits/all_aat_nl_lemmas.csv")

In [None]:
# All search results, PWN (English): N hits per query term
all_pwn_terms = get_n_hits(
    path_to_dataset="/PWN/pwn31_query_results.json",
    resource="pwn",
    lang="en",
)
all_pwn_terms.to_csv("/n_hits/all_pwn_terms.csv")

# same dataset, N hits grouped by lemma
all_pwn_lemmas = get_n_hits(
    path_to_dataset="/PWN/pwn31_query_results.json",
    resource="pwn",
    lang="en",
    groupby_lemma=True,
)
all_pwn_lemmas.to_csv("/n_hits/all_pwn_lemmas.csv")

In [None]:
# All search results, ODWN (Dutch): N hits per query term
all_odwn_terms = get_n_hits(
    path_to_dataset="/ODWN/odwn_query_results.json",
    resource="odwn",
    lang="nl",
)
all_odwn_terms.to_csv("/n_hits/all_odwn_terms.csv")

# same dataset, N hits grouped by lemma
all_odwn_lemmas = get_n_hits(
    path_to_dataset="/ODWN/odwn_query_results.json",
    resource="odwn",
    lang="nl",
    groupby_lemma=True,
)
all_odwn_lemmas.to_csv("/n_hits/all_odwn_lemmas.csv")