### Set 3: generating subsets of every dataset based on Top-10 entities for each term by their cosine similarity scores

In [None]:
import pandas as pd
import json

In [None]:
def _get_subset_entities(path_to_top10:str) -> dict:

    subset_dict = {}
    
    top_10 = pd.read_csv(path_to_top10)
    subset = top_10[top_10["cs_rm"] >= 0.5]
    
    # special condition for AAT
    if '_aat_' in path_to_top10:
        for term_group in subset.groupby("term"):
            list_of_entities_per_term = [str(int(hit_id)) for hit_id in term_group[1]["hit_id"]]
            subset_dict[term_group[0]] = list_of_entities_per_term
    else:
        for term_group in subset.groupby("term"):
            list_of_entities_per_term = list(set(term_group[1]["hit_id"]))
            subset_dict[term_group[0]] = list_of_entities_per_term
        
    return subset_dict

In [None]:
def construct_subset(dataset:str, path_to_top10:str, path_to_search_results:str) -> dict:
    '''
    Constructs a subset of relevant entities from all search results
    dataset: str, 'wikidata', 'aat', 'pwn', 'odwn'
    path_to_top10: str, a path to csv file with Top-10 entities per lemma per dataset
    path_to_search_results: str, a path to json file with search results per dataset
    Returns dict with a subset of a dataset: query term -> list of the hits
    (taken unchanged from the search results) whose ID was selected for that term
    Raises ValueError if dataset is not one of the supported names
    Raises KeyError if a selected term is absent from the search results
    '''

    # name of the field that holds the entity ID in a hit, per dataset;
    # ODWN is handled separately because its hits may carry one of two ID fields
    id_field_per_dataset = {
        'wikidata': 'QID',
        'aat': 'aat_uri',
        'pwn': 'synset_id',
    }

    # validate before touching any file, so that a typo in the dataset name
    # fails immediately instead of surfacing later as an unbound variable
    if dataset != 'odwn' and dataset not in id_field_per_dataset:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of 'wikidata', 'aat', 'pwn', 'odwn'"
        )

    # get the entities for the subset
    subset_entities = _get_subset_entities(path_to_top10)

    # load the search results; the encoding is given explicitly so that
    # the result does not depend on the platform default
    with open(path_to_search_results, 'r', encoding='utf-8') as jf:
        search_results = json.load(jf)

    if dataset == 'odwn':
        # in ODWN, instead of synset_id, there could be le_id; checking both
        def is_selected(hit, entities):
            le_id = hit.get("le_id")
            synset_id = hit.get("synset_id")
            return (le_id is not None and le_id in entities) or \
                   (synset_id is not None and synset_id in entities)
    else:
        entity_id = id_field_per_dataset[dataset]

        def is_selected(hit, entities):
            return hit[entity_id] in entities

    # keep, for every term, only the hits whose ID was selected
    return {
        query_term: [hit for hit in search_results[query_term] if is_selected(hit, entities)]
        for query_term, entities in subset_entities.items()
    }

### Generating and exporting the subset files

#### Wikidata EN

In [None]:
# NOTE: results_clean_en.json is stored gzipped on GitHub
# Build the subset of the English Wikidata search results
wd_en_subset = construct_subset(
    dataset='wikidata',
    path_to_top10="/cs/top_10_by_lemma_rm_wikidata_en.csv",
    path_to_search_results="Wikidata/results_clean_en.json",
)

In [None]:
# Export the English Wikidata subset as JSON
with open('/Wikidata/wd_en_subset.json', mode='w') as jf:
    json.dump(obj=wd_en_subset, fp=jf)

#### Wikidata NL

In [None]:
# NOTE: results_clean_nl.json is stored gzipped on GitHub
# Build the subset of the Dutch Wikidata search results
wd_nl_subset = construct_subset(
    dataset='wikidata',
    path_to_top10="/cs/top_10_by_lemma_rm_wikidata_nl.csv",
    path_to_search_results="Wikidata/results_clean_nl.json",
)

In [None]:
# Export the Dutch Wikidata subset as JSON
with open('/Wikidata/wd_nl_subset.json', mode='w') as jf:
    json.dump(obj=wd_nl_subset, fp=jf)

#### AAT EN

In [None]:
# Build the subset of the English AAT search results
aat_en_subset = construct_subset(
    dataset='aat',
    path_to_top10="/cs/top_10_by_lemma_rm_aat_en.csv",
    path_to_search_results="/AAT/aat_query_results_en.json",
)

In [None]:
# Export the English AAT subset as JSON
with open('/AAT/aat_en_subset.json', mode='w') as jf:
    json.dump(obj=aat_en_subset, fp=jf)

#### AAT NL

In [None]:
# Build the subset of the Dutch AAT search results
aat_nl_subset = construct_subset(
    dataset='aat',
    path_to_top10="/cs/top_10_by_lemma_rm_aat_nl.csv",
    path_to_search_results="/AAT/aat_query_results_nl.json",
)

In [None]:
# Export the Dutch AAT subset as JSON
with open('/AAT/aat_nl_subset.json', mode='w') as jf:
    json.dump(obj=aat_nl_subset, fp=jf)

#### PWN

In [None]:
# Build the subset of the PWN search results
pwn_subset = construct_subset(
    dataset='pwn',
    path_to_top10="/cs/top_10_by_lemma_pwn.csv",
    path_to_search_results="/PWN/pwn31_query_results.json",
)

In [None]:
# Export the PWN subset as JSON
with open('/PWN/pwn_subset.json', mode='w') as jf:
    json.dump(obj=pwn_subset, fp=jf)

#### ODWN

In [None]:
# Build the subset of the ODWN search results
odwn_subset = construct_subset(
    dataset='odwn',
    path_to_top10="/cs/top_10_by_lemma_odwn.csv",
    path_to_search_results="/ODWN/odwn_query_results.json",
)

In [None]:
# Export the ODWN subset as JSON
with open('/ODWN/odwn_subset.json', mode='w') as jf:
    json.dump(obj=odwn_subset, fp=jf)