In [2]:
import pandas as pd
import json

In [None]:
def _get_subset_entities(path_to_top10:str) -> dict:
    '''
    '''
    subset_dict = {}
    
    top_10 = pd.read_csv(path_to_top10)
    subset = top_10[top_10["cs_rm"] >= 0.5]
    
    # special condition for AAT
    if '_aat_' in path_to_top10:
        for term_group in subset.groupby("term"):
            list_of_entities_per_term = [str(int(hit_id)) for hit_id in term_group[1]["hit_id"]]
            subset_dict[term_group[0]] = list_of_entities_per_term
    else:
        for term_group in subset.groupby("term"):
            list_of_entities_per_term = list(set(term_group[1]["hit_id"]))
            subset_dict[term_group[0]] = list_of_entities_per_term
        
    return subset_dict

In [None]:
def construct_subset(resource:str, path_to_top10:str, path_to_search_results:str) -> dict:
    '''
    Constructs a subset of relevant entities from all search results
    resource: str, 'wikidata', 'aat', 'pwn', 'odwn'
    path_to_top10: str, a path to csv file with Top-10 entities per lemma per resource
    path_to_search_results: str, a path to json file with search results per resource
                            (a dict: query terms as keys, lists of hits (dicts) as values)
    Returns dict with a subset for a resource: query terms as keys, lists of the kept hits as values
    Raises ValueError if the resource is not one of the supported ones
    Raises KeyError if a term from the Top-10 file is missing in the search results
    '''
    # the key that holds the entity ID in a hit, per resource (ODWN is handled separately)
    entity_id_keys = {
        'wikidata': 'QID',
        'aat': 'aat_uri',
        'pwn': 'synset_id',
    }

    # check the resource before touching any file, so a typo fails fast and clearly
    if resource != 'odwn' and resource not in entity_id_keys:
        supported = sorted(list(entity_id_keys) + ['odwn'])
        raise ValueError(f"Unknown resource {resource!r}; expected one of {supported}")

    # get the entities for the subset
    subset_entities = _get_subset_entities(path_to_top10)

    # load the search results
    with open(path_to_search_results, 'r', encoding='utf-8') as jf:
        search_results = json.load(jf)

    if resource == 'odwn':
        # in ODWN, instead of synset_id, there could be le_id; checking both
        def is_relevant(hit, entities):
            return any(
                hit.get(key) is not None and hit.get(key) in entities
                for key in ('le_id', 'synset_id')
            )
    else:
        entity_id = entity_id_keys[resource]

        def is_relevant(hit, entities):
            return hit[entity_id] in entities

    # keep, per query term, only the hits whose ID is among the selected entities
    subset = {}
    for query_term, entities in subset_entities.items():
        subset[query_term] = [hit for hit in search_results[query_term] if is_relevant(hit, entities)]

    return subset

### Generating and exporting the subset files

#### Wikidata EN

In [None]:
# Select the Wikidata EN hits that belong to the subset
wd_en_subset = construct_subset(
    resource='wikidata',
    path_to_top10="/Users/anesterov/reps/LODlit/cs/top_10_by_lemma_rm_wikidata_en.csv",
    path_to_search_results="/Users/anesterov/LODlit_local/wd/jan31/results_clean_en.json",
)

In [None]:
# Export the Wikidata EN subset as JSON
with open('/Users/anesterov/reps/LODlit/Wikidata/wd_en_subset.json', mode='w') as jf:
    json.dump(wd_en_subset, fp=jf)

#### Wikidata NL

In [None]:
# Select the Wikidata NL hits that belong to the subset
wd_nl_subset = construct_subset(
    resource='wikidata',
    path_to_top10="/Users/anesterov/reps/LODlit/cs/top_10_by_lemma_rm_wikidata_nl.csv",
    path_to_search_results="/Users/anesterov/LODlit_local/wd/jan31/results_clean_nl.json",
)

In [None]:
# Export the Wikidata NL subset as JSON
with open('/Users/anesterov/reps/LODlit/Wikidata/wd_nl_subset.json', mode='w') as jf:
    json.dump(wd_nl_subset, fp=jf)

#### AAT EN

In [None]:
# Select the AAT EN hits that belong to the subset
aat_en_subset = construct_subset(
    resource='aat',
    path_to_top10="/Users/anesterov/reps/LODlit/cs/top_10_by_lemma_rm_aat_en.csv",
    path_to_search_results="/Users/anesterov/reps/LODlit/AAT/aat_query_results_en.json",
)

In [None]:
# Export the AAT EN subset as JSON
with open('/Users/anesterov/reps/LODlit/AAT/aat_en_subset.json', mode='w') as jf:
    json.dump(aat_en_subset, fp=jf)

#### AAT NL

In [None]:
# Select the AAT NL hits that belong to the subset
aat_nl_subset = construct_subset(
    resource='aat',
    path_to_top10="/Users/anesterov/reps/LODlit/cs/top_10_by_lemma_rm_aat_nl.csv",
    path_to_search_results="/Users/anesterov/reps/LODlit/AAT/aat_query_results_nl.json",
)

In [None]:
# Export the AAT NL subset as JSON
with open('/Users/anesterov/reps/LODlit/AAT/aat_nl_subset.json', mode='w') as jf:
    json.dump(aat_nl_subset, fp=jf)

#### PWN

In [None]:
# Select the PWN hits that belong to the subset
pwn_subset = construct_subset(
    resource='pwn',
    path_to_top10="/Users/anesterov/reps/LODlit/cs/top_10_by_lemma_pwn.csv",
    path_to_search_results="/Users/anesterov/reps/LODlit/PWN/pwn31_query_results.json",
)

In [None]:
# Export the PWN subset as JSON
with open('/Users/anesterov/reps/LODlit/PWN/pwn_subset.json', mode='w') as jf:
    json.dump(pwn_subset, fp=jf)

#### ODWN

In [None]:
# Select the ODWN hits that belong to the subset
odwn_subset = construct_subset(
    resource='odwn',
    path_to_top10="/Users/anesterov/reps/LODlit/cs/top_10_by_lemma_odwn.csv",
    path_to_search_results="/Users/anesterov/reps/LODlit/ODWN/odwn_query_results.json",
)

In [None]:
# Export the ODWN subset as JSON
with open('/Users/anesterov/reps/LODlit/ODWN/odwn_subset.json', mode='w') as jf:
    json.dump(odwn_subset, fp=jf)

In [1]:
#### Stats

In [3]:
# Load the exported Wikidata EN subset from disk
with open("/Users/anesterov/reps/LODlit/Wikidata/wd_en_subset.json", mode='r') as jf:
    wd_en_subset = json.load(fp=jf)

In [15]:
# Load the exported Wikidata NL subset from disk
with open("/Users/anesterov/reps/LODlit/Wikidata/wd_nl_subset.json", mode='r') as jf:
    wd_nl_subset = json.load(fp=jf)

In [19]:
# Load the exported AAT EN subset from disk
with open("/Users/anesterov/reps/LODlit/AAT/aat_en_subset.json", mode='r') as jf:
    aat_en_subset = json.load(fp=jf)

In [25]:
# Load the exported AAT NL subset from disk
with open("/Users/anesterov/reps/LODlit/AAT/aat_nl_subset.json", mode='r') as jf:
    aat_nl_subset = json.load(fp=jf)

In [13]:
# QIDs of all Wikidata EN hits that were found in aliases (one entry per hit, duplicates kept)
entities_en = [
    hit['QID']
    for hits in wd_en_subset.values()
    for hit in hits
    if hit['found_in'] == 'aliases'
]

In [14]:
# Number of distinct values collected in entities_en
len(set(entities_en))

187

In [16]:
# QIDs of all Wikidata NL hits that were found in aliases (one entry per hit, duplicates kept)
entities_nl = [
    hit['QID']
    for hits in wd_nl_subset.values()
    for hit in hits
    if hit['found_in'] == 'aliases'
]

In [18]:
# Number of distinct values collected in entities_nl
len(set(entities_nl))

106

In [28]:
# URIs of all AAT EN hits that were found in altLabel (one entry per hit, duplicates kept)
aat_entities_en = [
    hit['aat_uri']
    for hits in aat_en_subset.values()
    for hit in hits
    if hit['found_in'] == 'altLabel'
]

In [29]:
# Number of distinct values collected in aat_entities_en
len(set(aat_entities_en))

90

In [30]:
# URIs of all AAT NL hits that were found in altLabel (one entry per hit, duplicates kept)
aat_entities_nl = [
    hit['aat_uri']
    for hits in aat_nl_subset.values()
    for hit in hits
    if hit['found_in'] == 'altLabel'
]

In [31]:
# Number of distinct values collected in aat_entities_nl
len(set(aat_entities_nl))

14