In [None]:
import json
import re
import csv

#### Reading query terms

In [None]:
# EN
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale codec, which is not UTF-8 on every system.
# Judging by how the dict is consumed further down, it maps a lemma to its word forms.
with open('query_terms_cont_en.json','r', encoding='utf-8') as jf:
    query_terms_cont_en = json.load(jf)

In [None]:
# NL
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale codec, which can garble non-ASCII characters in the terms.
# Judging by how the dict is consumed further down, it maps a lemma to its word forms.
with open('query_terms_cont_nl.json','r', encoding='utf-8') as jf:
    query_terms_cont_nl = json.load(jf)

#### Reading the AAT EN subgraph in json

In [None]:
# Load the EN subgraph; the code below reads its triples from
# aat_subgraph_en['results']['bindings'].
# Decode explicitly as UTF-8 instead of relying on the platform locale codec.
with open('aat/aat_subgraph_en.json','r', encoding='utf-8') as jf:
    aat_subgraph_en = json.load(jf)

In [None]:
# Number of bindings (one per triple row) loaded from the EN subgraph file
len(aat_subgraph_en['results']['bindings'])

#### Reading the AAT NL subgraph in json

In [None]:
# Load the NL subgraph; the code below reads its triples from
# aat_subgraph_nl['results']['bindings'].
# Decode explicitly as UTF-8 instead of relying on the platform locale codec.
with open('aat/aat_subgraph_nl.json','r', encoding='utf-8') as jf:
    aat_subgraph_nl = json.load(jf)

In [None]:
# Number of bindings (one per triple row) loaded from the NL subgraph file
len(aat_subgraph_nl['results']['bindings'])

#### Functions to parse the json file

In [None]:
def get_entity_info_aat(entity_id:str, aat_json:dict) -> dict:
    '''
    Collect the label and scope-note literals of one entity.

    entity_id: identifier of the entity; a triple belongs to the entity when
               entity_id is contained in the triple's subject value
    aat_json:  parsed subgraph; triples are read from aat_json["results"]["bindings"],
               each with 'Subject', 'Predicate' and 'Object' entries holding a 'value'

    Returns a dict that always has the keys:
    entity (str), prefLabel (str), prefLabel_comment (str),
    altLabel (list), altLabel_comment (list), scopeNote (str).
    Strings default to '' and lists to [] when the entity has no such value
    (or when no subject matches entity_id at all).

    NOTE(review): the entity is matched by substring containment, so an id that
    happens to be part of another subject URI would match as well; kept as-is
    for backward compatibility.
    '''
    bindings = aat_json["results"]["bindings"]

    # Index the triples by subject once, so the label / note node that an
    # entity triple points to can be looked up directly instead of rescanning
    # all bindings for every matching triple. Per-subject order is preserved.
    triples_by_subject = {}
    for triple in bindings:
        triples_by_subject.setdefault(triple['Subject']['value'], []).append(triple)

    # Every key is pre-filled so callers can index the result unconditionally.
    results = {
        'entity': entity_id,
        'scopeNote': '',
        'prefLabel_comment': '',
        'prefLabel': '',
        'altLabel': [],
        'altLabel_comment': [],
    }

    for triple in bindings:
        if entity_id not in triple['Subject']['value']:
            continue

        predicate = triple['Predicate']['value']
        # triples whose subject is the node this entity triple points to
        node_triples = triples_by_subject.get(triple['Object']['value'], [])

        # prefLabel (and the comment attached to the label node)
        if 'prefLabel' in predicate:
            for node_triple in node_triples:
                node_predicate = node_triple['Predicate']['value']
                if 'literalForm' in node_predicate:
                    results['prefLabel'] = node_triple['Object']['value']
                if 'comment' in node_predicate:
                    results['prefLabel_comment'] = node_triple['Object']['value']

        # altLabel (and the comments attached to the label nodes)
        if 'altLabel' in predicate:
            for node_triple in node_triples:
                node_predicate = node_triple['Predicate']['value']
                if 'literalForm' in node_predicate:
                    results['altLabel'].append(node_triple['Object']['value'])
                if 'comment' in node_predicate:
                    results['altLabel_comment'].append(node_triple['Object']['value'])

        # scopeNote: the object of the last triple of the note node wins
        if 'scopeNote' in predicate:
            for node_triple in node_triples:
                results['scopeNote'] = node_triple['Object']['value']

    return results

In [None]:
def find_term_in_literal_aat(query_term:str, aat_json:dict) -> list:
    '''
    Searches for a term in the literal values of properties:
    prefLabel, altLabel, rdfs comment (for prefLabel and altLabel), and scopeNote

    query_term: term to look for; it is matched literally (regex metacharacters
                are escaped), case-insensitively and between word boundaries
    aat_json:   parsed subgraph; triples are read from aat_json["results"]["bindings"]

    Returns a list of dicts, one per matching literal, with the keys
    query_term, aat_uri (last path segment of the entity URI), found_in
    (name of the property in whose literal the term was found; absent when the
    label node is linked by neither prefLabel nor altLabel) and the entity's
    prefLabel, prefLabel_comment, altLabel, altLabel_comment, scopeNote.
    A matching literal that cannot be traced back to an entity is skipped.
    '''
    bindings = aat_json["results"]["bindings"]

    # Escape the term so characters such as '(' or '.' are matched literally
    # instead of being interpreted as (possibly invalid) regex syntax.
    term_pattern = re.compile(f'\\b{re.escape(query_term)}\\b', re.IGNORECASE)

    # Index the triples by object value once: the entity owning a label / note
    # node is the subject of the triple whose object is that node.
    triples_by_object = {}
    for triple in bindings:
        triples_by_object.setdefault(triple['Object']['value'], []).append(triple)

    list_of_results = []

    for triple in bindings:

        literal = triple['Object']
        if literal['type'] != 'literal' or not term_pattern.search(literal['value']):
            continue

        predicate = triple['Predicate']['value']
        # triples that point to the node carrying the matched literal
        parents = triples_by_object.get(triple['Subject']['value'], [])

        # reset for every hit so nothing leaks over from a previous match
        entity = None
        found_in = None

        # if a term found in scopeNote
        if 'rdf-syntax' in predicate:
            found_in = 'scopeNote'
            for parent in parents:
                # getting entity URI
                entity = parent['Subject']['value'].split('/')[-1]

        # if a term found in labels
        if 'literalForm' in predicate:
            for parent in parents:
                # getting entity URI
                entity = parent['Subject']['value'].split('/')[-1]
                # altLabel or prefLabel
                parent_predicate = parent['Predicate']['value']
                if 'altLabel' in parent_predicate:
                    found_in = 'altLabel'
                if 'prefLabel' in parent_predicate:
                    found_in = 'prefLabel'

        # if a term found in rdfs comment
        if 'comment' in predicate:
            for parent in parents:
                # getting entity URI
                entity = parent['Subject']['value'].split('/')[-1]
                # comment to altLabel or prefLabel
                parent_predicate = parent['Predicate']['value']
                if 'altLabel' in parent_predicate:
                    found_in = 'altLabel_comment'
                if 'prefLabel' in parent_predicate:
                    found_in = 'prefLabel_comment'

        # No entity could be resolved (unexpected predicate, or no triple points
        # to this node): skip the hit rather than attach another entity's data.
        if entity is None:
            continue

        results_per_hit = {'query_term': query_term, 'aat_uri': entity}
        if found_in is not None:
            results_per_hit['found_in'] = found_in

        entity_info = get_entity_info_aat(entity, aat_json)

        # .get() keeps the hit usable even when the entity lacks one of the values
        results_per_hit['prefLabel'] = entity_info.get('prefLabel', '')
        results_per_hit['prefLabel_comment'] = entity_info.get('prefLabel_comment', '')
        results_per_hit['altLabel'] = entity_info.get('altLabel', [])
        results_per_hit['altLabel_comment'] = entity_info.get('altLabel_comment', [])
        results_per_hit['scopeNote'] = entity_info.get('scopeNote', '')

        list_of_results.append(results_per_hit)

    return list_of_results

### Querying EN subgraph

In [None]:
# NOTE: querying all the terms is slow (approx 50 min reported for a full run)

# query term -> list of hits returned by find_term_in_literal_aat
result_dict_aat_en = {}

for lemma, wordforms in query_terms_cont_en.items():

    # the lemma itself is queried first, followed by each of its word forms
    list_of_query_terms = [lemma, *wordforms]

    for query_term in list_of_query_terms:
        # A term repeated within or across lemmas yields the same hits for the
        # same subgraph, so it is queried only once instead of being recomputed.
        if query_term in result_dict_aat_en:
            continue
        result_dict_aat_en[query_term] = find_term_in_literal_aat(query_term,aat_subgraph_en)

    # progress indicator: one line per finished lemma
    print(lemma)

In [None]:
# saving the query results EN

with open('aat_en_query_results.json', 'w') as jf:
    json.dump(result_dict_aat_en, jf)

### Querying NL subgraph

In [None]:
# query term -> list of hits returned by find_term_in_literal_aat
result_dict_aat_nl = {}

for lemma, wordforms in query_terms_cont_nl.items():

    # the lemma itself is queried first, followed by each of its word forms
    list_of_query_terms = [lemma, *wordforms]

    for query_term in list_of_query_terms:
        # A term repeated within or across lemmas yields the same hits for the
        # same subgraph, so it is queried only once instead of being recomputed.
        if query_term in result_dict_aat_nl:
            continue
        result_dict_aat_nl[query_term] = find_term_in_literal_aat(query_term,aat_subgraph_nl)

    # progress indicator: one line per finished lemma
    print(lemma)

In [None]:
# saving the query results NL

with open('aat_nl_query_results.json', 'w') as jf:
    json.dump(result_dict_aat_nl, jf)