### Getting terms to be queried in the resources
This notebook generates the following files:
- (1) query_terms.json -- contains all query terms (lemmas with their wordforms) grouped by language
- (2) en_lemmas_with_label_uris.json -- lemmas of EN query terms connected to the corresponding label URIs in the Words Matter knowledge graph
- (3) nl_lemmas_with_label_uris.json -- lemmas of NL query terms connected to the corresponding label URIs in the Words Matter knowledge graph

In [None]:
import json
import requests

In [None]:
# Locations of the query-term files in the Cultural AI GitHub repository;
# the first URL is the EN file, the second the NL file
url_en, url_nl = (
    "https://github.com/cultural-ai/wordsmatter/raw/main/query_terms_cont_en.json",
    "https://github.com/cultural-ai/wordsmatter/raw/main/query_terms_cont_nl.json",
)

In [None]:
# Download the EN query terms and parse the response body as JSON.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error (e.g. 404) into an explicit
# exception instead of a confusing JSON decoding failure on an error page.
getting_en = requests.get(url_en, timeout=30)
getting_en.raise_for_status()
wordforms_en = getting_en.json()

In [None]:
# Download the NL query terms and parse the response body as JSON.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error (e.g. 404) into an explicit
# exception instead of a confusing JSON decoding failure on an error page.
getting_nl = requests.get(url_nl, timeout=30)
getting_nl.raise_for_status()
wordforms_nl = getting_nl.json()

In [None]:
# Combine the EN and NL query terms into a single dict keyed by language code.
# Intended shape: {'en': {'lemma': [all wordforms including lemma]}, 'nl': {...}}
query_terms = {
    'en': wordforms_en,
    'nl': wordforms_nl,
}

In [None]:
# Make every lemma one of its own query wordforms (modifies the lists in place).
# The membership check keeps this cell idempotent: re-running it, or a lemma
# that is already listed among its wordforms, no longer produces duplicates
# that would inflate the stats and the saved file.
for terms_by_lemma in query_terms.values():
    for lemma, wordforms in terms_by_lemma.items():
        if lemma not in wordforms:
            wordforms.append(lemma)

In [None]:
# Write the merged query terms (both languages) to query_terms.json
# in the current working directory
serialized_terms = json.dumps(query_terms)
with open('query_terms.json', 'w') as out_file:
    out_file.write(serialized_terms)

In [None]:
# stats: for each language, the number of lemmas and the total number of
# query wordforms summed over all lemmas
en_lemmas = len(query_terms['en'])
en_total = sum(len(forms) for forms in query_terms['en'].values())

nl_lemmas = len(query_terms['nl'])
nl_total = sum(len(forms) for forms in query_terms['nl'].values())

print("EN:", en_lemmas, en_total)
print("NL:", nl_lemmas, nl_total)

### Connecting query terms (lemmas) to the URIs of the corresponding contentious labels

In [None]:
# importing query terms
# Read the same relative path the notebook writes query_terms.json to,
# rather than a user-specific absolute path that only exists on one machine.
with open("query_terms.json", "r") as jf:
    query_terms = json.load(jf)

In [None]:
# importing contentious issues with contentious labels
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into an explicit exception
# instead of a confusing JSON decoding failure on an error page.
CI_url = "https://raw.githubusercontent.com/cultural-ai/wordsmatter/main/CI_with_cont_terms.json"
requesting_CI_url = requests.get(CI_url, timeout=30)
requesting_CI_url.raise_for_status()
CI = requesting_CI_url.json()

In [None]:
# EN: map each English contentious label URI to its lower-cased literal form
en_uri_labels = {}
for issue in CI.values():
    en_uri_labels.update(
        (uri, label['literal_form'].lower())
        for uri, label in issue["contentious_labels"].items()
        if label['lang'] == 'en'
    )

In [None]:
# number of EN label URIs in en_uri_labels
len(en_uri_labels)

In [None]:
# NL: map each Dutch contentious label URI to its lower-cased literal form,
# with soft hyphens ('\xad') removed
nl_uri_labels = {}
for issue in CI.values():
    nl_uri_labels.update(
        (uri, label['literal_form'].lower().replace('\xad',''))
        for uri, label in issue["contentious_labels"].items()
        if label['lang'] == 'nl'
    )

In [None]:
# NL labels l_238 ('Jappenkamp') and l_236 ('Jappenkampen') share the lemma 'jappenkamp',
# so there are 83 NL label URIs but only 82 NL lemmas
# (the length below counts label URIs, not lemmas)
len(nl_uri_labels)

In [None]:
# EN: connect each lemma to the URIs of the contentious labels it covers.
# A label is matched against all wordforms of a lemma because the literal form
# of a contentious term is not always identical to its lemma.
en_lemmas_of_label_uris = {}
for lemma, wordforms in query_terms['en'].items():
    # one lemma can correspond to one or more label URIs
    matching_uris = [
        uri for uri, label in en_uri_labels.items() if label in wordforms
    ]
    if matching_uris:  # lemmas without any matching label are left out
        en_lemmas_of_label_uris[lemma] = matching_uris

In [None]:
# EN: write the lemma -> label URIs mapping to en_lemmas_with_label_uris.json
en_serialized = json.dumps(en_lemmas_of_label_uris)
with open('en_lemmas_with_label_uris.json', 'w') as out_file:
    out_file.write(en_serialized)

In [None]:
# NL: connect each lemma to the URIs of the contentious labels it covers,
# matching every label against all wordforms of the lemma
nl_lemmas_of_label_uris = {}
for lemma, wordforms in query_terms['nl'].items():
    # one lemma can correspond to one or more label URIs
    matching_uris = [
        uri for uri, label in nl_uri_labels.items() if label in wordforms
    ]
    if matching_uris:  # lemmas without any matching label are left out
        nl_lemmas_of_label_uris[lemma] = matching_uris

In [None]:
# NL: write the lemma -> label URIs mapping to nl_lemmas_with_label_uris.json
nl_serialized = json.dumps(nl_lemmas_of_label_uris)
with open('nl_lemmas_with_label_uris.json', 'w') as out_file:
    out_file.write(nl_serialized)