### English

In [1]:
# install sparqlwrapper: https://sparqlwrapper.readthedocs.io/en/latest/main.html#installation-distribution

from SPARQLWrapper import SPARQLWrapper, JSON
import json
import pandas as pd
import requests

In [None]:
# SPARQL client bound to the endpoint URL below; the English query loop
# further down sets a query on it and runs it once per term.
sparql = SPARQLWrapper("http://kaiko.getalp.org/sparql")

#### 'en_cont_wordforms.json'

In [None]:
# putting query terms in the list (one term per line in cont_en.txt)

with open("cont_en.txt", "r") as f:
    # Strip surrounding whitespace and skip blank lines: an empty or padded
    # term would otherwise be sent to the endpoint as a query that can never
    # match a lemma.
    en_cont_terms = [line.strip() for line in f if line.strip()]

In [None]:
# sanity check: number of English query terms that were loaded
len(en_cont_terms)

In [None]:
# Constant head of every English query: the SELECT clause and the triple
# patterns. The per-term FILTER clauses and the closing brace are appended
# when the query for a term is built.

query_srting_1 = "\n".join([
    "SELECT ?entry ?lemma_lit ?pos ?otherForm_lit",
    "WHERE {",
    "?entry a ontolex:LexicalEntry ;",
    'lime:language "en" ;',
    "lexinfo:partOfSpeech ?pos ;",
    "ontolex:canonicalForm/ontolex:writtenRep ?lemma_lit ;",
    "ontolex:otherForm / ontolex:writtenRep ?otherForm_lit .",
])

In [None]:
# querying takes approx 18 min

# Characters that have a special meaning inside a regular expression. Each one
# is backslash-escaped so that a query term is always matched literally.
REGEX_METACHARS = set("\\.^$*+?()[]{}|")

# Restricts results to nouns and adjectives (same for every term).
POS_FILTER = ("FILTER (?pos = <http://www.lexinfo.net/ontology/2.0/lexinfo#noun> || "
              "?pos = <http://www.lexinfo.net/ontology/2.0/lexinfo#adjective>)")


def sparql_regex_literal(term):
    """Return a single-quoted SPARQL string literal holding the regex ``^term$``.

    Regex metacharacters in ``term`` are escaped so the term matches literally,
    then backslashes and single quotes are escaped for the SPARQL string
    syntax. For a plain alphabetic term the result is exactly ``'^term$'``.
    """
    literal_term = "".join("\\" + ch if ch in REGEX_METACHARS else ch for ch in term)
    pattern = f"^{literal_term}$"
    # Backslashes must be doubled before quotes are escaped, otherwise the
    # backslash added for the quote would itself get doubled.
    escaped = pattern.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


data_en = {} # the resulting data in dict
# {"query_term": {"lemmata":[{"lemma_URI":"*","lemma":"*","pos":"*","wordforms":["*"]}]}}

# the return format does not depend on the term, so it is set once
sparql.setReturnFormat(JSON)

for term in en_cont_terms:

    # generating a query string for each term:
    # case-insensitive exact match on the lemma + part-of-speech restriction
    query_string = (query_srting_1
                    + f"\nFILTER regex(?lemma_lit, {sparql_regex_literal(term)}, 'i')\n"
                    + POS_FILTER
                    + "}")

    sparql.setQuery(query_string)
    bindings = sparql.query().convert()['results']['bindings']

    # the results are converted into df to group by 'entry_URI';
    # explicit columns keep the groupby valid when a term has no results
    results_pd = pd.DataFrame(
        [{"entry_URI": binding['entry']['value'],
          "lemma_lit": binding['lemma_lit']['value'],
          # keep only the fragment after '#', e.g. 'noun'
          "pos": binding['pos']['value'].split("#")[1],
          "otherForm_lit": binding['otherForm_lit']['value']}
         for binding in bindings],
        columns=["entry_URI", "lemma_lit", "pos", "otherForm_lit"])

    # shaping the resulting dataset: one record per lexical entry

    lemmata = []
    for entry_uri, rows in results_pd.groupby("entry_URI"):
        lemma = rows['lemma_lit'].iloc[0]
        lemmata.append({"lemma_URI": entry_uri,
                        "lemma": lemma,
                        "pos": rows['pos'].iloc[0],
                        # also adding a canonical form to the wordforms
                        "wordforms": list(rows['otherForm_lit']) + [lemma]})

    data_en[term] = {"lemmata":lemmata}

In [None]:
# writing the English result dict to disk as JSON

with open("en_cont_wordforms.json", "w") as outfile:
    serialized_en = json.dumps(data_en)
    outfile.write(serialized_en)

### Dutch

#### 'nl_cont_wordforms.json'

In [None]:
# putting all the query terms in one string
# LexiconService takes a string with multiple terms

# NOTE(review): the file is read with the platform default encoding; since it
# contains soft hyphens (non-ASCII), confirm that matches the file's encoding.
with open("cont_nl.txt", "r") as f:
    # remove soft hyphens, then surrounding whitespace (including the newline)
    clean_labels = [label.replace('\xad', '').strip() for label in f]

# blank lines would otherwise become empty terms (stray commas) in the request
clean_labels = [label for label in clean_labels if label]

# comma-separated list of all terms, without a trailing comma
str_labels = ",".join(clean_labels)
# kept for compatibility with the previous version of this cell:
# the same list with a comma after every label
all_labels = "".join(f"{label}," for label in clean_labels)

In [None]:
# HTTP request to LexiconService: getting lemmas of Dutch contentious terms

url = 'https://sk.taalbanknederlands.inl.nl/LexiconService/lexicon/get_lemma'
query = {'database': 'molex', # modern Dutch lexicon
         'case_sensitive':'false',
         'wordform': str_labels} # querying all the terms at once
headers = {'Accept': 'application/json'} # request json format

# Without a timeout requests waits indefinitely for an unresponsive server;
# the limit is generous because all terms are looked up in a single call.
r = requests.get(url, params=query, headers=headers, timeout=300)
# Fail here with a clear HTTP error instead of a confusing JSON decode error
# when the service answers with an error page.
r.raise_for_status()
lemmas = r.json()

In [None]:
# collecting the IDs of the lemmas found for every query term

list_of_lemma_ids = [
    found['lemma_id']
    for term_result in lemmas['lemmata_list']
    for found in term_result['found_lemmata']
    if found['pos'] in ('AA', 'NOU-C')  # including only nouns and adjectives
]

In [None]:
# sanity check: number of lemma IDs collected (one word-form request is sent per ID)
len(list_of_lemma_ids)

In [None]:
# HTTP request to LexiconService: getting word forms by lemma IDs

url = 'https://sk.taalbanknederlands.inl.nl/LexiconService/lexicon/get_wordforms_from_lemma_id'
headers = {'Accept': 'application/json'} # request json format

# making a dict of {"lemma_id": [wordforms]}

lemmaID_wordforms = {}

# One Session reuses the HTTP connection across the many small requests
# instead of opening a new connection for every lemma ID.
with requests.Session() as session:
    for lemma_id in list_of_lemma_ids:

        # the same lemma can be found for several query terms; fetch it once
        if lemma_id in lemmaID_wordforms:
            continue

        query = {'database': 'molex',
                 'case_sensitive':'false',
                 'lemma_id': lemma_id} # only one ID at a time

        # timeout: do not hang forever on an unresponsive server
        response = session.get(url, params=query, headers=headers, timeout=60)
        # surface HTTP errors here rather than as a JSON decode error below
        response.raise_for_status()
        wordforms = response.json()
        # one ID was queried, so the first (only) list entry holds its forms
        lemmaID_wordforms[lemma_id] = wordforms['wordforms_list'][0]['found_wordforms']

In [None]:
# shaping the final dataset
# {"query_word": {"lemmata": [lemma dict + "wordforms" + "dataset"]}}

KEPT_POS = ('AA', 'NOU-C') # only nouns and adjectives had word forms queried

data_nl = {}
for term_result in lemmas['lemmata_list']:

    # Build a new filtered list instead of removing items from the list being
    # iterated: removing during iteration skips the element after each removed
    # one, which let some unwanted lemmas slip through. Working on copies also
    # leaves `lemmas` untouched, so this cell can be re-run safely.
    kept_lemmata = []
    for found in term_result['found_lemmata']:
        if found['pos'] not in KEPT_POS: # dropping lemmas with no word forms queried
            continue

        lemma_record = dict(found)
        if found['lemma_id'] in lemmaID_wordforms: # matching word forms with lemmas IDs
            lemma_record['wordforms'] = lemmaID_wordforms[found['lemma_id']]
            lemma_record['dataset'] = 'int/molex' # adding info about the dataset
        kept_lemmata.append(lemma_record)

    data_nl[term_result['query_word']] = {"lemmata": kept_lemmata}

In [None]:
# writing the Dutch result dict to disk as JSON

with open('nl_cont_wordforms.json', 'w') as outfile:
    serialized_nl = json.dumps(data_nl)
    outfile.write(serialized_nl)