In [None]:
import rdflib
from rdflib import Graph
from rdflib.namespace import Namespace
from rdflib.namespace import SKOS, RDF
import re
import json
import csv

### Querying NMVW / Saving the query results in json

In [None]:
# creating an empty rdflib graph; the thesaurus triples are parsed into it further down
nmvw = Graph()

In [None]:
# container for the query results: one entry per concept, keyed by the concept URI
nmvw_json = dict()

In [None]:
# loading the triples of every thesaurus file into the graph
for number in range(1, 45):
    # the thesaurus is spread over 44 RDF/XML files in one directory;
    # their names differ only in the running number (1..44)
    filename = ''.join(['/nmvw_rdf/ThesRDFXML_', str(number), '.rdf'])
    nmvw.parse(filename, format="xml")

In [None]:
# SPARQL query that is run against the NMVW graph.
# For every skos:Concept that has a prefLabel and a scheme it selects the concept URI,
# the prefLabel, the scheme and (if present) an exactMatch; all notes and all altLabels
# of a concept are concatenated into one string each, with '#' as the separator.
# The three OPTIONAL patterns are combined per concept, so the same altLabel or note can
# occur several times in the concatenated strings.
# NOTE(review): ?prefLabel, ?exactMatch and ?scheme are selected without an aggregate and
# are not listed in GROUP BY; presumably rdflib returns one arbitrary value per concept
# for them - confirm, especially for concepts with several prefLabels or exactMatches.
# NOTE(review): a label or note that itself contains '#' cannot be told apart from the
# separator once concatenated - TODO confirm this does not occur in the data.
query = """        
        SELECT ?concept ?prefLabel (GROUP_CONCAT(?note;SEPARATOR="#") AS ?notes) (GROUP_CONCAT(?altLabel;SEPARATOR="#") AS ?altLabels) ?exactMatch ?scheme
        
        WHERE {
        
        ?concept rdf:type skos:Concept ;
                 skos:prefLabel ?prefLabel ;
                 skos:inScheme ?scheme .
                 
        OPTIONAL {?concept skos:exactMatch ?exactMatch}
        OPTIONAL {?concept skos:altLabel ?altLabel}
        OPTIONAL {?concept skos:note ?note}
        
        }
        GROUP BY ?concept
        """

In [None]:
# the query text uses the skos: and rdf: prefixes without declaring them,
# so both namespaces are bound when the query is executed
namespace_bindings = {'skos': SKOS, 'rdf': RDF}
query_results = nmvw.query(query, initNs=namespace_bindings)

In [None]:
# shaping json
def _unique_parts(concatenated, separator='#'):
    """Split a GROUP_CONCAT string into its distinct, non-empty parts.

    Parameters:
        concatenated: the concatenated value from a query result row; may be
            None or an empty string when the concept has no such values.
        separator: the separator that was used in GROUP_CONCAT.

    Returns:
        A sorted list of the distinct parts; an empty list when there are none.
        Sorting keeps the saved JSON stable between runs, which a plain
        list(set(...)) does not.
    """
    text = '' if concatenated is None else str(concatenated)
    if not text:
        return []
    return sorted({part for part in text.split(separator) if part})


for result in query_results.bindings:
    # altLabels and notes get the same treatment: duplicates are removed and a
    # concept without any values gets an empty list (never [''])
    altLabels = _unique_parts(result.get('altLabels'))
    notes = _unique_parts(result.get('notes'))

    nmvw_json[str(result['concept'])] = {
        'prefLabel': str(result['prefLabel']),
        'altLabel': altLabels,
        'notes': notes,
        # str() turns a missing exactMatch into the string 'None'; kept as is so
        # that the layout of the saved JSON does not change
        'exactMatch': str(result.get('exactMatch')),
        'scheme': str(result['scheme']),
    }

In [None]:
# number of concepts collected from the query results (one dict entry per concept URI)
len(nmvw_json)

In [None]:
# writing the collected NMVW concepts to a JSON file in the working directory
with open('nmvw_thesaurus.json', 'w') as json_out:
    json.dump(nmvw_json, fp=json_out)

### Finding contentious terms in NMVW

In [None]:
# importing the contentious terms (NL)

# the encoding is named explicitly so that any non-ASCII characters in the terms
# are decoded as UTF-8 instead of with the platform's default encoding
with open('query_terms_cont_nl.json','r',encoding='utf-8') as jf:
    query_terms_cont_nl = json.load(jf)

In [None]:
# Searching every query term (lemma and word forms) in the prefLabel and altLabels
# of every NMVW concept and writing one CSV row per hit.
# newline='' is what the csv module expects for files it writes to (otherwise rows
# can end up separated by blank lines on Windows); the explicit encoding keeps the
# output independent of the platform default
with open('cont_terms_in_NMVW.csv','w',newline='',encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma','query_term','nmvw_URI','prefLabel','altLabel','notes','exactMatch','scheme','found_in']
    writer.writerow(header)

    for lemma, wordforms in query_terms_cont_nl.items():
        # the lemma itself is searched first, followed by its word forms
        list_of_query_terms = [lemma, *wordforms]

        for query_term in list_of_query_terms:
            # whole-word, case-insensitive match; the pattern is built once per term
            # instead of once per concept, and re.escape makes the term match
            # literally even if it contains regex metacharacters
            term_pattern = re.compile(f"\\b{re.escape(query_term)}\\b", re.IGNORECASE)

            for handle, values in nmvw_json.items():

                found_in = ''

                # searching in prefLabel
                if term_pattern.search(values['prefLabel']) is not None:
                    found_in = 'prefLabel'
                    # altLabel and notes are lists; csv writes their string representation
                    data = [lemma,query_term,handle,values['prefLabel'],values['altLabel'],values['notes'],values['exactMatch'], values['scheme'],found_in]
                    writer.writerow(data)

                # searching in altLabel: at most one row per concept, however many
                # of its altLabels match (a concept can still get a prefLabel row too)
                if any(term_pattern.search(label) is not None for label in values['altLabel']):
                    found_in = 'altLabel'
                    data = [lemma,query_term,handle,values['prefLabel'],values['altLabel'],values['notes'],values['exactMatch'], values['scheme'],found_in]
                    writer.writerow(data)