In [None]:
import json
import requests
import rdflib
from rdflib import Graph
from rdflib.namespace import Namespace
from rdflib.namespace import SKOS, RDF

### Querying WM KG to get pairs of contentious and suggested terms

In [None]:
# custom namespaces
culco = Namespace("https://w3id.org/culco#")
skosxl = Namespace("http://www.w3.org/2008/05/skos-xl#")
dcterms = Namespace("http://purl.org/dc/terms/")

In [None]:
# Location of glossary.ttl in the cultural-ai/wordsmatter GitHub repository
# (a URL, not a local path)
path_to_wm = 'https://github.com/cultural-ai/wordsmatter/raw/main/glossary.ttl'

In [None]:
# Load the glossary into an rdflib graph; `path_to_wm` is a URL, so parsing
# needs network access. The file is read as Turtle.
wm = Graph()
wm.parse(path_to_wm, format="turtle")

In [None]:
# Get cont_label URI instead, then retrieve query terms for every URI
# Get terms by language

### EN

In [None]:
suggested_terms_en = wm.query(
    """SELECT ?cont_label_URI (GROUP_CONCAT(?sug_label;SEPARATOR=",") AS ?sug_label_list)
    WHERE {
    ?Suggestion culco:suggestedFor ?cont_label_URI ;
        culco:hasSuggestedLabel / skosxl:literalForm ?sug_label .
        
    ?cont_label_URI skosxl:literalForm ?cont_lit .
    FILTER (lang(?cont_lit) = "en") }
    
    GROUP BY ?cont_label_URI
    """)

In [None]:
# {"cont_label_URI":["suggested_term"]}

In [None]:
suggested_en = {}

for row in suggested_terms_en:
    suggested_en[str(row.cont_label_URI).replace('https://w3id.org/culco/wordsmatter/','')] = [str(s) for s in row.sug_label_list.split(',')]

In [None]:
# Next: link query terms to their corresponding suggestions

In [None]:
# rm.json from the wordsmatter repository; later cells read it as
# rms[<label id>]['query_terms']
url = 'https://raw.githubusercontent.com/cultural-ai/wordsmatter/main/related_matches/rm.json'
# A timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)
# Stop here with a clear HTTP error instead of a confusing JSON decode error
# when the server answers with an error page
r.raise_for_status()
rms = r.json()

In [None]:
# Map every EN query term to the suggestion list of the label it belongs to.
# NOTE(review): a query term listed under several labels keeps only the
# suggestions of the last label visited.
en_query_terms_with_suggestions = {
    query_term: label_suggestions
    for label_id, label_suggestions in suggested_en.items()
    for query_term in rms[label_id]['query_terms']
}

In [None]:
# Number of distinct EN query terms that are linked to a suggestion list
len(en_query_terms_with_suggestions)

In [None]:
# Query terms file; later cells read it as qt[<language>] -> mapping whose
# values are lists of word forms.
# The encoding is explicit because the platform default (e.g. cp1252 on Windows)
# would mis-decode non-ASCII terms in a UTF-8 JSON file.
with open('LODlit/query_terms.json', 'r', encoding='utf-8') as jf:
    qt = json.load(jf)

In [None]:
# Flatten all EN word-form lists into a single list (order and repeats kept)
qt_en = [
    wordform
    for wordform_group in qt["en"].values()
    for wordform in wordform_group
]

In [None]:
# Total number of EN word forms (repeats, if any, are counted)
len(qt_en)

In [None]:
# EN dict with suggestions: every EN query term gets its suggestion list,
# or an empty list when no suggestion is linked to it
en_qt_with_suggestions = {
    query_term: en_query_terms_with_suggestions.get(query_term, [])
    for query_term in qt_en
}

In [None]:
# Export the EN mapping (query term -> suggestions) as JSON in the working directory
with open('en_suggestions.json', 'w') as out_file:
    out_file.write(json.dumps(en_qt_with_suggestions))

### NL

In [None]:
suggested_terms_nl = wm.query(
    """SELECT ?cont_label_URI (GROUP_CONCAT(?sug_label;SEPARATOR=",") AS ?sug_label_list)
    WHERE {
    ?Suggestion culco:suggestedFor ?cont_label_URI ;
        culco:hasSuggestedLabel / skosxl:literalForm ?sug_label .
        
    ?cont_label_URI skosxl:literalForm ?cont_lit .
    FILTER (lang(?cont_lit) = "nl") }
    
    GROUP BY ?cont_label_URI
    """)

In [None]:
suggested_nl = {}

for row in suggested_terms_nl:
    suggested_nl[str(row.cont_label_URI).replace('https://w3id.org/culco/wordsmatter/','')] = [str(s) for s in row.sug_label_list.split(',')]

In [None]:
# Map every NL query term to the suggestion list of the label it belongs to.
# NOTE(review): a query term listed under several labels keeps only the
# suggestions of the last label visited.
nl_query_terms_with_suggestions = {
    query_term: label_suggestions
    for label_id, label_suggestions in suggested_nl.items()
    for query_term in rms[label_id]['query_terms']
}

In [None]:
# Number of distinct NL query terms that are linked to a suggestion list
len(nl_query_terms_with_suggestions)

In [None]:
# Flatten all NL word-form lists into a single list (order and repeats kept)
qt_nl = [
    wordform
    for wordform_group in qt["nl"].values()
    for wordform in wordform_group
]

In [None]:
# NL dict with suggestions: every NL query term gets its suggestion list,
# or an empty list when no suggestion is linked to it
nl_qt_with_suggestions = {
    query_term: nl_query_terms_with_suggestions.get(query_term, [])
    for query_term in qt_nl
}

In [None]:
# Number of distinct NL query terms, with or without suggestions
len(nl_qt_with_suggestions)

In [None]:
# Export the NL mapping (query term -> suggestions) as JSON in the working directory
with open('nl_suggestions.json', 'w') as out_file:
    out_file.write(json.dumps(nl_qt_with_suggestions))

In [None]:
# How many query terms don't have any suggestions? (also check by lemma)
# EN: 104 qt
# NL: 162 qt

### Unique suggestions by lang

In [None]:
# Every suggestion attached to an EN contentious label, repeats included
all_sug_en = [
    suggestion
    for label_suggestions in suggested_en.values()
    for suggestion in label_suggestions
]

# De-duplicated via a set, so the resulting list has no meaningful order
en_sug_unique = list(set(all_sug_en))

In [None]:
# Number of unique suggestions across all EN contentious labels
len(en_sug_unique)

In [None]:
# Every suggestion attached to an NL contentious label, repeats included
all_sug_nl = [
    suggestion
    for label_suggestions in suggested_nl.values()
    for suggestion in label_suggestions
]

# De-duplicated via a set, so the resulting list has no meaningful order
nl_sug_unique = list(set(all_sug_nl))

In [None]:
# Number of unique suggestions across all NL contentious labels
len(nl_sug_unique)