In [1]:
import json
import rdflib
from rdflib import Graph
from rdflib.namespace import Namespace
from rdflib.namespace import SKOS, RDF

### Querying the Words Matter knowledge graph (WM KG) to get pairs of contentious and suggested terms

In [2]:
# Custom namespaces.
# `culco` and `skosxl` are the prefix names used in the SPARQL queries further down
# (culco:suggestedFor, culco:hasSuggestedLabel, skosxl:literalForm).
# `dcterms` is declared here but not referenced in the cells shown in this notebook.
culco = Namespace("https://w3id.org/culco#")
skosxl = Namespace("http://www.w3.org/2008/05/skos-xl#")
dcterms = Namespace("http://purl.org/dc/terms/")

In [3]:
# Path to the glossary Turtle file that is parsed into the graph in the next cell.
# TODO: change to GitHub. This is an absolute path on the author's machine, so the
# notebook cannot be re-run elsewhere without editing it.
path_to_wm = '/Users/anesterov/reps/wordsmatter/glossary.ttl'

In [4]:
# Load the glossary (Turtle serialisation) into an in-memory rdflib graph.
# parse() stays the last expression of the cell so its return value is echoed as output.
wm = Graph()
wm.parse(source=path_to_wm, format="turtle")

<Graph identifier=N02775f7b95114677bdef75d5fcc7b405 (<class 'rdflib.graph.Graph'>)>

In [None]:
# Get cont_label URI instead, then retrieve query terms for every URI
# Get terms by language

### EN

In [5]:
# For every contentious label whose literal form is tagged "en", collect the literal
# forms of all labels suggested for it. The suggestions belonging to one contentious
# label are concatenated into a single comma-separated string (?sug_label_list).
#
# The culco: and skosxl: prefixes are bound explicitly through initNs, using the
# namespaces declared at the top of the notebook, so the query does not depend on
# which prefixes happen to be declared inside the parsed Turtle file.
suggested_terms_en = wm.query(
    """SELECT ?cont_label_URI (GROUP_CONCAT(?sug_label;SEPARATOR=",") AS ?sug_label_list)
    WHERE {
    ?Suggestion culco:suggestedFor ?cont_label_URI ;
        culco:hasSuggestedLabel / skosxl:literalForm ?sug_label .

    ?cont_label_URI skosxl:literalForm ?cont_lit .
    FILTER (lang(?cont_lit) = "en") }

    GROUP BY ?cont_label_URI
    """,
    initNs={"culco": culco, "skosxl": skosxl},
)

In [None]:
# Target structure: {"cont_label_URI": ["suggested_term", ...]}

In [9]:
# Map each contentious-label identifier (the label URI with the
# 'https://w3id.org/culco/wordsmatter/' part removed) to its list of suggested terms.
# NOTE: the concatenated suggestions string is split on ',', so a suggested label
# that itself contains a comma would end up as several separate entries.
suggested_en = {
    str(row.cont_label_URI).replace('https://w3id.org/culco/wordsmatter/', ''):
        list(map(str, row.sug_label_list.split(',')))
    for row in suggested_terms_en
}

In [None]:
# Next: link query terms to their corresponding suggestions

In [11]:
# Load the related-matches JSON; later cells read rms[<label id>]['query_terms'].
# The encoding is pinned to UTF-8 so that reading the file does not depend on the
# platform's default locale encoding.
with open('/Users/anesterov/reps/wordsmatter/related_matches/rm.json', 'r', encoding='utf-8') as jf:
    rms = json.load(jf)

In [12]:
# Link every EN query term to the suggestions of the contentious label(s) it is
# listed under in rms[<label id>]['query_terms'].
#
# A query term that appears under several labels receives the union of their
# suggestions (first-seen order, no repeats) instead of silently keeping only the
# suggestions of whichever label was processed last. Every term gets its own list
# object, so the per-term lists are not aliases of one another.
#
# A label id that is missing from `rms` still raises KeyError, which surfaces a
# mismatch between the glossary and the related-matches file.
en_query_terms_with_suggestions = {}

for cont_label_URI, suggestions_list in suggested_en.items():
    for term in rms[cont_label_URI]['query_terms']:
        merged = en_query_terms_with_suggestions.setdefault(term, [])
        for suggestion in suggestions_list:
            if suggestion not in merged:
                merged.append(suggestion)

In [20]:
# Number of EN query terms that were linked to a list of suggestions
len(en_query_terms_with_suggestions)

50

In [14]:
# Load the query terms JSON; later cells read qt["en"] and qt["nl"].
# The encoding is pinned to UTF-8 so that reading the file does not depend on the
# platform's default locale encoding.
with open('/Users/anesterov/reps/LODlit/query_terms.json', 'r', encoding='utf-8') as jf:
    qt = json.load(jf)

In [15]:
# Flatten the per-key wordform collections under qt["en"] into one list
# (order of the source file kept, duplicates kept).
qt_en = [wordform for wordforms in qt["en"].values() for wordform in wordforms]

In [16]:
# Total number of EN entries collected from qt["en"]
len(qt_en)

154

In [17]:
# EN dict with suggestions: every EN query term becomes a key; a term that was not
# linked to any suggestions maps to a fresh empty list.
en_qt_with_suggestions = {
    term: en_query_terms_with_suggestions.get(term, [])
    for term in qt_en
}

In [19]:
# Export the EN term -> suggestions mapping as JSON into the working directory.
with open('en_suggestions.json', 'w') as out_file:
    out_file.write(json.dumps(en_qt_with_suggestions))

### NL

In [22]:
# For every contentious label whose literal form is tagged "nl", collect the literal
# forms of all labels suggested for it. The suggestions belonging to one contentious
# label are concatenated into a single comma-separated string (?sug_label_list).
#
# The culco: and skosxl: prefixes are bound explicitly through initNs, using the
# namespaces declared at the top of the notebook, so the query does not depend on
# which prefixes happen to be declared inside the parsed Turtle file.
suggested_terms_nl = wm.query(
    """SELECT ?cont_label_URI (GROUP_CONCAT(?sug_label;SEPARATOR=",") AS ?sug_label_list)
    WHERE {
    ?Suggestion culco:suggestedFor ?cont_label_URI ;
        culco:hasSuggestedLabel / skosxl:literalForm ?sug_label .

    ?cont_label_URI skosxl:literalForm ?cont_lit .
    FILTER (lang(?cont_lit) = "nl") }

    GROUP BY ?cont_label_URI
    """,
    initNs={"culco": culco, "skosxl": skosxl},
)

In [23]:
# Map each contentious-label identifier (the label URI with the
# 'https://w3id.org/culco/wordsmatter/' part removed) to its list of suggested terms.
# NOTE: the concatenated suggestions string is split on ',', so a suggested label
# that itself contains a comma would end up as several separate entries.
suggested_nl = {
    str(row.cont_label_URI).replace('https://w3id.org/culco/wordsmatter/', ''):
        list(map(str, row.sug_label_list.split(',')))
    for row in suggested_terms_nl
}

In [24]:
# Link every NL query term to the suggestions of the contentious label(s) it is
# listed under in rms[<label id>]['query_terms'].
#
# A query term that appears under several labels receives the union of their
# suggestions (first-seen order, no repeats) instead of silently keeping only the
# suggestions of whichever label was processed last. Every term gets its own list
# object, so the per-term lists are not aliases of one another.
#
# A label id that is missing from `rms` still raises KeyError, which surfaces a
# mismatch between the glossary and the related-matches file.
nl_query_terms_with_suggestions = {}

for cont_label_URI, suggestions_list in suggested_nl.items():
    for term in rms[cont_label_URI]['query_terms']:
        merged = nl_query_terms_with_suggestions.setdefault(term, [])
        for suggestion in suggestions_list:
            if suggestion not in merged:
                merged.append(suggestion)

In [27]:
# Number of NL query terms that were linked to a list of suggestions
len(nl_query_terms_with_suggestions)

80

In [21]:
# Flatten the per-key wordform collections under qt["nl"] into one list
# (order of the source file kept, duplicates kept).
qt_nl = [wordform for wordforms in qt["nl"].values() for wordform in wordforms]

In [25]:
# NL dict with suggestions: every NL query term becomes a key; a term that was not
# linked to any suggestions maps to a fresh empty list.
nl_qt_with_suggestions = {
    term: nl_query_terms_with_suggestions.get(term, [])
    for term in qt_nl
}

In [26]:
# Number of keys (NL query terms) in the export dict, including terms whose suggestion list is empty
len(nl_qt_with_suggestions)

242

In [28]:
# Export the NL term -> suggestions mapping as JSON into the working directory.
with open('nl_suggestions.json', 'w') as out_file:
    out_file.write(json.dumps(nl_qt_with_suggestions))

In [None]:
# How many query terms don't have any suggestions? (also check by lemma)
# EN: 104 query terms
# NL: 162 query terms

### Unique suggestions by lang

In [30]:
# Flatten the per-label EN suggestion lists into one list (duplicates kept).
all_sug_en = [term for sug in suggested_en.values() for term in sug]

# Unique EN suggestions. Sorted so that the order is the same on every kernel run;
# the iteration order of a set of strings can differ between interpreter sessions.
en_sug_unique = sorted(set(all_sug_en))

In [32]:
# Number of distinct EN suggested terms
len(en_sug_unique)

41

In [34]:
# Flatten the per-label NL suggestion lists into one list (duplicates kept).
all_sug_nl = [term for sug in suggested_nl.values() for term in sug]

# Unique NL suggestions. Sorted so that the order is the same on every kernel run;
# the iteration order of a set of strings can differ between interpreter sessions.
nl_sug_unique = sorted(set(all_sug_nl))

In [35]:
# Number of distinct NL suggested terms
len(nl_sug_unique)

48

In [None]:
### Terms without suggestions by lemma