In [1]:
import json
import os

import rdflib
from rdflib import Graph
from rdflib.namespace import Namespace
from rdflib.namespace import SKOS, RDF

### Querying the Words Matter knowledge graph (WM KG) to get pairs of contentious and suggested terms

In [2]:
# Namespace objects for the vocabularies used here: SKOS-XL, Dublin Core Terms and CULCO.
skosxl = Namespace("http://www.w3.org/2008/05/skos-xl#")
dcterms = Namespace("http://purl.org/dc/terms/")
culco = Namespace("https://w3id.org/culco#")

In [3]:
# TODO: change to GitHub (load the glossary from the repository rather than a local checkout).
# Until then the location can be overridden with the WM_GLOSSARY_PATH environment
# variable, so the notebook also runs on machines where the default path below
# does not exist.
path_to_wm = os.environ.get(
    'WM_GLOSSARY_PATH',
    '/Users/anesterov/reps/wordsmatter/glossary.ttl',
)

In [4]:
# Read the Turtle glossary at path_to_wm into a fresh rdflib Graph.
# The parse call stays the cell's last expression so its return value is displayed.
wm = Graph()
wm.parse(source=path_to_wm, format="turtle")

<Graph identifier=N7cad654673334dad9725d8eac94777f9 (<class 'rdflib.graph.Graph'>)>

In [None]:
# Get cont_label URI instead, then retrieve query terms for every URI
# Get terms by language

In [59]:
# For every contentious label whose literal form is tagged English, collect the
# literal forms of all labels suggested for it, concatenated into one
# comma-separated string per contentious-label URI.
#
# The culco: and skosxl: prefixes are passed explicitly through initNs, so the
# query no longer depends on the parsed Turtle file happening to declare them.
#
# NOTE(review): the result is later split on "," -- a suggested label that itself
# contains a comma would be broken apart; confirm none do.
# NOTE(review): only English labels are selected here, yet the lookup shown further
# down contains Dutch terms; the execution counts are out of order, so that output
# presumably came from a different version of this cell -- confirm.
suggested_terms = wm.query(
    """SELECT ?cont_label_URI (GROUP_CONCAT(?sug_label;SEPARATOR=",") AS ?sug_label_list)
    WHERE {
    ?Suggestion culco:suggestedFor ?cont_label_URI ;
        culco:hasSuggestedLabel / skosxl:literalForm ?sug_label .
    ?cont_label_URI skosxl:literalForm ?cont_lit .
    FILTER (lang(?cont_lit) = "en") }
    
    GROUP BY ?cont_label_URI
    """,
    initNs={"culco": culco, "skosxl": skosxl},
)

In [None]:
# {"cont_label_URI":["suggested_term"]}

In [60]:
# Map each contentious-label ID (its URI with the Words Matter base removed)
# to the list of suggested labels, obtained by splitting the concatenated
# query result on commas.
suggested = {
    str(row.cont_label_URI).replace('https://w3id.org/culco/wordsmatter/', ''): [
        str(label) for label in row.sug_label_list.split(',')
    ]
    for row in suggested_terms
}

In [61]:
# Inspect the label-ID -> suggested-labels mapping.
suggested

{'l_109': ['historic'],
 'l_50': ['mixed race'],
 'l_27': ['Black and Non-Black person of color',
  'Black and Non-Black People of Color',
  'Black and Non-Black persons of color',
  'Persons of Color',
  'Person of Color',
  'People of Color'],
 'l_48': ['Roma'],
 'l_34': ['People with disabilities', 'Disabled people', 'Differently abled'],
 'l_39': ['someone with dwarfism',
  'a person of short stature',
  'little person'],
 'l_01': ['Aboriginal',
  'Torres Strait Islander peoples',
  'First Nation peoples',
  'First Nation people',
  'First Nations people',
  'Indigenous',
  'Aboriginal peoples',
  'Aboriginal people'],
 'l_86': ['Asian'],
 'l_26': ['Caucasian', 'White'],
 'l_17': ['Mumbai'],
 'l_79': ['Muslim'],
 'l_93': ['LGBT'],
 'l_85': ['Black'],
 'l_112': ['cross-dresser'],
 'l_57': ['intersex'],
 'l_101': ['enslaved person', 'Enslaved'],
 'l_24': ['Maroon'],
 'l_11': ['Imazighen', 'Amazigh'],
 'l_60': ['Khoisan'],
 'l_76': ['spiritual healer', 'Traditional healer'],
 'l_66': 

In [None]:
# Next: link query terms to their corresponding suggestions

In [24]:
# Load the related-matches file. It is indexed below by contentious-label ID,
# and each entry provides a 'query_terms' list.
# The encoding is given explicitly: the query terms include non-ASCII characters,
# and open() would otherwise fall back to the platform's default encoding.
with open('/Users/anesterov/reps/wordsmatter/related_matches/rm.json', 'r', encoding='utf-8') as jf:
    rms = json.load(jf)

In [29]:
# Give every query term the suggestion list of the contentious label it is
# listed under. A term listed under several labels keeps the list of the
# label visited last.
query_terms_with_suggestions = {
    term: suggestions_list
    for cont_label_URI, suggestions_list in suggested.items()
    for term in rms[cont_label_URI]['query_terms']
}

In [30]:
# Inspect the query-term -> suggestions lookup.
query_terms_with_suggestions

{'disabled': ['Disabled people',
  'Differently abled',
  'People with disabilities'],
 'disableds': ['Disabled people',
  'Differently abled',
  'People with disabilities'],
 'oriëntaals': ['Aziatisch'],
 'oriëntaalste': ['Aziatisch'],
 'oriëntaalsere': ['Aziatisch'],
 'oriëntaal': ['Aziatisch'],
 'oriëntale': ['Aziatisch'],
 'oriëntalere': ['Aziatisch'],
 'oriëntaalse': ['Aziatisch'],
 'oriëntaalst': ['Aziatisch'],
 'oriëntaler': ['Aziatisch'],
 'oriëntaalser': ['Aziatisch'],
 'gekleurd': ['niet\xad-zwarte mensen van kleur',
  'mensen van kleur',
  'Iemand van kleur',
  'Zwart'],
 'gekleurdste': ['niet\xad-zwarte mensen van kleur',
  'mensen van kleur',
  'Iemand van kleur',
  'Zwart'],
 'gekleurdst': ['niet\xad-zwarte mensen van kleur',
  'mensen van kleur',
  'Iemand van kleur',
  'Zwart'],
 'gekleurds': ['niet\xad-zwarte mensen van kleur',
  'mensen van kleur',
  'Iemand van kleur',
  'Zwart'],
 'gekleurde': ['niet\xad-zwarte mensen van kleur',
  'mensen van kleur',
  'Iemand van 

In [None]:
# How many query terms don't have any suggestions? (also check by lemma)
# EN: 101 qt
# NL: 162 qt

In [31]:
# Load the query terms. The cells below read the "en" and "nl" entries, whose
# values are lists of word forms.
# The encoding is given explicitly so that non-ASCII word forms are decoded the
# same way on every platform instead of using the locale default.
with open('/Users/anesterov/reps/LODlit/query_terms.json', 'r', encoding='utf-8') as jf:
    qt = json.load(jf)

In [33]:
# Flatten the per-entry word-form lists of the English query terms into one list.
qt_en = [form for wordforms in qt["en"].values() for form in wordforms]

In [34]:
# Number of English word forms (list length, so a repeated form counts each time).
len(qt_en)

154

In [37]:
# English query terms mapped to their suggestions; a term without any known
# suggestion is kept with an empty list.
en_qt_with_suggestions = {
    term: query_terms_with_suggestions.get(term, [])
    for term in qt_en
}

In [45]:
# Write the English lookup to en_suggestions.json in the working directory.
with open('en_suggestions.json', 'w') as out_file:
    out_file.write(json.dumps(en_qt_with_suggestions))

In [55]:
# How many English query terms have at least one suggestion.
sum(len(suggestions) > 0 for suggestions in en_qt_with_suggestions.values())

53

In [41]:
# Flatten the per-entry word-form lists of the Dutch query terms into one list.
qt_nl = [form for wordforms in qt["nl"].values() for form in wordforms]

In [42]:
# Dutch query terms mapped to their suggestions; a term without any known
# suggestion is kept with an empty list.
nl_qt_with_suggestions = {
    term: query_terms_with_suggestions.get(term, [])
    for term in qt_nl
}

In [43]:
# Number of distinct Dutch query terms in the lookup (dict keys).
len(nl_qt_with_suggestions)

242

In [56]:
# How many Dutch query terms have at least one suggestion.
sum(len(suggestions) > 0 for suggestions in nl_qt_with_suggestions.values())

80

In [46]:
### Unique suggestions
# Pool the suggestion lists of all English query terms into one flat list
# (duplicates are still present at this point).
all_sug = [label for suggestions in en_qt_with_suggestions.values() for label in suggestions]

In [49]:
# Collapse exact duplicates; the resulting order is whatever the set yields.
sug_unique = [*set(all_sug)]

In [51]:
# Normalise the suggestions (strip soft hyphens, lowercase) and de-duplicate
# again afterwards: entries that differed only in case or in soft hyphens
# become identical once normalised, so the list would otherwise stop being unique.
# dict.fromkeys keeps the first occurrence and the existing order.
sug_unique = list(dict.fromkeys(s.replace('\xad', '').lower() for s in sug_unique))

In [None]:
### Terms without suggestions by lemma