## Convert related matches to json, add their literal values

In [1]:
import csv
import json
import pandas as pd
import requests

In [2]:
related_matches = pd.read_csv('rm.csv')

In [4]:
# generating a dict
related_matches_dict = {}

for row in related_matches.iterrows():
    
    rm_per_label = {'aat':[row[1].aat],
                    'wikidata':[row[1].wikidata],
                    'pwn':row[1].pwn.split('#'),
                    'odwn':row[1].odwn.split('#'),
                    'nmvw':[row[1].nmvw]}
    
    related_matches_dict[row[1].label_uri] = {'literal_form':row[1].literal,
                                             'lang':row[1].lang,
                                             'query_terms':row[1].query_terms.split('#'),
                                             'related_matches':rm_per_label}

In [None]:
# saving related matches as json
with open('rm.json', 'w') as jf:
    json.dump(related_matches_dict, jf)

In [48]:
with open('rm.json', 'r') as jf:
    related_matches_dict = json.load(jf)

### Merging related matches with literals

#### Importing literals from the json files
 the files were generated with lodlitparser.get_literals()

In [49]:
# AAT EN
path_aat_en = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/aat_rm_en.json'
aat_en = requests.get(path_aat_en).json()

# AAT NL
path_aat_nl = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/aat_rm_nl.json'
aat_nl = requests.get(path_aat_nl).json()

# Wikidata EN
path_wikidata_en = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/wikidata_rm_en.json'
wikidata_en = requests.get(path_wikidata_en).json()

# Wikidata NL
path_wikidata_nl = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/wikidata_rm_nl.json'
wikidata_nl = requests.get(path_wikidata_nl).json()

# PWN
path_pwn = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/pwn_rm.json'
pwn_rm = requests.get(path_pwn).json()

# ODWN
path_odwn = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/odwn_rm.json'
odwn_rm = requests.get(path_odwn).json()

# NMVW
path_nmvw = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/nmvw_rm.json'
nmvw_rm = requests.get(path_nmvw).json()

In [50]:
# this code updates related_matches_dict
for label_uri, values in related_matches_dict.items():
    
    literals_per_label = {}
    
    # EN
    if values['lang'] == 'en':
        # aat
        per_resource =[]
        if values['related_matches']['aat'][0] != 'None':
            aat_uri = values['related_matches']['aat'][0]
            aat_en[aat_uri]['aat_uri'] = aat_uri
            per_resource.append(aat_en[aat_uri])

        literals_per_label['aat_en'] = per_resource
        
        # wikidata
        per_resource =[]
        if values['related_matches']['wikidata'][0] != 'None':
            wikidata_q = values['related_matches']['wikidata'][0]
            for result in wikidata_en:
                if wikidata_q in result.keys():
                    per_resource.append(result[wikidata_q])

        literals_per_label['wikidata_en'] = per_resource
        
        # PWN
        per_resource = []
        for synset_id in values['related_matches']['pwn']:
            if synset_id != 'None':
                pwn_rm[synset_id]['synset_id'] = synset_id
                dict_per_hit = pwn_rm[synset_id]
                per_resource.append(dict_per_hit)
            
        literals_per_label['pwn'] = per_resource
        
    # NL
    if values['lang'] == 'nl':
        # aat
        per_resource = []
        if values['related_matches']['aat'][0] != 'None':
            aat_uri = values['related_matches']['aat'][0]
            aat_nl[aat_uri]['aat_uri'] = aat_uri
            per_resource.append(aat_nl[aat_uri])

        literals_per_label['aat_nl'] = per_resource
        
        # wikidata
        per_resource =[]
        if values['related_matches']['wikidata'][0] != 'None':
            wikidata_q = values['related_matches']['wikidata'][0]
            for result in wikidata_nl:
                if wikidata_q in result.keys():
                    per_resource.append(result[wikidata_q])

        literals_per_label['wikidata_nl'] = per_resource
        
        #odwn
        per_resource = []
        for le_id in values['related_matches']['odwn']:
            if le_id != 'None':
                odwn_rm[le_id]['LE_id'] = le_id
                dict_per_hit = odwn_rm[le_id]
                per_resource.append(dict_per_hit)
                
        literals_per_label['odwn'] = per_resource
        
        #nmvw
        per_resource = []
        if values['related_matches']['nmvw'][0] != 'None':
            nmvw_handle = f"https://hdl.handle.net/20.500.11840/termmaster{values['related_matches']['nmvw'][0]}"
            nmvw_rm[nmvw_handle]['nmvw_handle'] = nmvw_handle
            per_resource.append(nmvw_rm[nmvw_handle])
        
        literals_per_label['nmvw'] = per_resource
        
    values['related_matches_literals'] = literals_per_label

In [51]:
# saving the json file with literals of related matches
with open('rm_literals.json', 'w') as jf:
    json.dump(related_matches_dict, jf)

In [2]:
#### where contentious terms are found

path_to_rm_lit = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/rm_literals.json'
rm_lit = requests.get(path_to_rm_lit).json()

In [4]:
import re

In [37]:
rm_lit['l_154']['related_matches_literals']

{'aat_nl': [{'lang': 'nl',
   'prefLabel': 'bedienden',
   'altLabels': ['bediende', 'dienaren'],
   'prefLabel_comment': None,
   'altLabel_comment': None,
   'scopeNote': 'In algemene zin, personen die in dienst van particulieren persoonlijke of huishoudelijke werkzaamheden uitvoeren.',
   'aat_uri': '300025874'}],
 'odwn': [{'lemma': 'bediende',
   'sense_def': 'huisbediende',
   'examples': ['een bediende in livrei'],
   'synset_ID': 'eng-30-10582154-n',
   'synset_def': ['huisbediende'],
   'LE_id': 'bediende-n-2'}],
 'nmvw': []}

In [47]:
rm_lit['l_154']

{'literal_form': 'Bediende',
 'lang': 'nl',
 'query_terms': ['bediende', 'bediendes', 'bediend', 'bedienden'],
 'related_matches': {'aat': ['300025874'],
  'wikidata': ['Q833860'],
  'pwn': ['None'],
  'odwn': ['bediende-n-2'],
  'nmvw': ['None']},
 'related_matches_literals': {'aat_nl': [{'lang': 'nl',
    'prefLabel': 'bedienden',
    'altLabels': ['bediende', 'dienaren'],
    'prefLabel_comment': None,
    'altLabel_comment': None,
    'scopeNote': 'In algemene zin, personen die in dienst van particulieren persoonlijke of huishoudelijke werkzaamheden uitvoeren.',
    'aat_uri': '300025874'}],
  'odwn': [{'lemma': 'bediende',
    'sense_def': 'huisbediende',
    'examples': ['een bediende in livrei'],
    'synset_ID': 'eng-30-10582154-n',
    'synset_def': ['huisbediende'],
    'LE_id': 'bediende-n-2'}],
  'nmvw': []}}

In [26]:
for e in rm_lit['l_154']['related_matches_literals']['wikidata_en'][0]['aliases']['en']:
    print(e['value'])

indigenous peoples
aboriginal peoples
traditional peoples
native peoples
indigenous culture


In [45]:
for label_uri, label_info in rm_lit.items():
    hits = {}
    # checking the label language
    if label_info['lang'] == 'en':
        hits['aat_en'] = []
        hits['wikidata_en'] = []
        hits['pwn'] = []
        
        # in AAT
        if len(label_info['related_matches_literals']['aat_en']) > 0:
            hits_per_resource = []
            
            # checking every query term
            for query_term in label_info['query_terms']:
                
                # in prefLabel
                prefLabel = label_info['related_matches_literals']['aat_en'][0]['prefLabel']
                aat_uri = label_info['related_matches_literals']['aat_en'][0]['aat_uri']
                if len(re.findall(f'\\b{query_term}\\b',prefLabel,re.IGNORECASE)) > 0:
                    hit = {"query_term":query_term,"found_in":"prefLabel","aat_uri":aat_uri,"source":prefLabel}
                    hits_per_resource.append(hit)
                    
                # in altLabels
                for altLabel in label_info['related_matches_literals']['aat_en'][0]['altLabels']:
                    if len(re.findall(f'\\b{query_term}\\b',altLabel,re.IGNORECASE)) > 0:
                        hit = {"query_term":query_term,"found_in":"altLabel","aat_uri":aat_uri,"source":altLabel}
                        hits_per_resource.append(hit)
            
            hits['aat_en'] = hits_per_resource
                        
        # in Wikidata
        if len(label_info['related_matches_literals']['wikidata_en']) > 0:
            hits_per_resource = []
            
            # checking every query term
            for query_term in label_info['query_terms']:
                
                # in prefLabel
                prefLabel = label_info['related_matches_literals']['wikidata_en'][0]['labels']['en']['value']
                qid = label_info['related_matches_literals']['wikidata_en'][0]['id']
                if len(re.findall(f'\\b{query_term}\\b',prefLabel,re.IGNORECASE)) > 0:
                    hit = {"query_term":query_term,"found_in":"prefLabel","QID":qid,"source":prefLabel}
                    hits_per_resource.append(hit)
                    
                # in aliases
                if len(label_info['related_matches_literals']['wikidata_en'][0]['aliases']) > 0:
                    for e in label_info['related_matches_literals']['wikidata_en'][0]['aliases']['en']:
                        if len(re.findall(f'\\b{query_term}\\b',e["value"],re.IGNORECASE)) > 0:
                            hit = {"query_term":query_term,"found_in":"aliases","QID":qid,"source":e["value"]}
                            hits_per_resource.append(hit)
                
            hits['wikidata_en'] = hits_per_resource
            
        # in PWN
        if len(label_info['related_matches_literals']['pwn']) > 0:
            hits_per_resource = []
            
            # checking every query term
            for query_term in label_info['query_terms']:
                
                # in lemmata
                for le in label_info['related_matches_literals']['pwn']:
                    synset_id = le['synset_id']
                    for lemma in le['lemmata']:
                        if len(re.findall(f'\\b{query_term}\\b',lemma,re.IGNORECASE)) > 0:
                            hit = {"query_term":query_term,"found_in":"lemmata","synset_id":synset_id,"source":lemma}
                            hits_per_resource.append(hit)
                        
            hits['pwn'] = hits_per_resource
            
    if label_info['lang'] == 'nl':
        hits['aat_nl'] = []
        hits['wikidata_nl'] = []
        hits['odwn'] = []
        hits['nmvw'] = []
        
        # in AAT
        if len(label_info['related_matches_literals']['aat_nl']) > 0:
            hits_per_resource = []
            
            # checking every query term
            for query_term in label_info['query_terms']:
                
                # in prefLabel
                prefLabel = label_info['related_matches_literals']['aat_nl'][0]['prefLabel']
                aat_uri = label_info['related_matches_literals']['aat_nl'][0]['aat_uri']
                if len(re.findall(f'\\b{query_term}\\b',prefLabel,re.IGNORECASE)) > 0:
                    hit = {"query_term":query_term,"found_in":"prefLabel","aat_uri":aat_uri,"source":prefLabel}
                    hits_per_resource.append(hit)
                    
                # in altLabels
                for altLabel in label_info['related_matches_literals']['aat_nl'][0]['altLabels']:
                    if len(re.findall(f'\\b{query_term}\\b',altLabel,re.IGNORECASE)) > 0:
                        hit = {"query_term":query_term,"found_in":"altLabel","aat_uri":aat_uri,"source":altLabel}
                        hits_per_resource.append(hit)
            
            hits['aat_nl'] = hits_per_resource
            
        # in Wikidata
        if len(label_info['related_matches_literals']['wikidata_nl']) > 0:
            hits_per_resource = []
            
            # checking every query term
            for query_term in label_info['query_terms']:
                
                # in prefLabel
                prefLabel = label_info['related_matches_literals']['wikidata_nl'][0]['labels']['nl']['value']
                qid = label_info['related_matches_literals']['wikidata_nl'][0]['id']
                if len(re.findall(f'\\b{query_term}\\b',prefLabel,re.IGNORECASE)) > 0:
                    hit = {"query_term":query_term,"found_in":"prefLabel","QID":qid,"source":prefLabel}
                    hits_per_resource.append(hit)
                    
                # in aliases
                if len(label_info['related_matches_literals']['wikidata_nl'][0]['aliases']) > 0:
                    for e in label_info['related_matches_literals']['wikidata_nl'][0]['aliases']['nl']:
                        if len(re.findall(f'\\b{query_term}\\b',e["value"],re.IGNORECASE)) > 0:
                            hit = {"query_term":query_term,"found_in":"aliases","QID":qid,"source":e["value"]}
                            hits_per_resource.append(hit)
                
            hits['wikidata_nl'] = hits_per_resource
        
    print(hits)
            
    #break

{'aat_en': [{'query_term': 'aboriginal', 'found_in': 'altLabel', 'aat_uri': '300379660', 'source': 'aboriginal peoples'}], 'wikidata_en': [{'query_term': 'aboriginal', 'found_in': 'aliases', 'QID': 'Q103817', 'source': 'aboriginal peoples'}], 'pwn': [{'query_term': 'aboriginal', 'found_in': 'lemmata', 'synset_id': 'native.n.01', 'source': 'aboriginal'}, {'query_term': 'aboriginal', 'found_in': 'lemmata', 'synset_id': 'native.a.03', 'source': 'aboriginal'}]}
{'aat_en': [], 'wikidata_en': [{'query_term': 'allochtoon', 'found_in': 'prefLabel', 'QID': 'Q2008347', 'source': 'allochtoon'}], 'pwn': []}
{'aat_en': [], 'wikidata_en': [], 'pwn': []}
{'aat_en': [], 'wikidata_en': [{'query_term': 'barbarian', 'found_in': 'prefLabel', 'QID': 'Q134313', 'source': 'barbarian'}], 'pwn': [{'query_term': 'barbarian', 'found_in': 'lemmata', 'synset_id': 'savage.n.01', 'source': 'barbarian'}, {'query_term': 'barbarian', 'found_in': 'lemmata', 'synset_id': 'peasant.n.03', 'source': 'barbarian'}, {'query_te

KeyError: 'wikidata_nl'

In [18]:
hits

{'aat_en': [{'query_term': 'aboriginal',
   'found_in': 'altLabel',
   'aat_uri': '300379660',
   'source': 'aboriginal peoples'}]}

In [12]:
rm_lit

{'l_01': {'literal_form': 'Aboriginal',
  'lang': 'en',
  'query_terms': ['aboriginal', 'aboriginals'],
  'related_matches': {'aat': ['300379660'],
   'wikidata': ['Q103817'],
   'pwn': ['native.n.01', 'native.a.03'],
   'odwn': ['None'],
   'nmvw': ['None']},
  'related_matches_literals': {'aat_en': [{'lang': 'en',
     'prefLabel': 'indigenous people',
     'altLabels': ['indigenous person',
      'indigenous peoples',
      'aboriginal peoples',
      'indigenous populations',
      'native peoples',
      'aborigines',
      'indigenes',
      'people, indigenous'],
     'prefLabel_comment': None,
     'altLabel_comment': None,
     'scopeNote': 'The original, or earliest known, inhabitants of a colony or country.',
     'aat_uri': '300379660'}],
   'wikidata_en': [{'type': 'item',
     'id': 'Q103817',
     'labels': {'en': {'language': 'en', 'value': 'indigenous people'}},
     'descriptions': {'en': {'language': 'en',
       'value': 'ethnic groups descended from and identified 