## Convert related matches to JSON and add their literal values

In [None]:
import csv
import json
import pandas as pd
import requests
import re

In [None]:
# load the table of related matches from the csv file into a DataFrame
rm_csv_path = 'rm.csv'
related_matches = pd.read_csv(rm_csv_path)

In [None]:
# building a dict keyed by label URI, one entry per csv row;
# the '#'-separated cells (pwn, query_terms) are split into lists,
# the other related matches are wrapped in one-element lists
related_matches_dict = {}

for _, record in related_matches.iterrows():

    matches_by_resource = {
        'aat': [record.aat],
        'wikidata': [record.wikidata],
        'pwn': record.pwn.split('#'),
        'nmvw': [record.nmvw],
    }

    related_matches_dict[record.label_uri] = {
        'literal_form': record.literal,
        'lang': record.lang,
        'query_terms': record.query_terms.split('#'),
        'related_matches': matches_by_resource,
    }

In [None]:
# writing the related matches dict to 'rm.json'
with open('rm.json', mode='w') as rm_file:
    json.dump(related_matches_dict, fp=rm_file)

### Merging related matches with literals

#### Importing literals from the json files
The files were generated with `lodlitparser.get_literals()`.

In [None]:
def _fetch_json(url, timeout=30):
    """Download *url* and return its decoded JSON content.

    A timeout is passed because requests waits indefinitely by default.
    raise_for_status() turns an HTTP error response (4xx/5xx) into a
    requests.HTTPError instead of letting the error page reach the
    JSON decoder.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


# AAT EN
path_aat_en = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/aat_rm_en.json'
aat_en = _fetch_json(path_aat_en)

# AAT NL
path_aat_nl = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/aat_rm_nl.json'
aat_nl = _fetch_json(path_aat_nl)

# Wikidata EN
path_wikidata_en = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/wikidata_rm_en.json'
wikidata_en = _fetch_json(path_wikidata_en)

# Wikidata NL
path_wikidata_nl = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/wikidata_rm_nl.json'
wikidata_nl = _fetch_json(path_wikidata_nl)

# PWN
path_pwn = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/pwn_rm.json'
pwn_rm = _fetch_json(path_pwn)

# NMVW
path_nmvw = 'https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/nmvw_rm.json'
nmvw_rm = _fetch_json(path_nmvw)

In [None]:
# this code updates related_matches_dict: every label gets a
# 'related_matches_literals' dict holding the literals of its related
# matches, looked up in the resources that belong to the label's language

# per language: (AAT key, AAT literals, Wikidata key, Wikidata results)
resources_per_lang = {
    'en': ('aat_en', aat_en, 'wikidata_en', wikidata_en),
    'nl': ('aat_nl', aat_nl, 'wikidata_nl', wikidata_nl),
}

for label_uri, values in related_matches_dict.items():

    literals_per_label = {}
    label_lang = values['lang']

    # labels in any other language keep an empty literals dict
    if label_lang in resources_per_lang:
        aat_key, aat_literals, wikidata_key, wikidata_results = resources_per_lang[label_lang]
        matches = values['related_matches']

        # aat: the URI is stored inside the entry itself
        found = []
        aat_uri = matches['aat'][0]
        if aat_uri != 'None':
            aat_entry = aat_literals[aat_uri]
            aat_entry['aat_uri'] = aat_uri
            found.append(aat_entry)
        literals_per_label[aat_key] = found

        # wikidata: the results are iterated and every one that
        # contains the QID as a key contributes its value
        found = []
        wikidata_q = matches['wikidata'][0]
        if wikidata_q != 'None':
            found = [result[wikidata_q]
                     for result in wikidata_results
                     if wikidata_q in result]
        literals_per_label[wikidata_key] = found

        if label_lang == 'en':
            # PWN: there can be several synsets per label
            found = []
            for synset_id in matches['pwn']:
                if synset_id == 'None':
                    continue
                synset = pwn_rm[synset_id]
                synset['synset_id'] = synset_id
                found.append(synset)
            literals_per_label['pwn'] = found
        else:
            # nmvw: entries are keyed by the full handle of the term
            found = []
            nmvw_id = matches['nmvw'][0]
            if nmvw_id != 'None':
                nmvw_handle = f"https://hdl.handle.net/20.500.11840/termmaster{nmvw_id}"
                nmvw_entry = nmvw_rm[nmvw_handle]
                nmvw_entry['nmvw_handle'] = nmvw_handle
                found.append(nmvw_entry)
            literals_per_label['nmvw'] = found

    values['related_matches_literals'] = literals_per_label

In [None]:
# writing the dict, now including the literals of related matches,
# to 'rm_literals.json'
with open('rm_literals.json', mode='w') as literals_file:
    json.dump(related_matches_dict, fp=literals_file)

### Checking where contentious terms were found
Adding 'hits' to related matches.

In [None]:
# this code updates related_matches_dict: every label gets a 'hits' dict
# that lists, per resource of the label's language, where its query terms
# occur in the literals of the related matches


def _term_pattern(query_term):
    """Return the compiled, case-insensitive whole-word pattern for a term.

    The term is escaped so it is always matched literally, even when it
    contains regex metacharacters such as '.', '(' or '+'. The negative
    lookahead rejects a term that is directly followed by an apostrophe.
    """
    return re.compile(f"\\b{re.escape(query_term)}(?!')\\b", re.IGNORECASE)


def _find_hits(query_terms, candidates):
    """Return one hit dict per (query term, candidate) pair that matches.

    candidates is a list of (found_in, id_key, id_value, text) tuples.
    Hits are ordered by query term first, then by candidate order.
    """
    hits_per_resource = []
    for query_term in query_terms:
        pattern = _term_pattern(query_term)
        for found_in, id_key, id_value, text in candidates:
            if pattern.search(text):
                hits_per_resource.append({"query_term": query_term,
                                          "found_in": found_in,
                                          id_key: id_value,
                                          "source": text})
    return hits_per_resource


def _label_candidates(entry, id_key, id_field, alt_field):
    """Candidates of an AAT or NMVW entry: its prefLabel, then its alt labels."""
    entry_id = entry[id_field]
    candidates = [("prefLabel", id_key, entry_id, entry['prefLabel'])]
    candidates.extend(("altLabel", id_key, entry_id, alt_label)
                      for alt_label in entry[alt_field])
    return candidates


def _wikidata_candidates(entry, lang):
    """Candidates of a Wikidata entry: its label in *lang*, then its aliases."""
    qid = entry['id']
    candidates = [("prefLabel", "QID", qid, entry['labels'][lang]['value'])]
    aliases = entry['aliases']
    # aliases can be empty; .get() also covers a non-empty aliases dict
    # that has no aliases in this language
    if aliases:
        candidates.extend(("aliases", "QID", qid, alias["value"])
                          for alias in aliases.get(lang, []))
    return candidates


def _pwn_candidates(synsets):
    """Candidates of PWN synsets: every lemma of every synset."""
    return [("lemmata", "synset_id", synset['synset_id'], lemma)
            for synset in synsets
            for lemma in synset['lemmata']]


for label_uri, label_info in related_matches_dict.items():
    hits = {}
    lang = label_info['lang']

    # labels in any other language keep an empty hits dict
    if lang in ('en', 'nl'):
        literals = label_info['related_matches_literals']
        query_terms = label_info['query_terms']

        # in AAT (only the first entry is checked)
        aat_literals = literals[f'aat_{lang}']
        hits[f'aat_{lang}'] = (
            _find_hits(query_terms,
                       _label_candidates(aat_literals[0], "aat_uri", 'aat_uri', 'altLabels'))
            if aat_literals else []
        )

        # in Wikidata (only the first entry is checked)
        wikidata_literals = literals[f'wikidata_{lang}']
        hits[f'wikidata_{lang}'] = (
            _find_hits(query_terms, _wikidata_candidates(wikidata_literals[0], lang))
            if wikidata_literals else []
        )

        if lang == 'en':
            # in PWN (all synsets are checked)
            hits['pwn'] = _find_hits(query_terms, _pwn_candidates(literals['pwn']))
        else:
            # in NMVW (only the first entry is checked);
            # note the singular 'altLabel' key of NMVW entries
            nmvw_literals = literals['nmvw']
            hits['nmvw'] = (
                _find_hits(query_terms,
                           _label_candidates(nmvw_literals[0], "term_id", 'nmvw_handle', 'altLabel'))
                if nmvw_literals else []
            )

    label_info["hits"] = hits

In [None]:
# writing the dict, now including literals of related matches and hits,
# to 'rm_hits.json'
with open('rm_hits.json', mode='w') as hits_file:
    json.dump(related_matches_dict, fp=hits_file)

### Generating an overview per label in csv

In [None]:
def _count_found_in(hits, found_in):
    """Return how many hits were found in the given field (e.g. 'prefLabel')."""
    return sum(1 for hit in hits if hit['found_in'] == found_in)


# one csv row per label with the number of hits per resource.
# newline='' is what the csv module requires for files handed to csv.writer
# (otherwise extra blank rows can appear on Windows); the encoding is set
# explicitly so literal forms are written the same way on every platform
with open('rm_hits_stats.csv', 'w', newline='', encoding='utf-8') as csv_file:

    writer = csv.writer(csv_file)
    header = ["label_uri", "lit_form", "lang", "total_hits", "aat_total", "aat_pref", "aat_alt",
              "wd_total", "wd_pref", "wd_aliases", "nmvw_total", "nmvw_pref", "nmvw_alt",
              "pwn_total"]
    writer.writerow(header)

    for label_uri, label_info in related_matches_dict.items():

        lit_form = label_info['literal_form']
        lang = label_info['lang']

        # every hit list starts empty for each label, so a label in a
        # language other than 'en'/'nl' gets a row of zeros instead of
        # reusing the counts of the previous label
        aat_hits = []
        wd_hits = []
        nmvw_hits = []
        pwn_hits = []

        if lang in ('en', 'nl'):
            label_hits = label_info['hits']
            aat_hits = label_hits[f'aat_{lang}']
            wd_hits = label_hits[f'wikidata_{lang}']
            if lang == 'en':
                # PWN hits are collected only for EN labels
                pwn_hits = label_hits['pwn']
            else:
                # NMVW hits are collected only for NL labels
                nmvw_hits = label_hits['nmvw']

        aat_total = len(aat_hits)
        wd_total = len(wd_hits)
        nmvw_total = len(nmvw_hits)
        pwn_total = len(pwn_hits)
        total_hits = aat_total + wd_total + nmvw_total + pwn_total

        # splitting the totals by the field the term was found in
        aat_pref = _count_found_in(aat_hits, 'prefLabel')
        aat_alt = _count_found_in(aat_hits, 'altLabel')
        wd_pref = _count_found_in(wd_hits, 'prefLabel')
        wd_aliases = _count_found_in(wd_hits, 'aliases')
        nmvw_pref = _count_found_in(nmvw_hits, 'prefLabel')
        nmvw_alt = _count_found_in(nmvw_hits, 'altLabel')

        data = [label_uri, lit_form, lang, total_hits, aat_total, aat_pref, aat_alt,
                wd_total, wd_pref, wd_aliases, nmvw_total, nmvw_pref, nmvw_alt,
                pwn_total]

        writer.writerow(data)