In [None]:
import json
import gzip
import csv
import requests

### Getting literals of resources from Set 3
These resources are taken from the Words Matter knowledge graph and are connected to culturally sensitive terms with the property *skos:relatedMatch*; here, these resources are called "related matches".

#### 1. Generating a file with terms grouped by canonical forms linked to their related resources (related matches) from the Words Matter knowledge graph

In [None]:
# {lemma: {"query_terms":[], "related_matches":{"wikidata":[""],"aat":[""],"pwn":["",""]}}}

In [None]:
# import query term
# The parsed JSON is a dict keyed by language; the cells below iterate over it.
with open('query_terms.json', mode='r') as terms_file:
    query_terms = json.load(terms_file)

In [None]:
# loading related matches from GitHub
path_rm = "https://github.com/cultural-ai/wordsmatter/raw/main/related_matches/rm.json"

# A timeout keeps the cell from hanging forever on a stalled connection.
response_rm = requests.get(path_rm, timeout=30)
# Fail here with the HTTP error instead of later with an unrelated JSON decoding error.
response_rm.raise_for_status()
rm = response_rm.json()

In [None]:
# For every language and lemma, collect the query terms together with the IDs
# of the related matches listed in `rm` (Wikidata, AAT and, for English, PWN).
query_terms_related_matches = {}

for lang, lemmas in query_terms.items():

    dict_per_lang = {}

    for lemma, terms in lemmas.items():

        related = {"wikidata": [], "aat": []}
        if lang == "en":
            # Always create the key: the PWN cell reads it for every English
            # lemma, including lemmas that have no matching entry in `rm`.
            related["pwn"] = []

        for value in rm.values():

            # keep only entries of this language that list the lemma
            if value["lang"] != lang or lemma not in value["query_terms"]:
                continue

            matches = value["related_matches"]

            # "None" (a string) marks a missing related match in `rm`
            if matches["wikidata"][0] != "None":
                related["wikidata"].append(matches["wikidata"][0])
            if matches["aat"][0] != "None":
                related["aat"].append(matches["aat"][0])

            if lang == "en" and matches["pwn"][0] != "None":
                # Accumulate into our own list (as done for wikidata/aat) so a
                # later entry without PWN matches cannot wipe earlier ones and
                # the list is not shared with `rm`.
                for synset_id in matches["pwn"]:
                    if synset_id not in related["pwn"]:
                        related["pwn"].append(synset_id)

        dict_per_lang[lemma] = {"query_terms": terms, "related_matches": related}

    query_terms_related_matches[lang] = dict_per_lang

In [None]:
# connecting related matches from ODWN

In [None]:
# Related matches from ODWN, read from a local JSON file.
# NOTE(review): the path is absolute (rooted at "/") — confirm it matches the local setup.
with open('/bg/related_matches_odwn.json', mode='r') as odwn_rm_file:
    odwn_rm = json.load(odwn_rm_file)

In [None]:
# Attach ODWN synset IDs and lexical-entry IDs to every Dutch lemma.
for lemma, info in query_terms_related_matches["nl"].items():

    odwn_ids = {"synset_id": [], "le_id": []}
    info["related_matches"]["odwn"] = odwn_ids

    for entry in odwn_rm.values():
        # skip ODWN entries that do not list this lemma
        if lemma not in entry["query_terms"]:
            continue

        # an empty string marks "no synsets"; "None" (a string) marks "no lexical entries"
        if entry["odwn_synsets"] != "":
            odwn_ids["synset_id"].extend(entry["odwn_synsets"])
        if entry["odwn_le"][0] != "None":
            odwn_ids["le_id"].extend(entry["odwn_le"])

In [None]:
# exporting
# Intermediate result: lemmas, their query terms and related-match IDs (no literals yet).
with open('lemmas_terms_rms.json', mode='w') as export_file:
    json.dump(query_terms_related_matches, export_file)

#### 2. Add literals of related resources to the file generated above

In [None]:
# {lemma: {"query_terms":[], "related_matches":{"wikidata":[""],"aat":[""],"pwn":["",""]},
#          "related_matches_lit":{"wikidata":[], "aat":[], "pwn":[]}}}

In [None]:
# importing query results in each dataset
# NOTE(review): all paths below are absolute (rooted at "/") — confirm they match the local setup.

# Wikidata results are gzip-compressed JSON: decompress, decode as UTF-8, then parse.
with gzip.open("/Wikidata/gzip_results_clean_en.json", 'r') as gz_file:
    wd_en_bytes = gz_file.read()
wd_en = json.loads(wd_en_bytes.decode('utf-8'))

with gzip.open("/Wikidata/gzip_results_clean_nl.json", 'r') as gz_file:
    wd_nl_bytes = gz_file.read()
wd_nl = json.loads(wd_nl_bytes.decode('utf-8'))

# The remaining result sets are plain JSON files.
with open('/AAT/aat_query_results_en.json', mode='r') as aat_en_file:
    aat_en = json.load(aat_en_file)

with open('/AAT/aat_query_results_nl.json', mode='r') as aat_nl_file:
    aat_nl = json.load(aat_nl_file)

with open('/PWN/pwn31_query_results.json', mode='r') as pwn_file:
    pwn = json.load(pwn_file)

with open('/ODWN/odwn_query_results.json', mode='r') as odwn_file:
    odwn = json.load(odwn_file)

In [None]:
# shaping dicts per lang
# English lemmas get one empty literal list per source: Wikidata, AAT and PWN.
for info in query_terms_related_matches["en"].values():
    info["related_matches_lit"] = {"wikidata": [], "aat": [], "pwn": []}

In [None]:
# Dutch lemmas get one empty literal list per source: Wikidata, AAT and ODWN.
for info in query_terms_related_matches["nl"].values():
    info["related_matches_lit"] = {"wikidata": [], "aat": [], "odwn": []}

In [None]:
# adding literals from Wikidata EN

for lemma, info in query_terms_related_matches["en"].items():
    # Collect into a fresh list and assign it at the end, so that re-running
    # this cell replaces the literals instead of appending duplicates.
    literals = []
    related_qids = info["related_matches"]["wikidata"]

    if related_qids:
        # Only the first related match of the lemma is looked up.
        target_qid = related_qids[0]

        for term in info["query_terms"]:
            # first hit of this query term whose QID is the related match, if any
            hit = next((h for h in wd_en[term] if h["QID"] == target_qid), None)
            if hit is None:
                continue

            literals.append(hit["prefLabel"])
            if hit["aliases"] is not None:
                literals.extend(hit["aliases"])
            # a description may be a single string or a list; other values are skipped
            description = hit["description"]
            if isinstance(description, str):
                literals.append(description)
            elif isinstance(description, list):
                literals.extend(description)
            literals.extend(hit["instance_of"])
            literals.extend(hit["subclass_of"])

            # the related match is found, don't iterate over other query terms
            break

    info["related_matches_lit"]["wikidata"] = literals
    

In [None]:
# adding literals from Wikidata NL

for lemma, info in query_terms_related_matches["nl"].items():
    # Collect into a fresh list and assign it at the end, so that re-running
    # this cell replaces the literals instead of appending duplicates.
    literals = []
    related_qids = info["related_matches"]["wikidata"]

    if related_qids:
        # Only the first related match of the lemma is looked up.
        target_qid = related_qids[0]

        for term in info["query_terms"]:
            # first hit of this query term whose QID is the related match, if any
            hit = next((h for h in wd_nl[term] if h["QID"] == target_qid), None)
            if hit is None:
                continue

            literals.append(hit["prefLabel"])
            if hit["aliases"] is not None:
                literals.extend(hit["aliases"])
            # a description may be a single string or a list; other values are skipped
            description = hit["description"]
            if isinstance(description, str):
                literals.append(description)
            elif isinstance(description, list):
                literals.extend(description)
            literals.extend(hit["instance_of"])
            literals.extend(hit["subclass_of"])

            # the related match is found, don't iterate over other query terms
            break

    info["related_matches_lit"]["wikidata"] = literals

In [None]:
# adding literals from AAT EN

for lemma, info in query_terms_related_matches["en"].items():
    # Collect into a fresh list and assign it at the end, so that re-running
    # this cell replaces the literals instead of appending duplicates.
    literals = []
    related_uris = info["related_matches"]["aat"]

    if related_uris:
        # Only the first related match of the lemma is looked up.
        target_uri = related_uris[0]

        for term in info["query_terms"]:
            # first hit of this query term whose URI is the related match, if any
            hit = next((h for h in aat_en[term] if h["aat_uri"] == target_uri), None)
            if hit is None:
                continue

            literals.append(hit["prefLabel"])
            literals.append(hit["prefLabel_comment"])
            literals.extend(hit["altLabel"])
            literals.extend(hit["altLabel_comment"])
            literals.append(hit["scopeNote"])

            # the related match is found, don't iterate over other query terms
            break

    info["related_matches_lit"]["aat"] = literals

In [None]:
# adding literals from AAT NL

for lemma, info in query_terms_related_matches["nl"].items():
    # Collect into a fresh list and assign it at the end, so that re-running
    # this cell replaces the literals instead of appending duplicates.
    literals = []
    related_uris = info["related_matches"]["aat"]

    if related_uris:
        # Only the first related match of the lemma is looked up.
        target_uri = related_uris[0]

        for term in info["query_terms"]:
            # first hit of this query term whose URI is the related match, if any
            hit = next((h for h in aat_nl[term] if h["aat_uri"] == target_uri), None)
            if hit is None:
                continue

            literals.append(hit["prefLabel"])
            literals.append(hit["prefLabel_comment"])
            literals.extend(hit["altLabel"])
            literals.extend(hit["altLabel_comment"])
            literals.append(hit["scopeNote"])

            # the related match is found, don't iterate over other query terms
            break

    info["related_matches_lit"]["aat"] = literals

In [None]:
# adding literals from PWN

for lemma, info in query_terms_related_matches["en"].items():
    # Collect into a fresh list and assign it at the end, so that re-running
    # this cell replaces the literals instead of appending to them.
    literals = []
    # tolerate lemmas that carry no "pwn" key instead of raising KeyError
    related_synsets = info["related_matches"].get("pwn", [])

    if related_synsets:
        for term in info["query_terms"]:
            for hit in pwn[term]:
                # every hit whose synset is one of the related matches contributes
                if hit["synset_id"] in related_synsets:
                    literals.extend(hit["lemmata"])
                    literals.append(hit["definition"])
                    literals.extend(hit["examples"])

            # if RM for lemma is found, don't iterate over other query terms
            if literals:
                break

    # De-duplicate while keeping first-seen order; list(set(...)) would make the
    # order of the exported literals vary between runs.
    info["related_matches_lit"]["pwn"] = list(dict.fromkeys(literals))

In [None]:
# adding literals from ODWN

for lemma, info in query_terms_related_matches["nl"].items():
    literals = []
    odwn_ids = info["related_matches"]["odwn"]

    if odwn_ids["synset_id"] or odwn_ids["le_id"]:

        # all query terms are scanned; there is no early exit for ODWN
        for term in info["query_terms"]:
            for hit in odwn[term]:

                # A hit with a synset is matched on its synset ID; a hit without
                # one (empty string) is matched on its lexical-entry ID instead.
                matched_by_synset = (hit["synset_id"] != ""
                                     and hit["synset_id"] in odwn_ids["synset_id"])
                matched_by_le = (hit["synset_id"] == ""
                                 and hit["le_id"] in odwn_ids["le_id"])
                if not (matched_by_synset or matched_by_le):
                    continue

                literals.append(hit.get("le_written_form"))
                if hit.get("sense_examples"):
                    literals.extend(hit.get("sense_examples"))
                literals.extend(hit["synonyms"])
                literals.extend(hit["synset_definitions"])
                literals.append(hit.get("sense_definition"))

    # De-duplicate while keeping first-seen order; list(set(...)) would make the
    # order of the exported literals vary between runs.
    info["related_matches_lit"]["odwn"] = list(dict.fromkeys(literals))

In [None]:
# exporting file
# Final result: lemmas with query terms, related-match IDs and their literals.
with open('lemmas_query_terms_related_matches.json', mode='w') as export_file:
    json.dump(query_terms_related_matches, export_file)

#### 3. Generating 2 csv files with literals of related resources
related_matches_literals_en.csv, related_matches_literals_nl.csv

In [None]:
# using the file generated above
with open('lemmas_query_terms_related_matches.json', mode='r') as lits_file:
    rm_lits = json.load(lits_file)

In [None]:
# Write one row per English lemma with its literals from Wikidata, AAT and PWN.
# newline='' is required by the csv module (otherwise extra blank rows appear on
# Windows); the explicit encoding makes the file identical on every platform.
with open('related_matches_literals_en.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ["lemma", "source_1", "source_2", "source_3"]
    writer.writerow(header)

    for lemma, info in rm_lits["en"].items():
        literals = info["related_matches_lit"]
        # each source column receives a Python list, which csv writes via str()
        data = [lemma, literals["wikidata"], literals["aat"], literals["pwn"]]

        writer.writerow(data)

In [None]:
# Write one row per Dutch lemma with its literals from Wikidata, AAT and ODWN.
# newline='' is required by the csv module (otherwise extra blank rows appear on
# Windows); the explicit encoding makes the file identical on every platform.
with open('related_matches_literals_nl.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ["lemma", "source_1", "source_2", "source_3"]
    writer.writerow(header)

    for lemma, info in rm_lits["nl"].items():
        literals = info["related_matches_lit"]
        # each source column receives a Python list, which csv writes via str()
        data = [lemma, literals["wikidata"], literals["aat"], literals["odwn"]]

        writer.writerow(data)