In [None]:
import json
import csv
import pandas as pd

In [None]:
# Parsed query-terms files, keyed by path, so each file is read and parsed
# once per kernel session instead of once per call (re-run this cell to reset).
_QUERY_TERMS_CACHE = {}


def get_lemma_by_term(query_term:str, lang:str,
                      query_terms_path:str='/Users/anesterov/reps/LODlit/query_terms.json') -> str:
    '''
    Getting a lemma of a query term
    query_term: str, the query term to look up
    lang: str, 'en' or 'nl'
    query_terms_path: str, path to the JSON file with query terms and lemmas
                      (top-level keys are languages, each mapping lemma -> query terms);
                      defaults to the local path used so far
    Returns str, 'not found' if lemma was not found
    Raises KeyError if lang is not a top-level key of the file
    '''

    # importing query terms with lemmas (cached per path)
    # change path to GitHub
    if query_terms_path not in _QUERY_TERMS_CACHE:
        with open(query_terms_path, 'r', encoding='utf-8') as jf:
            _QUERY_TERMS_CACHE[query_terms_path] = json.load(jf)

    return_lemma = 'not found'

    # no break on purpose: if several lemmas list the term, the last one wins,
    # exactly as before
    for lemma, qt in _QUERY_TERMS_CACHE[query_terms_path][lang].items():
        if query_term in qt:
            return_lemma = lemma

    return return_lemma

## Wikidata

In [None]:
# 1. For the selected language, check which QIDs have explicit markers (P31, P2559) and collect them in a DataFrame
# 2. Remove duplicate rows by lang, lemma, QID, marker and value

In [None]:
# Empty table that the Wikidata loop cell below fills with one row per explicit marker found
wd_explicit = pd.DataFrame(columns=["resource","lang","lemma","entity_id","explicit_marker","value","level"])

In [None]:
# Language to process; used in the input file names, the rm filter and the lemma lookup below
lang = "nl"

In [None]:
# Load the Wikidata inputs for `lang`: all search results, the subset, and the rm entities.

# importing all search results
with open(f"/Users/anesterov/LODlit_local/wd/jan31/results_clean_{lang}.json", 'r', encoding='utf-8') as jf:
    wd_all = json.load(jf)

# import subset
with open(f"/Users/anesterov/reps/LODlit/Wikidata/wd_{lang}_subset.json", 'r', encoding='utf-8') as jf:
    wd_subset = json.load(jf)

# get all QIDs in the subset (one entry per hit, duplicates kept)
subset_quids = [hit["QID"] for hits in wd_subset.values() for hit in hits]

# import rm
wd_rm = pd.read_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv")
# Both conditions go into ONE mask. Chaining df[mask1][mask2] applies the second,
# full-length mask to the already filtered frame, which pandas has to reindex
# (with a UserWarning) to make it fit.
rm_quids = list(
    wd_rm.loc[(wd_rm["resource"] == "wikidata") & (wd_rm["lang"] == lang), "entity_id"]
)

In [None]:
# import P31 markers
# (the next cell reads the "item" and "instance_of" columns of this table)
p31 = pd.read_csv("/Users/anesterov/reps/LODlit/sensitivity_markers/explicit/wikidata_P31_all.csv")

In [None]:
# make a dict of entity and value: QID of the item -> QID of its "instance_of" value
# If an item occurs in several rows, the last row wins (plain dict assignment).
#
# The URI prefix is removed with startswith + slicing. str.lstrip(prefix) is NOT a
# prefix remover: it strips every leading character that occurs anywhere in the
# argument, and only worked before because "Q" is not one of those characters.
wd_entity_prefix = "http://www.wikidata.org/entity/"

p31_pairs = {}
for item_uri, class_uri in zip(p31["item"], p31["instance_of"]):
    item_id = item_uri[len(wd_entity_prefix):] if item_uri.startswith(wd_entity_prefix) else item_uri
    class_id = class_uri[len(wd_entity_prefix):] if class_uri.startswith(wd_entity_prefix) else class_uri
    p31_pairs[item_id] = class_id

In [None]:
# import P2559 markers
# (the next cell reads the "item" and "usage_instructions" columns of this table)
p2559 = pd.read_csv("/Users/anesterov/reps/LODlit/sensitivity_markers/explicit/wikidata_P2559_all.csv")

In [None]:
# make a dict of entity and value: QID of the item -> its "usage_instructions" text
# If an item occurs in several rows, the last row wins (plain dict assignment).
#
# The URI prefix is removed with startswith + slicing; str.lstrip(prefix) strips a
# set of characters rather than a prefix and only worked because QIDs start with "Q".
wd_entity_prefix = "http://www.wikidata.org/entity/"

p2559_pairs = {}
for item_uri, instructions in zip(p2559["item"], p2559["usage_instructions"]):
    item_id = item_uri[len(wd_entity_prefix):] if item_uri.startswith(wd_entity_prefix) else item_uri
    p2559_pairs[item_id] = instructions

In [None]:
# Collect the explicit markers (P31, P2559) of every Wikidata search hit into wd_explicit.
# level: "1" by default, "2" if the QID is in the subset, "3" if it is in rm
# (rm takes precedence over the subset).

# Build the lookup sets once; they were rebuilt from the lists for every single hit.
subset_qid_set = set(subset_quids)
rm_qid_set = set(rm_quids)

# marker name -> lookup dict, in the order the rows are emitted per hit
marker_lookups = (("P31", p31_pairs), ("P2559", p2559_pairs))

new_rows = []
for term, hits in wd_all.items():
    lemma = get_lemma_by_term(term, lang)

    for hit in hits:
        qid = hit["QID"]

        # check level
        if qid in rm_qid_set:
            level = "3"
        elif qid in subset_qid_set:
            level = "2"
        else:
            level = "1"

        for marker, pairs in marker_lookups:
            if qid in pairs:
                new_rows.append(["wikidata", lang, lemma, qid, marker, pairs[qid], level])

# Append all rows in one step instead of enlarging the DataFrame row by row.
# Rows already in wd_explicit (e.g. from a run with another `lang`) are kept.
if new_rows:
    new_rows_df = pd.DataFrame(new_rows, columns=wd_explicit.columns)
    if wd_explicit.empty:
        wd_explicit = new_rows_df
    else:
        wd_explicit = pd.concat([wd_explicit, new_rows_df], ignore_index=True)

In [None]:
# Keep the first row per (lang, lemma, entity_id, explicit_marker, value)
# and write the result to wd_explicit.csv in the current working directory.
(
    wd_explicit
    .drop_duplicates(
        subset=["lang","lemma","entity_id","explicit_marker","value"],
        ignore_index=True,
    )
    .to_csv("wd_explicit.csv")
)

## AAT

In [None]:
# Empty table that the AAT loop cell below fills with one row per explicit marker found
aat_explicit = pd.DataFrame(columns=["resource","lang","lemma","entity_id","explicit_marker","value","level"])

In [None]:
# Language for the AAT section; note this overwrites the value set in the Wikidata section
lang = "en"

In [None]:
# Load the AAT inputs for `lang`: all search results, the subset, and the rm entities.

# importing all search results
with open(f"/Users/anesterov/reps/LODlit/AAT/aat_query_results_{lang}.json", 'r', encoding='utf-8') as jf:
    aat_all = json.load(jf)

# import subset
with open(f"/Users/anesterov/reps/LODlit/AAT/aat_{lang}_subset.json", 'r', encoding='utf-8') as jf:
    aat_subset = json.load(jf)

# get the "aat_uri" of every hit in the subset (one entry per hit, duplicates kept)
subset = [hit["aat_uri"] for hits in aat_subset.values() for hit in hits]

# import rm
aat_rm = pd.read_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv")
# Both conditions go into ONE mask. Chaining df[mask1][mask2] applies the second,
# full-length mask to the already filtered frame, which pandas has to reindex
# (with a UserWarning) to make it fit.
rm = list(
    aat_rm.loc[(aat_rm["resource"] == "aat") & (aat_rm["lang"] == lang), "entity_id"]
)

In [None]:
# URI prefix that the cells below remove from the AAT concept URIs in the marker tables
strip_part = "http://vocab.getty.edu/aat/"

In [None]:
# import historicFlag markers
# (the next cell reads the "concept" column of this table)
historic_flag = pd.read_csv("/Users/anesterov/reps/LODlit/sensitivity_markers/explicit/aat_historicFlag_all.csv")

In [None]:
# Unique concept IDs that carry a historicFlag, with the AAT URI prefix removed.
# The prefix is removed with startswith + slicing; str.lstrip(strip_part) strips a
# set of characters rather than a prefix and only worked because the remaining ID
# does not start with one of those characters.
historic_flag_list = list({
    c[len(strip_part):] if c.startswith(strip_part) else c
    for c in historic_flag["concept"]
})

In [None]:
# import termKind markers
# (a later cell groups this table by "concept" and reads its "flag" column)
termKind_flag = pd.read_csv("/Users/anesterov/reps/LODlit/sensitivity_markers/explicit/aat_termKind_all.csv")

In [None]:
# NOTE(review): `wd_en_implicit` is not defined in any visible cell of this notebook,
# so this cell raises NameError on a fresh "Restart & Run All", and `grouped_markers`
# is not read by any visible cell. Looks like a leftover from another notebook —
# confirm and remove.
for group_key, group_rows in wd_en_implicit.groupby(["lemma","entity_id","value"]):
    # only groups with more than one row are of interest
    if len(group_rows) > 1:
        grouped_markers = list(group_rows["implicit_marker"])

In [None]:
# make a dict of concepts with flags: concept ID -> list of its termKind flags
# !NB several labels can be marked, make a list
#
# Both URI prefixes are removed with startswith + slicing. str.lstrip(prefix) strips
# every leading character that occurs in the argument, so it would also eat the start
# of a flag name that begins with one of those characters.
term_kind_prefix = "http://vocab.getty.edu/term/kind/"

termKind_flag_pairs = {}
for concept, concept_rows in termKind_flag.groupby("concept"):
    concept_id = str(concept)
    if concept_id.startswith(strip_part):
        concept_id = concept_id[len(strip_part):]

    termKind_flag_pairs[concept_id] = [
        f[len(term_kind_prefix):] if f.startswith(term_kind_prefix) else f
        for f in concept_rows["flag"]
    ]

In [None]:
# display the whole concept -> flags mapping for inspection
termKind_flag_pairs

In [None]:
# Collect the explicit markers (historicFlag, termKind) of every AAT search hit into aat_explicit.
# level: "1" by default, "2" if the ID is in the subset, "3" if it is in rm
# (rm takes precedence over the subset).
#
# NOTE(review): historic_flag_list and the keys of termKind_flag_pairs have the
# "http://vocab.getty.edu/aat/" prefix removed, so hit["aat_uri"] must hold the bare
# ID for these lookups to match — confirm against the search-results JSON.

# Build the lookup sets once; they were rebuilt for every hit, and
# historic_flag_list was scanned as a list.
historic_flag_ids = set(historic_flag_list)
subset_id_set = set(subset)
rm_id_set = set(rm)

new_rows = []
for term, hits in aat_all.items():
    lemma = get_lemma_by_term(term, lang)

    for hit in hits:
        aat_id = hit["aat_uri"]

        # check level
        if aat_id in rm_id_set:
            level = "3"
        elif aat_id in subset_id_set:
            level = "2"
        else:
            level = "1"

        # if historic_flag markers (the flag has no value, hence the empty string)
        if aat_id in historic_flag_ids:
            new_rows.append(["aat", lang, lemma, aat_id, "historicFlag", "", level])

        # if termKind flag (the value is the list of flags of the concept)
        if aat_id in termKind_flag_pairs:
            new_rows.append(["aat", lang, lemma, aat_id, "termKind", termKind_flag_pairs[aat_id], level])

# Append all rows in one step instead of enlarging the DataFrame row by row.
# Rows already in aat_explicit (e.g. from a run with another `lang`) are kept.
if new_rows:
    new_rows_df = pd.DataFrame(new_rows, columns=aat_explicit.columns)
    if aat_explicit.empty:
        aat_explicit = new_rows_df
    else:
        aat_explicit = pd.concat([aat_explicit, new_rows_df], ignore_index=True)

In [None]:
# display the collected AAT rows for inspection
aat_explicit

In [None]:
# Keep the first row per (lang, lemma, entity_id, explicit_marker) and write the
# result to aat_explicit.csv in the current working directory.
# "value" is not part of the key here.
(
    aat_explicit
    .drop_duplicates(
        subset=["lang","lemma","entity_id","explicit_marker"],
        ignore_index=True,
    )
    .to_csv("aat_explicit.csv")
)

## PWN

In [None]:
# Empty table that the PWN loop cell below fills with one row per explicit marker found
pwn_explicit = pd.DataFrame(columns=["resource","lang","lemma","entity_id","explicit_marker","value","level"])

In [None]:
# Load the PWN inputs: all search results, the subset, and the rm entities.

# all search results
with open("/Users/anesterov/reps/LODlit/PWN/pwn31_query_results.json",'r') as results_file:
    pwn_all = json.load(results_file)

# subset
with open("/Users/anesterov/reps/LODlit/PWN/pwn_subset.json",'r') as subset_file:
    pwn_subset = json.load(subset_file)

# synset IDs of every hit in the subset (one entry per hit, duplicates kept)
subset = [hit["synset_id"] for hits in pwn_subset.values() for hit in hits]

# rm entities that belong to PWN
pwn_rm = pd.read_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv")
is_pwn = pwn_rm["resource"] == "pwn"
rm = list(pwn_rm.loc[is_pwn, "entity_id"])

In [None]:
# import pwn markers
# (the next cell reads the "synset_id" and "usage_domains" columns of this table)
usage_domains = pd.read_csv("/Users/anesterov/reps/LODlit/sensitivity_markers/explicit/pwn_usage_domain_synsets_all.csv")

In [None]:
# synset ID -> list of its usage domains.
# The "usage_domains" cell is a string; brackets, single quotes and ALL spaces are
# removed from it before it is split on commas.
usage_domains_pairs = {}
for synset_id, domains_text in zip(usage_domains["synset_id"], usage_domains["usage_domains"]):
    cleaned = domains_text
    for unwanted in ("[", "]", "'", " "):
        cleaned = cleaned.replace(unwanted, "")
    usage_domains_pairs[synset_id] = cleaned.split(",")
    

In [None]:
# Collect the usage-domain markers of every PWN search hit into pwn_explicit.
# level: "1" by default, "2" if the synset is in the subset, "3" if it is in rm
# (rm takes precedence over the subset).

# Build the lookup sets once; they were rebuilt from the lists for every single hit.
subset_id_set = set(subset)
rm_id_set = set(rm)

new_rows = []
for term, hits in pwn_all.items():
    lemma = get_lemma_by_term(term, "en")

    for hit in hits:
        synset_id = hit["synset_id"]

        # if usage_domain markers
        if synset_id in usage_domains_pairs:

            # check level
            if synset_id in rm_id_set:
                level = "3"
            elif synset_id in subset_id_set:
                level = "2"
            else:
                level = "1"

            new_rows.append(["pwn", "en", lemma, synset_id, "usage_domain", usage_domains_pairs[synset_id], level])

# Append all rows in one step instead of enlarging the DataFrame row by row.
if new_rows:
    new_rows_df = pd.DataFrame(new_rows, columns=pwn_explicit.columns)
    if pwn_explicit.empty:
        pwn_explicit = new_rows_df
    else:
        pwn_explicit = pd.concat([pwn_explicit, new_rows_df], ignore_index=True)

In [None]:
# Keep the first row per (lemma, entity_id) and write the result to
# pwn_explicit.csv in the current working directory.
(
    pwn_explicit
    .drop_duplicates(subset=["lemma","entity_id"], ignore_index=True)
    .to_csv("pwn_explicit.csv")
)

## ODWN

In [None]:
# Empty table that the ODWN loop cell below fills with one row per explicit marker found
odwn_explicit = pd.DataFrame(columns=["resource","lang","lemma","entity_id","explicit_marker","value","level"])

In [None]:
# Load the ODWN inputs: all search results, the subset, and the rm entities.

# all search results
with open("/Users/anesterov/reps/LODlit/ODWN/odwn_query_results.json",'r') as results_file:
    odwn_all = json.load(results_file)

# subset
with open("/Users/anesterov/reps/LODlit/ODWN/odwn_subset.json",'r') as subset_file:
    odwn_subset = json.load(subset_file)

# ID of every hit in the subset: the synset ID, or the sense ID when the synset ID is empty
subset = [
    hit["synset_id"] if hit["synset_id"] != "" else hit["sense_id"]
    for hits in odwn_subset.values()
    for hit in hits
]

# rm entities that belong to ODWN
odwn_rm = pd.read_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv")
is_odwn = odwn_rm["resource"] == "odwn"
rm = list(odwn_rm.loc[is_odwn, "entity_id"])

In [None]:
# Load the ODWN pragmatics markers; the loop below indexes this object by synset/sense ID
# and reads "pragmatics" -> "chronology" / "connotation" from each entry.
with open("/Users/anesterov/reps/LODlit/sensitivity_markers/explicit/odwn_all_pragmatics.json",'r') as markers_file:
    odwn_markers = json.load(markers_file)

In [None]:
# Collect the pragmatics markers (chronology, connotation) of every ODWN search hit
# into odwn_explicit.
# level: "1" by default, "2" if the ID is in the subset, "3" if it is in rm
# (rm takes precedence over the subset).

# Build the lookup sets once; they were rebuilt from the lists for every single hit.
subset_id_set = set(subset)
rm_id_set = set(rm)

new_rows = []
for term, hits in odwn_all.items():
    lemma = get_lemma_by_term(term, "nl")

    for hit in hits:

        # a hit is identified by its synset ID, or by its sense ID when the synset ID is empty
        odwn_id = hit["synset_id"] if hit["synset_id"] != "" else hit["sense_id"]

        # check level
        if odwn_id in rm_id_set:
            level = "3"
        elif odwn_id in subset_id_set:
            level = "2"
        else:
            level = "1"

        # get markers values for each synset/sense
        # NB! collects pragmatics values of all lemmas in a synset
        # Raises KeyError if odwn_id is missing from odwn_markers.
        # Empty values are skipped. The unique values are sorted so the lists
        # written to the CSV do not depend on set iteration order, which can
        # change between kernel sessions for strings.
        chron_values = set()
        conn_values = set()
        for le in odwn_markers[odwn_id]:
            pragmatics = le["pragmatics"]
            if pragmatics["chronology"]:
                chron_values.add(pragmatics["chronology"])
            if pragmatics["connotation"]:
                conn_values.add(pragmatics["connotation"])
        chron = sorted(chron_values)
        conn = sorted(conn_values)

        if len(chron) > 0:
            new_rows.append(["odwn", "nl", lemma, odwn_id, "pragmatics_chronology", chron, level])

        if len(conn) > 0:
            new_rows.append(["odwn", "nl", lemma, odwn_id, "pragmatics_connotation", conn, level])

# Append all rows in one step instead of enlarging the DataFrame row by row.
if new_rows:
    new_rows_df = pd.DataFrame(new_rows, columns=odwn_explicit.columns)
    if odwn_explicit.empty:
        odwn_explicit = new_rows_df
    else:
        odwn_explicit = pd.concat([odwn_explicit, new_rows_df], ignore_index=True)

In [None]:
# Keep the first row per (lemma, entity_id, explicit_marker) and write the result
# to odwn_explicit.csv in the current working directory.
(
    odwn_explicit
    .drop_duplicates(subset=["lemma","entity_id","explicit_marker"], ignore_index=True)
    .to_csv("odwn_explicit.csv")
)