In [None]:
import re
import json
import csv
import pandas as pd
from collections import Counter

In [None]:
# Parsed content of query_terms.json; filled on first use so the file is read
# once per kernel session instead of once per looked-up term.
# Re-run this cell to force a reload after the file has changed.
_query_terms_cache = None


def _load_query_terms() -> dict:
    '''
    Returns the parsed query_terms.json, reading it from disk only on the first call
    '''
    global _query_terms_cache
    if _query_terms_cache is None:
        with open('/LODlit/query_terms.json','r') as jf:
            _query_terms_cache = json.load(jf)
    return _query_terms_cache


def get_lemma_by_term(query_term:str, lang:str) -> str:
    '''
    Getting a lemma of a query term
    query_term: str, the term to look up
    lang: str, 'en' or 'nl' (a top-level key of query_terms.json)
    Returns str, the lemma whose entry contains the query term;
    'not found' if lemma was not found.
    If the term is listed under several lemmas, the last one in file order is returned.
    Raises KeyError if lang is not a key of query_terms.json
    '''

    return_lemma = 'not found'

    # no early exit on purpose: a later lemma containing the same term
    # overrides an earlier one
    for lemma, terms in _load_query_terms()[lang].items():
        if query_term in terms:
            return_lemma = lemma

    return return_lemma

## Wikidata

In [None]:
def search_implicit_markers_wd(lang:str):
    '''
    Searching for implicit markers in the descriptions of Wikidata search results.
    Every description of every hit is searched (case-insensitive, on word boundaries)
    for each marker listed under implicit_markers["wikidata"][lang].

    lang: str, "en" or "nl"
    Returns pandas DataFrame with the columns
    "resource","lang","lemma","entity_id","property","value","implicit_marker","level";
    rows with the same lemma, entity_id, value and implicit_marker are kept only once.
    "level" is "1" by default, "2" if the QID is in the subset file,
    "3" if the QID is in rm_entities_unique.csv (takes precedence over "2").
    '''
    
    columns = ["resource","lang","lemma","entity_id","property","value","implicit_marker","level"]
    
    # importing implicit markers dict
    with open('implicit_markers.json','r') as jf:
        implicit_markers = json.load(jf)
    
    # importing all search results
    # this file is gzipped on Github with the prefix "_gzip"
    with open(f"/LODlit/Wikidata/gzip_results_clean_{lang}.json",'r') as jf:
        wd_all = json.load(jf)
        
    # import subset
    with open(f"/LODlit/Wikidata/wd_{lang}_subset.json",'r') as jf:
        wd_subset = json.load(jf)
        
    # all QIDs in the subset; built once as a set so the per-hit lookup is O(1)
    subset_qids = {hit["QID"] for hits in wd_subset.values() for hit in hits}
            
    # import rm; a single combined mask avoids chained boolean indexing
    # (and the pandas reindexing warning that comes with it)
    wd_rm = pd.read_csv("/LODlit/rm/rm_entities_unique.csv")
    is_wd_lang = (wd_rm["resource"] == "wikidata") & (wd_rm["lang"] == lang)
    rm_qids = set(wd_rm.loc[is_wd_lang, "entity_id"])
    
    # one compiled pattern per marker, in the order of the markers file
    patterns = [re.compile(f"\\b{marker}\\b", flags=re.IGNORECASE)
                for marker in implicit_markers["wikidata"][lang]]
    
    # matches are collected in a plain list and turned into a DataFrame once,
    # which is much cheaper than appending to the DataFrame row by row
    rows = []
    
    # searching in descriptions
    for term, hits in wd_all.items():
        lemma = get_lemma_by_term(term, lang)
        
        for hit in hits:
            qid = hit["QID"]

            # check entity level
            if qid in rm_qids:
                level = "3"
            elif qid in subset_qids:
                level = "2"
            else:
                level = "1"
                
            # a description is either a list of strings or a single string;
            # any other value (e.g. null in the JSON) is skipped
            description = hit["description"]
            if isinstance(description, list):
                descriptions = description
            elif isinstance(description, str):
                descriptions = [description]
            else:
                descriptions = []

            for pattern in patterns:
                for d in descriptions:
                    match = pattern.search(d)
                    if match:
                        rows.append(["wikidata",lang,lemma,qid,"description",d,match[0],level])
                        
    wd_implicit = pd.DataFrame(rows, columns=columns)
    return wd_implicit.drop_duplicates(["lemma","entity_id","value","implicit_marker"],ignore_index=True)

In [None]:
# Run the search for both languages: the merge step further down reads
# wd_en_implicit.csv as well as wd_nl_implicit.csv, so both files have to be
# written here for the notebook to run top to bottom on a fresh kernel.
# After the loop wd_implicit holds the "nl" result.
for wd_lang in ("en", "nl"):
    wd_implicit = search_implicit_markers_wd(wd_lang)
    # export csv (the row index is written as the first, unnamed column)
    wd_implicit.to_csv(f"wd_{wd_lang}_implicit.csv")

## AAT

In [None]:
def search_implicit_markers_aat(lang:str):
    '''
    Searching for implicit markers in AAT search results.
    For every hit, the scopeNote, the prefLabel_comment and each altLabel_comment
    are searched (case-insensitive, on word boundaries) for each marker listed
    under implicit_markers["aat"][lang].

    lang: str, "en" or "nl"
    Returns pandas DataFrame with the columns
    "resource","lang","lemma","entity_id","property","value","implicit_marker","level";
    rows with the same lemma, entity_id, property and implicit_marker are kept only once.
    "level" is "1" by default, "2" if the URI is in the subset file,
    "3" if the URI is in rm_entities_unique.csv (takes precedence over "2").
    '''
    
    columns = ["resource","lang","lemma","entity_id","property","value","implicit_marker","level"]
    
    # importing implicit markers dict
    with open('implicit_markers.json','r') as jf:
        implicit_markers = json.load(jf)
    
    # importing all search results
    with open(f"/LODlit/AAT/aat_query_results_{lang}.json",'r') as jf:
        aat_all = json.load(jf)
        
    # import subset
    with open(f"/LODlit/AAT/aat_{lang}_subset.json",'r') as jf:
        aat_subset = json.load(jf)
        
    # all AAT URIs in the subset; built once as a set so the per-hit lookup is O(1)
    subset_uris = {hit["aat_uri"] for hits in aat_subset.values() for hit in hits}
            
    # import rm; a single combined mask avoids chained boolean indexing
    # (and the pandas reindexing warning that comes with it)
    aat_rm = pd.read_csv("/LODlit/rm/rm_entities_unique.csv")
    is_aat_lang = (aat_rm["resource"] == "aat") & (aat_rm["lang"] == lang)
    rm_uris = set(aat_rm.loc[is_aat_lang, "entity_id"])
    
    # one compiled pattern per marker, in the order of the markers file
    patterns = [re.compile(f"\\b{marker}\\b", flags=re.IGNORECASE)
                for marker in implicit_markers["aat"][lang]]
    
    # matches are collected in a plain list and turned into a DataFrame once,
    # which is much cheaper than appending to the DataFrame row by row
    rows = []
    
    for term, hits in aat_all.items():
        lemma = get_lemma_by_term(term, lang)
        
        for hit in hits:
            uri = hit["aat_uri"]

            # check entity level
            if uri in rm_uris:
                level = "3"
            elif uri in subset_uris:
                level = "2"
            else:
                level = "1"
                
            for pattern in patterns:
                # searching in scopeNotes
                match = pattern.search(hit["scopeNote"])
                if match:
                    rows.append(["aat",lang,lemma,uri,"scopeNote",hit["scopeNote"],match[0],level])
                    
                # searching in prefLabel comments
                match = pattern.search(hit["prefLabel_comment"])
                if match:
                    rows.append(["aat",lang,lemma,uri,"prefLabel_comment",hit["prefLabel_comment"],match[0],level])
                    
                # searching in altLabel comments
                for c in hit["altLabel_comment"]:
                    match = pattern.search(c)
                    if match:
                        rows.append(["aat",lang,lemma,uri,"altLabel_comment",c,match[0],level])
                        
    aat_implicit = pd.DataFrame(rows, columns=columns)
    # NOTE(review): de-duplication is keyed on "property", not "value", so several
    # altLabel comments of one entity matching the same marker collapse into the
    # first one; search_implicit_markers_wd keys on "value" instead - confirm this
    # difference is intended
    return aat_implicit.drop_duplicates(["lemma","entity_id","property","implicit_marker"],ignore_index=True)

In [None]:
# Run the search for both languages: the merge step further down reads
# aat_en_implicit.csv as well as aat_nl_implicit.csv, so both files have to be
# written here for the notebook to run top to bottom on a fresh kernel.
# After the loop aat_implicit holds the "nl" result.
for aat_lang in ("en", "nl"):
    aat_implicit = search_implicit_markers_aat(aat_lang)
    # export csv (the row index is written as the first, unnamed column)
    aat_implicit.to_csv(f"aat_{aat_lang}_implicit.csv")

## PWN

In [None]:
# Searching for implicit markers in the definitions of the PWN search results.
# PWN is English only, so lemmas and markers are always taken from "en".
pwn_columns = ["resource","lang","lemma","entity_id","property","value","implicit_marker","level"]
    
# importing implicit markers dict
with open('implicit_markers.json','r') as jf:
    implicit_markers = json.load(jf)

# importing all search results
with open("/LODlit/PWN/pwn31_query_results.json",'r') as jf:
    pwn_all = json.load(jf)

# import subset
with open("/LODlit/PWN/pwn_subset.json",'r') as jf:
    pwn_subset = json.load(jf)

# get all synset IDs in the subset
subset = [hit["synset_id"] for hits in pwn_subset.values() for hit in hits]

# import rm
pwn_rm = pd.read_csv("/LODlit/rm/rm_entities_unique.csv")
rms = list(pwn_rm[pwn_rm["resource"] == "pwn"]["entity_id"])

# sets are built once here instead of once per hit inside the loop
subset_ids = set(subset)
rm_ids = set(rms)

# one compiled pattern per marker, in the order of the markers file
pwn_patterns = [re.compile(f"\\b{marker}\\b", flags=re.IGNORECASE)
                for marker in implicit_markers["pwn"]["en"]]

# matches are collected in a plain list and turned into a DataFrame once,
# which is much cheaper than appending to the DataFrame row by row
pwn_rows = []

for term, hits in pwn_all.items():
    lemma = get_lemma_by_term(term, "en")
        
    for hit in hits:

        # check entity level: "1" by default, "2" if in the subset,
        # "3" if in the rm file (takes precedence over "2")
        if hit["synset_id"] in rm_ids:
            level = "3"
        elif hit["synset_id"] in subset_ids:
            level = "2"
        else:
            level = "1"
                
        for pattern in pwn_patterns:
            # searching in definitions
            match = pattern.search(hit["definition"])
            if match:
                pwn_rows.append(["pwn","en",lemma,hit["synset_id"],"definition",hit["definition"],match[0],level])

pwn_implicit = pd.DataFrame(pwn_rows, columns=pwn_columns)

In [None]:
# keep one row per (lemma, entity, value, marker) combination, then export csv
pwn_unique = pwn_implicit.drop_duplicates(
    subset=["lemma","entity_id","value","implicit_marker"],
    ignore_index=True,
)
pwn_unique.to_csv("pwn_implicit.csv")

In [None]:
# There are no implicit markers in ODWN

In [None]:
### merge wikidata

In [None]:
# load the per-language Wikidata results (English first, then Dutch)
# NOTE(review): read_csv is called without index_col; if these files were written
# with their row index (the to_csv default), that index comes back as an extra
# "Unnamed: 0" column and is carried into the merged file - confirm this is intended
wd_en_impl, wd_nl_impl = map(pd.read_csv, ("wd_en_implicit.csv", "wd_nl_implicit.csv"))

# stack both tables under a fresh 0..n-1 index
frames = [wd_en_impl, wd_nl_impl]
wd_impl = pd.concat(frames, ignore_index=True)

# export the merged table
wd_impl.to_csv("wd_implicit.csv")

In [None]:
### merge aat

In [None]:
# load the per-language AAT results (English first, then Dutch)
# NOTE(review): read_csv is called without index_col; if these files were written
# with their row index (the to_csv default), that index comes back as an extra
# "Unnamed: 0" column and is carried into the merged file - confirm this is intended
aat_en_impl, aat_nl_impl = map(pd.read_csv, ("aat_en_implicit.csv", "aat_nl_implicit.csv"))

# stack both tables under a fresh 0..n-1 index
frames = [aat_en_impl, aat_nl_impl]
aat_impl = pd.concat(frames, ignore_index=True)

# export the merged table
aat_impl.to_csv("aat_implicit.csv")