In [1]:
import json
from thefuzz import process
import itertools
from itertools import groupby

In [None]:
# suggestions can be found in the same properties as query terms,
# e.g. {"suggestions": {"pref": [('intersex', 90)]}}

In [None]:
# Literal properties that are searched for suggestions, per resource (in output order)
_SUGGESTION_FIELDS = {
    "aat": ("prefLabel", "altLabel", "scopeNote", "prefLabel_comment", "altLabel_comment"),
    "wd": ("prefLabel", "aliases", "description"),
    "pwn": ("lemmata", "definition", "examples"),
    "odwn": ("le_written_form", "sense_definition", "synset_definitions", "sense_examples"),
}

# ODWN properties that may be absent from a hit; every other property is a required key
_OPTIONAL_FIELDS = frozenset({"le_written_form", "sense_definition", "sense_examples"})


def _literal_values(value) -> list:
    '''
    Normalising the value of a literal property to a list of non-empty strings;
    value: a str, a list of str, or anything else (e.g. None), which gives an empty list
    '''
    if isinstance(value, str):
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    # empty strings cannot match a suggestion, non-string entries cannot be matched at all
    return [v for v in value if isinstance(v, str) and v]


def _suggestion_matches(value, candidates:list, query_term:str, min_score:int) -> list:
    '''
    Fuzzy matching every literal in value against the candidate suggestions;
    keeps matches that score above min_score and whose lowercased suggestion differs from query_term
    Returns list of (suggestion, score) tuples sorted by suggestion, one per suggestion with its max score
    '''
    matches = []
    for literal in _literal_values(value):
        # NOTE(review): process.extract is called with its default limit (5 in thefuzz),
        # so only the best-scoring candidates per literal are considered -- confirm this is intended
        matches.extend(tup for tup in process.extract(literal, candidates)
                       if tup[1] > min_score and tup[0].lower() != query_term)

    # the tuples in a group share the suggestion, so max() picks the one with the highest score
    return [max(group) for _, group in itertools.groupby(sorted(matches), key=lambda t: t[0])]


def find_suggestions(resource:str, lang:str, resource_file_path:str,
                     suggestions_dir:str='/Users/anesterov/reps/LODlit/RQ2', min_score:int=90) -> dict:
    '''
    Finding suggestions from WM in the literal values of entities based on fuzzy string matching (thefuzz library);
    there can be an exact match (score = 100) or a near match (score > min_score), matches with lower scores are not included;
    suggestions whose lowercased form equals the query term of a hit are excluded;
    
    resource: str, 'wd', 'aat', 'pwn', or 'odwn'
    lang: str, 'en' or 'nl'
    resource_file_path: str, path to the file with query results of a resource (related matches, subset, all results)
    suggestions_dir: str, directory with the files 'en_suggestions.json' and 'nl_suggestions.json'
    min_score: int, a match has to score above this value to be kept (90 by default)
    Returns dict {term: [hits]} with only the hits in which at least one suggestion was found;
        every returned hit gets the key 'suggestions' with the matches per property,
        for example, {'suggestions':{'prefLabel':[("match",100)]}};
    Raises ValueError if resource or lang is not supported
    '''
    
    # fail early instead of silently returning empty results or hitting an unbound name
    if resource not in _SUGGESTION_FIELDS:
        raise ValueError(f"Unknown resource '{resource}': expected one of {sorted(_SUGGESTION_FIELDS)}")
    if lang not in ("en", "nl"):
        raise ValueError(f"Unknown lang '{lang}': expected 'en' or 'nl'")
    
    # load suggestions per lang
    with open(f"{suggestions_dir}/{lang}_suggestions.json", 'r', encoding='utf-8') as jf:
        suggestions_dict = json.load(jf)
    
    # load the resource data
    with open(resource_file_path, 'r', encoding='utf-8') as jf:
        resource_data = json.load(jf)

    results = {}
    
    for term, hits in resource_data.items():
        hits_with_suggestions = []
        
        for hit in hits:
            # searching suggestions in literals of every hit
            # this will allow to compare in which properties the query terms and suggestions were found
            candidates = suggestions_dict[term]
            suggestions = {}
            
            for field in _SUGGESTION_FIELDS[resource]:
                value = hit.get(field) if field in _OPTIONAL_FIELDS else hit[field]
                # every property starts from an empty list of matches, nothing is carried over between hits
                suggestions[field] = _suggestion_matches(value, candidates, hit["query_term"], min_score)
            
            hit["suggestions"] = suggestions
            
            # if there are suggestions found in any property, keep the hit
            if any(suggestions.values()):
                hits_with_suggestions.append(hit)
                
        results[term] = hits_with_suggestions

    return results

In [None]:
# find suggestions in the Dutch Wikidata results file (the input path is machine-specific)
sug_all_wd_nl = find_suggestions('wd','nl','/Users/anesterov/LODlit_local/wd/jan31/results_clean_nl.json')

In [None]:
# saving the Dutch Wikidata hits with suggestions as a json file in the working directory
with open('sug_all_wd_nl.json', 'w') as jf:
    jf.write(json.dumps(sug_all_wd_nl))

### Suggestions stats

In [3]:
# loading the suggestions per lang to count them below
with open('/Users/anesterov/reps/LODlit/RQ2/en_suggestions.json','r') as jf:
    en_suggestions = json.loads(jf.read())

with open('/Users/anesterov/reps/LODlit/RQ2/nl_suggestions.json','r') as jf:
    nl_suggestions = json.loads(jf.read())

In [8]:
# flattening the Dutch suggestions of all terms into one list (duplicates are still included here)
sug_unique = []
for term, sug in nl_suggestions.items():
    sug_unique += sug

In [9]:
# N of distinct Dutch suggestions
len(set(sug_unique))

48

In [None]:
# related matches: English Wikidata hits with suggestions
with open('/Users/anesterov/reps/LODlit/RQ2/sug_rm_wd_en.json','r') as jf:
    sug_rm_wd_en = json.loads(jf.read())

In [35]:
def get_suggestions_count(resource:str, path_to_dataset:str) -> dict:
    '''
    Counting exact match (score = 100) and near match (any other score) suggestions in a file with hits
    resource: str, "wd", "aat", "pwn", or "odwn"
    path_to_dataset: str, path to a json file with hits, {term: [hits]}, where every hit has the key "suggestions"
    Returns dict with four counts:
        "n_entities_exact", "n_entities_near": N of unique entities that have at least one exact / near match;
        "hits_exact", "hits_near": N of exact / near matches summed over all hits and properties
        (a hit with several matches is counted once per match)
    Raises ValueError if resource is not supported
    '''
    
    # key with the entity ID per resource; ODWN is handled separately in _entity_id
    entity_id_keys = {"wd": "QID", "aat": "aat_uri", "pwn": "synset_id"}
    
    # fail early instead of hitting an unbound entity key after the counting
    if resource != "odwn" and resource not in entity_id_keys:
        raise ValueError(f"Unknown resource '{resource}': expected 'wd', 'aat', 'pwn', or 'odwn'")
    
    def _entity_id(hit:dict):
        # ODWN hits are identified by "synset_id" if it is not empty, by "le_id" otherwise
        if resource == "odwn":
            return hit["synset_id"] if hit.get("synset_id") else hit["le_id"]
        return hit[entity_id_keys[resource]]
    
    with open(path_to_dataset,'r') as jf:
        data = json.load(jf)
        
    entities_exact = set()
    entities_near = set()
    hits_exact = 0
    hits_near = 0

    for sug_hits in data.values():
        for hit in sug_hits:
            for matches in hit["suggestions"].values():
                for m in matches:
                    # m is a (suggestion, score) pair
                    if m[1] == 100:
                        hits_exact += 1
                        entities_exact.add(_entity_id(hit))
                    else:
                        hits_near += 1
                        entities_near.add(_entity_id(hit))

    return {
        "n_entities_exact": len(entities_exact),
        "n_entities_near": len(entities_near),
        "hits_exact": hits_exact,
        "hits_near": hits_near,
    }

In [37]:
# exact and near match counts for the English Wikidata related matches
get_suggestions_count("wd","sug_rm_wd_en.json")

{'n_entities_exact': 7, 'n_entities_near': 2, 'hits_exact': 17, 'hits_near': 6}

### Check unique entities with suggestions in the subset and all search results

In [13]:
def get_unique_entities_with_suggestions(resource:str,lang:str,to_compare:tuple) -> dict:
    '''
    Getting the entities with suggestions that are in the second set of results but not in the first one
    resource: str, "wd", "aat", "pwn", or "odwn"
    lang: str, used in the file names; ignored for "pwn" and "odwn", their file names have no lang
    to_compare: tuple; ("rm","subset") or ("subset","all")
    The files "sug_{level}_{resource}_{lang}.json" ("sug_{level}_{resource}.json" for "pwn" and "odwn")
    are read from the working directory
    Returns dict with one key, "{resource}_{lang}" (or "{resource}"): list of entity IDs in no particular order
    Raises ValueError if resource is not supported
    '''
    
    # key with the entity ID per resource; ODWN is handled separately in _entity_id
    entity_id_keys = {"wd": "QID", "aat": "aat_uri", "pwn": "synset_id"}
    
    # fail early instead of hitting an unbound entity key after the files are loaded
    if resource != "odwn" and resource not in entity_id_keys:
        raise ValueError(f"Unknown resource '{resource}': expected 'wd', 'aat', 'pwn', or 'odwn'")
    
    lang_suffix = '' if resource in ("odwn", "pwn") else "_" + lang
    
    def _entity_id(hit:dict):
        # ODWN hits are identified by "synset_id" if it is not empty, by "le_id" otherwise
        if resource == "odwn":
            return hit["synset_id"] if hit.get("synset_id") else hit["le_id"]
        return hit[entity_id_keys[resource]]
    
    def _load_entity_ids(level:str) -> set:
        # unique IDs of all entities that have hits in the file of this level
        with open(f"sug_{level}_{resource}{lang_suffix}.json",'r') as jf:
            hits_per_term = json.load(jf)
        return {_entity_id(hit) for sug_hits in hits_per_term.values() for hit in sug_hits}
    
    e_with_sug_1 = _load_entity_ids(to_compare[0])
    e_with_sug_2 = _load_entity_ids(to_compare[1])
    
    # set membership instead of rebuilding a list for every checked entity
    unique_e = [e for e in e_with_sug_2 if e not in e_with_sug_1]
    
    return {f"{resource}{lang_suffix}": unique_e}

In [60]:
# English Wikidata entities with suggestions that are in all search results but not in the subset
ent = get_unique_entities_with_suggestions("wd","en",("subset","all"))

In [61]:
# the IDs of these entities
ent

{'wd_en': ['Q4117169',
  'Q48801961',
  'Q459387',
  'Q12060728',
  'Q475027',
  'Q8060',
  'Q11387167',
  'Q112918934',
  'Q97690709',
  'Q3200179',
  'Q1145774',
  'Q6025468',
  'Q4668353',
  'Q9064330',
  'Q37178',
  'Q97703712',
  'Q19860',
  'Q100379957']}

In [62]:
# N of these entities
len(ent["wd_en"])

18

### Are suggestions used when contentious terms are found in prefLabel?

In [None]:
# related matches

In [None]:
# English Wikidata related matches with suggestions
with open('/Users/anesterov/reps/LODlit/RQ2/sug_rm_wd_en.json','r') as jf:
    sug_rm_wd_en = json.loads(jf.read())

In [4]:
def check_suggestions(resource:str,lang:str,level:str,search_suggestions_in:str) -> tuple:
    '''
    Checking whether suggestions are used in one type of label when query terms are found in the other one
    resource: 'wd', 'aat'
    lang: str, used in the file name
    level: 'rm', 'subset', 'all'
    search_suggestions_in: 'pref', 'alt';
        'pref': hits with the query term found in altLabel / aliases and suggestions found in prefLabel;
        'alt': hits with the query term found in prefLabel and suggestions found in altLabel / aliases
    The file "sug_{level}_{resource}_{lang}.json" is read from the working directory
    Returns tuple (str, dict): a summary string with N of unique entities, their IDs and N of hits,
        and a dict {term: [hits]} with the selected hits (terms without such hits are not included)
    Raises ValueError if resource or search_suggestions_in is not supported
    '''
    
    # (prefLabel key, altLabel key, entity ID key) per resource
    label_keys = {
        "wd": ("prefLabel", "aliases", "QID"),
        "aat": ("prefLabel", "altLabel", "aat_uri"),
    }
    
    # fail early instead of hitting unbound names or silently returning an empty result
    if resource not in label_keys:
        raise ValueError(f"Unknown resource '{resource}': expected 'wd' or 'aat'")
    if search_suggestions_in not in ("pref", "alt"):
        raise ValueError(f"Unknown search_suggestions_in '{search_suggestions_in}': expected 'pref' or 'alt'")
    
    pref, alt, entity_key = label_keys[resource]
    
    # the query term is found in one type of label, suggestions are searched in the other one
    if search_suggestions_in == 'pref':
        term_found_in, suggestions_key = alt, pref
    else:
        term_found_in, suggestions_key = pref, alt
    
    with open(f"sug_{level}_{resource}_{lang}.json",'r') as jf:
        hits_with_suggestions = json.load(jf)
    
    results = {}
    n_hits = 0
    for term, sug_hits in hits_with_suggestions.items():
        hits_per_term = [hit for hit in sug_hits
                         if hit["found_in"] == term_found_in and len(hit["suggestions"][suggestions_key]) > 0]
        if hits_per_term:
            results[term] = hits_per_term
            n_hits += len(hits_per_term)
    
    # get unique entities
    e_unique = {hit[entity_key] for hits in results.values() for hit in hits}
    
    string_to_return = f"Suggestions in {search_suggestions_in}. N unique e: {len(e_unique)} {e_unique}, N hits all: {n_hits}"
        
    return string_to_return,results

In [89]:
# AAT (en) subset: hits with the query term found in altLabel and suggestions found in prefLabel
check_suggestions("aat","en","subset","pref")

("Suggestions in pref. N unique e: 2 {'300018279', '300404576'}, N hits all: 2",
 {'aboriginal': [{'query_term': 'aboriginal',
    'aat_uri': '300404576',
    'found_in': 'altLabel',
    'prefLabel': 'indigenous art',
    'prefLabel_comment': '',
    'altLabel': ['art, indigenous', 'aboriginal art'],
    'altLabel_comment': [],
    'scopeNote': 'Art and cultural works produced by the original inhabitants of an area, as contrasted to works produced by descendants of colonists to the area. A primary usage of the term refers to cultures who pre-existed European colonialism in the Americas, Africa, and Oceania.',
    'suggestions': {'prefLabel': [['Indigenous', 95]],
     'altLabel': [],
     'scopeNote': [],
     'prefLabel_comment': [],
     'altLabel_comment': []}}],
  'oriental': [{'query_term': 'oriental',
    'aat_uri': '300018279',
    'found_in': 'altLabel',
    'prefLabel': 'Asian',
    'prefLabel_comment': '',
    'altLabel': ['Oriental (Asian)'],
    'altLabel_comment': [],
    