In [None]:
import json
import csv
import pandas as pd
import gzip

In [None]:
# load the query terms once; query_terms[lang] maps each lemma to its query terms
with open('/Users/anesterov/reps/LODlit/query_terms.json','r') as jf:
    query_terms = json.loads(jf.read())

In [None]:
# load the background info bows (background_info_bows.json), keyed by language
with open('/Users/anesterov/reps/LODlit/bg/background_info_bows.json','r') as jf:
    bg_info = json.loads(jf.read())

### 1. N terms found | N found terms with RMs per resource

#### Wikidata EN

In [None]:
# (1) query terms total
# (2) query terms by lemmas total
# (3) found query terms
# (4) found lemmas
# (5) query terms with RM
# (6) lemmas with RM

In [None]:
def _search_results_file(resource:str, lang:str) -> tuple:
    '''
    Picks the search results file of a resource.

    resource: str, "wikidata", "aat", "pwn", "odwn"
    lang: str, "en" or "nl"; only used for "wikidata" and "aat",
          "pwn" and "odwn" have a single file each

    Returns a tuple (path: str, gzipped: bool)
    Raises ValueError if there is no file for this resource / language
    '''
    if resource in ("wikidata", "aat") and lang not in ("en", "nl"):
        raise ValueError(f'lang must be "en" or "nl" for {resource}, got {lang!r}')

    if resource == "wikidata":
        return f"/Users/anesterov/reps/LODlit/Wikidata/gzip_wd_bows_{lang}.json", True
    if resource == "aat":
        return f"/Users/anesterov/reps/LODlit/AAT/aat_bows_{lang}.json", False
    if resource == "pwn":
        return "/Users/anesterov/reps/LODlit/PWN/pwn31_bows.json", False
    if resource == "odwn":
        return "/Users/anesterov/reps/LODlit/ODWN/odwn_bows.json", False

    raise ValueError(f'resource must be "wikidata", "aat", "pwn" or "odwn", got {resource!r}')


def get_stats_by_resource(resource:str, lang:str) -> dict:
    '''
    Counts query terms and lemmas that were found in a resource
    and how many of them have an entry for this resource in the background info (RM).

    resource: str, "wikidata", "aat", "pwn", "odwn"
    lang: str, "en" or "nl"; selects the query terms and the background info,
          and for "wikidata" and "aat" also the search results file

    A query term is "found" if it has at least one hit in the search results;
    a lemma is "found" if at least one of its query terms is found.
    A found lemma counts as "with RM" if any of its query terms (found or not)
    has an entry for the resource in the background info.

    Returns a dict with the keys:
        "stats": str, summary of the six counts
        "lemmas_found_with_rm": set of found lemmas with RM
        "query_terms_found_with_rm": set of found query terms with RM
    Raises ValueError if the resource / language combination is not supported
    '''
    # checking the arguments first, so that a typo fails before any file is read
    results_path, gzipped = _search_results_file(resource, lang)

    # reading query terms
    with open('/Users/anesterov/reps/LODlit/query_terms.json','r') as jf:
        query_terms = json.load(jf)

    # reading background info
    with open('/Users/anesterov/reps/LODlit/bg/background_info_bows.json','r') as jf:
        bg_info = json.load(jf)

    # reading appropriate file
    if gzipped:
        with gzip.open(results_path, 'r') as gzip_json:
            search_results = json.loads(gzip_json.read().decode('utf-8'))
    else:
        with open(results_path,'r') as jf:
            search_results = json.load(jf)

    terms_by_lemma = query_terms[lang]

    # sets: only distinct values are reported and membership is tested in the loops below
    found_terms = set()
    total_lemmas = set()
    found_lemmas = set()

    for term, hits in search_results.items():
        has_hits = len(hits) > 0
        if has_hits:
            found_terms.add(term)
        for lemma, qt in terms_by_lemma.items():
            if term in qt:
                total_lemmas.add(lemma)
                if has_hits:
                    found_lemmas.add(lemma)

    query_terms_found_with_rm = set()
    lemmas_found_with_rm = set()

    for term, rms in bg_info[lang].items():
        # terms without an entry for this resource cannot contribute an RM
        if rms.get(resource) is None:
            continue
        if term in found_terms:
            query_terms_found_with_rm.add(term)
        for lemma in found_lemmas:
            if term in terms_by_lemma[lemma]:
                lemmas_found_with_rm.add(lemma)

    stats_str = (f"total query terms: {len(search_results)}, "
                 f"found query terms: {len(found_terms)}, "
                 f"query terms with RM: {len(query_terms_found_with_rm)}, "
                 f"total lemmas: {len(total_lemmas)}, "
                 f"found lemmas: {len(found_lemmas)}, "
                 f"lemmas with RM: {len(lemmas_found_with_rm)}")

    result_dict = {"stats":stats_str,
                   "lemmas_found_with_rm":lemmas_found_with_rm,
                   "query_terms_found_with_rm":query_terms_found_with_rm}

    return result_dict

In [None]:
# stats for AAT, English query terms (the returned dict is displayed as the cell output)
get_stats_by_resource(resource="aat", lang="en")

## 2. N RMs per term / per lemma in Top-10 per resource

In [None]:
def _related_matches_by_query_term(resource:str, lang:str) -> dict:
    '''
    Builds a dict of related matches per query term.

    resource: str, "wikidata", "aat", "pwn", or "odwn"
    lang: str, only used for "wikidata" and "aat" to select entries by their "lang"

    Returns {query term: related match}; the value is a single ID for
    "wikidata" and "aat" and a list of IDs for "pwn" and "odwn"
    '''
    # related matches in odwn are in a separate file
    if resource == "odwn":
        with open('/Users/anesterov/reps/LODlit/bg/related_matches_odwn.json','r') as jf:
            rm = json.load(jf)
    else:
        with open('/Users/anesterov/reps/wordsmatter/related_matches/rm.json','r') as jf:
            rm = json.load(jf)

    query_terms_rms = {}
    for values in rm.values():
        if resource == "odwn":
            matches = []
            if values["odwn_le"][0] != "None":
                matches.extend(values["odwn_le"])
            # NOTE(review): assumes "odwn_synsets" is either "" or a list of IDs;
            # a non-empty string would be added character by character - confirm
            if values["odwn_synsets"] != "":
                matches.extend(values["odwn_synsets"])
        elif resource == "pwn":
            if values["related_matches"]["pwn"][0] == "None":
                continue
            matches = values["related_matches"]["pwn"] # a list
        else:
            # the language is checked first: entries of the other language are not inspected further
            if values["lang"] != lang:
                continue
            if values["related_matches"][resource][0] == "None":
                continue
            matches = values["related_matches"][resource][0]

        for query_term in values["query_terms"]:
            query_terms_rms[query_term] = matches

    return query_terms_rms


def get_n_rm_in_top10(resource:str,lang:str,metric:str, top=9) -> str:
    '''
    Counts lemmas and query terms whose related match (RM) is among
    the top-ranked hits of the resource.

    resource: str, "wikidata", "aat", "pwn", or "odwn";
    lang: str, "en" or "nl";
    metric: str, "rm", "wm", or "rmwm";
    top: int, number of first rows per term taken from the top-10 file, default 9

    Only lemmas returned by get_stats_by_resource as "lemmas_found_with_rm" are counted;
    for "pwn" these are always the "en" lemmas and for "odwn" the "nl" lemmas.
    Relies on query_terms (loaded in an earlier cell) and on get_stats_by_resource.
    A query term that has no rows in the top-10 file counts as having no RM in the top rows.

    Returns a str with the number of lemmas with RM, the number of query terms with RM,
    and a dict of the lemmas without RM
    Raises ValueError if the resource is not supported
    '''
    # NOTE(review): rows are taken with [:top], so the default top=9 inspects 9 rows
    # per term, not 10 - confirm whether the default was meant to be 10

    # checking the resource before any file is read
    if resource not in ("wikidata", "aat", "pwn", "odwn"):
        raise ValueError(f'resource must be "wikidata", "aat", "pwn" or "odwn", got {resource!r}')

    # reading appropriate files
    if resource == "wikidata" or resource == "aat":
        top_10 = pd.read_csv(f"/Users/anesterov/reps/LODlit/cs/top_10_{metric}_{resource}_{lang}.csv")
        stats_lang = lang
    else:
        top_10 = pd.read_csv(f"/Users/anesterov/reps/LODlit/cs/top_10_{metric}_{resource}.csv")
        stats_lang = "en" if resource == "pwn" else "nl"

    # making a dict of related matches per query term
    query_terms_rms = _related_matches_by_query_term(resource, lang)

    # looking only at the found lemmas
    found_lemmas_with_rms = get_stats_by_resource(resource, stats_lang)["lemmas_found_with_rm"]

    # grouping the table once: hit IDs of the first rows per term
    top_hit_ids = {}
    for term, term_group in top_10.groupby("term", sort=False):
        hit_ids = list(term_group[:top]["hit_id"])
        if resource == "aat":
            # AAT IDs may be read as floats, the related matches are strings
            hit_ids = [str(i).replace('.0','') for i in hit_ids]
        top_hit_ids[term] = hit_ids

    # count N rms in top-10
    lemmas_with_rm = set()
    query_terms_with_rm = set()
    lemmas_without_rm = {}

    for lemma, qt in query_terms[lang].items():
        if lemma not in found_lemmas_with_rms:
            continue

        rm_flag = False
        for t in qt:
            related = query_terms_rms.get(t)
            if not related:
                continue
            # wikidata and aat have a single related match, pwn and odwn have a list
            candidates = related if resource in ("pwn", "odwn") else [related]
            hits_of_term = top_hit_ids.get(t, [])
            if any(candidate in hits_of_term for candidate in candidates):
                query_terms_with_rm.add(t)
                rm_flag = True

        if rm_flag:
            lemmas_with_rm.add(lemma)
        else:
            # NOTE(review): the dict is keyed by query terms, so this is None
            # unless the lemma itself is also a query term - confirm this is intended
            lemmas_without_rm[lemma] = query_terms_rms.get(lemma)

    stats_str = (f"lemmas with RM: {len(lemmas_with_rm)}, "
                 f" query terms with RM: {len(query_terms_with_rm)}, "
                 f" lemmas without RM: {lemmas_without_rm}")

    return stats_str

In [None]:
# ODWN, "wm" metric, only the first row per term (the returned string is displayed as the cell output)
get_n_rm_in_top10(resource="odwn", lang="nl", metric="wm", top=1)