In [1]:
import pandas as pd
import csv
import json

In [44]:
def get_top_10_overlap(resource:str, lang:str, data_dir:str="/Users/anesterov/reps/LODlit/cs", out_dir:str="cs"):
    '''
    Writing a CSV with, per term, the share of Top-10 hits based on related matches (RM)
    that are also among the Top-10 hits based on WM
    resource: str, the name of the resource, "wikidata", "aat", "pwn", or "odwn"
    lang: str, "en" or "nl"; part of the input file names only for "wikidata" and "aat",
          always part of the output file name
    data_dir: str, directory with the top_10_rm_* and top_10_wm_* CSV files
    out_dir: str, directory where overlap_top_10_{resource}_{lang}.csv is written
    Returns None; the output has the columns "term" and "overlap"
    Raises ValueError if resource is not one of the four names above
    '''
    # "pwn" and "odwn" have one file per ranking, without a language suffix
    if resource in ("wikidata", "aat"):
        suffix = f"{resource}_{lang}"
    elif resource in ("pwn", "odwn"):
        suffix = resource
    else:
        # fail before anything is written instead of hitting a NameError later
        raise ValueError(f"Unknown resource {resource!r}: expected 'wikidata', 'aat', 'pwn' or 'odwn'")

    # rows without a hit_id carry no entity and are ignored
    rm = pd.read_csv(f"{data_dir}/top_10_rm_{suffix}.csv").dropna(subset=["hit_id"])
    wm = pd.read_csv(f"{data_dir}/top_10_wm_{suffix}.csv").dropna(subset=["hit_id"])

    # grouping WM once instead of once per RM term
    wm_hits_by_term = {term: set(hit_ids) for term, hit_ids in wm.groupby("term")["hit_id"]}

    rows = []
    for term, rm_hit_ids in rm.groupby("term")["hit_id"]:
        hits_rm = set(rm_hit_ids)
        # a term without any WM hit has an overlap of 0 (get_group would raise KeyError)
        hits_wm = wm_hits_by_term.get(term, set())
        # hits_rm is never empty here: every group holds at least one non-null hit_id
        rows.append([term, len(hits_rm & hits_wm) / len(hits_rm)])

    # the file is opened only after the inputs were read successfully,
    # so a failed run does not leave a header-only file behind;
    # newline="" is what the csv module expects for files passed to csv.writer
    with open(f"{out_dir}/overlap_top_10_{resource}_{lang}.csv", "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["term", "overlap"])
        writer.writerows(rows)

In [50]:
# For "pwn" the input files carry no language suffix; lang only ends up in the output file name
get_top_10_overlap(resource="pwn", lang="en")

### Comparing overlap of Top-10 by RM and WM per term per resource per lang

In [3]:
# Columns of the combined table: term, followed by one overlap column each for wd_en, aat_en and pwn

In [7]:
# Loading the per-term overlap tables of the English resources (Wikidata, AAT, PWN)
wd_en_overlap, aat_en_overlap, pwn_overlap = map(
    pd.read_csv,
    (
        "/Users/anesterov/reps/LODlit/cs/overlap_top_10_wikidata_en.csv",
        "/Users/anesterov/reps/LODlit/cs/overlap_top_10_aat_en.csv",
        "/Users/anesterov/reps/LODlit/cs/overlap_top_10_pwn_en.csv",
    ),
)

In [16]:
# Left join on "term": every Wikidata term is kept, AAT values are NaN where a term is missing;
# the clashing "overlap" columns become "overlap_wd" and "overlap_aat"
wd_aat = wd_en_overlap.join(
    aat_en_overlap.set_index("term"),
    on="term",
    lsuffix="_wd",
    rsuffix="_aat",
)

In [20]:
# Giving the PWN overlap column a resource-specific name before it is joined to the other tables
pwn_overlap.rename(
    columns={"overlap": "pwn_overlap"},
    inplace=True,
)

In [21]:
# Adding the PWN column to the Wikidata + AAT table (left join on "term")
all_resources = wd_aat.join(
    pwn_overlap.set_index("term"),
    on="term",
)

In [22]:
# Preview of the combined per-term overlap table for the English resources
all_resources

Unnamed: 0,term,overlap_wd,overlap_aat,pwn_overlap
0,aboriginal,0.2,0.5,1.0
1,aboriginals,0.7,1.0,
2,allochtoon,1.0,,
3,baboo,0.6,,1.0
4,barbarian,0.5,1.0,1.0
...,...,...,...,...
131,westerns,0.6,1.0,
132,white,0.2,0.2,0.5
133,whiter,0.9,1.0,1.0
134,whites,0.5,1.0,1.0


In [23]:
# Saving the combined English table; the DataFrame index is written as the first, unnamed column
all_resources.to_csv(path_or_buf="overlap_by_term_en.csv")

In [24]:
# NL
# Loading the per-term overlap tables of the NL resources (Wikidata, AAT, ODWN)
wd_nl_overlap, aat_nl_overlap, odwn_overlap = map(
    pd.read_csv,
    (
        "/Users/anesterov/reps/LODlit/cs/overlap_top_10_wikidata_nl.csv",
        "/Users/anesterov/reps/LODlit/cs/overlap_top_10_aat_nl.csv",
        "/Users/anesterov/reps/LODlit/cs/overlap_top_10_odwn_nl.csv",
    ),
)

In [26]:
# Left join on "term": every Wikidata term is kept, AAT values are NaN where a term is missing;
# the clashing "overlap" columns become "overlap_wd" and "overlap_aat"
wd_aat_nl = wd_nl_overlap.join(
    aat_nl_overlap.set_index("term"),
    on="term",
    lsuffix="_wd",
    rsuffix="_aat",
)

In [28]:
# Giving the ODWN overlap column a resource-specific name before it is joined to the other tables
odwn_overlap.rename(
    columns={"overlap": "odwn_overlap"},
    inplace=True,
)

In [29]:
# Adding the ODWN column to the Wikidata + AAT table (left join on "term")
all_overlap_nl = wd_aat_nl.join(
    odwn_overlap.set_index("term"),
    on="term",
)

In [31]:
# Saving the combined NL table; the DataFrame index is written as the first, unnamed column
all_overlap_nl.to_csv(path_or_buf="overlap_by_term_nl.csv")

### Getting additional entities from WM ranking

In [26]:
def get_rm_wm_difference(resource:str, lang:str, data_dir:str="/Users/anesterov/reps/LODlit/cs"):
    '''
    Getting entities from Top-10 based on WM text CS similarity
    (the difference between Top-10 based on related matches and WM)
    resource: str, the name of the resource, "wikidata", "aat", "pwn", or "odwn"
    lang: str, "en" or "nl"; not used for "pwn" and "odwn", whose files have no language suffix
    data_dir: str, directory with the top_10_rm_* and top_10_wm_* CSV files
    Returns a pandas DataFrame with the columns "term" and "hit_id",
    one row per entity that is in the WM Top-10 of a term but not in its RM Top-10
    Only terms with at least one RM hit are visited; terms found only in the WM file are skipped
    Raises ValueError if resource is not one of the four names above
    '''
    # "pwn" and "odwn" have one file per ranking, without a language suffix
    if resource in ("wikidata", "aat"):
        suffix = f"{resource}_{lang}"
    elif resource in ("pwn", "odwn"):
        suffix = resource
    else:
        # explicit error instead of a NameError on the first use of rm
        raise ValueError(f"Unknown resource {resource!r}: expected 'wikidata', 'aat', 'pwn' or 'odwn'")

    # rows without a hit_id carry no entity and are ignored
    rm = pd.read_csv(f"{data_dir}/top_10_rm_{suffix}.csv").dropna(subset=["hit_id"])
    wm = pd.read_csv(f"{data_dir}/top_10_wm_{suffix}.csv").dropna(subset=["hit_id"])

    # grouping WM once instead of once per RM term
    wm_hits_by_term = {term: set(hit_ids) for term, hit_ids in wm.groupby("term")["hit_id"]}

    records = []
    for term, rm_hit_ids in rm.groupby("term")["hit_id"]:
        # a term without any WM hit contributes nothing (get_group would raise KeyError)
        additional = wm_hits_by_term.get(term, set()).difference(set(rm_hit_ids))
        # sets have no stable order; sorting keeps the rows identical between runs
        records.extend((term, entity_id) for entity_id in sorted(additional, key=str))

    # building the frame once instead of appending row by row
    return pd.DataFrame(records, columns=["term","hit_id"])

In [27]:
# Wikidata, EN: entities in the WM Top-10 that are absent from the RM Top-10
wm_additional_wd_en = get_rm_wm_difference(resource="wikidata", lang="en")
wm_additional_wd_en.to_csv(path_or_buf="cs/wm_additional_wd_en.csv")

In [28]:
# Wikidata, NL: entities in the WM Top-10 that are absent from the RM Top-10
wm_additional_wd_nl = get_rm_wm_difference(resource="wikidata", lang="nl")
wm_additional_wd_nl.to_csv(path_or_buf="cs/wm_additional_wd_nl.csv")

In [29]:
# AAT, EN: entities in the WM Top-10 that are absent from the RM Top-10
wm_additional_aat_en = get_rm_wm_difference(resource="aat", lang="en")
wm_additional_aat_en.to_csv(path_or_buf="cs/wm_additional_aat_en.csv")

In [30]:
# AAT, NL: entities in the WM Top-10 that are absent from the RM Top-10
wm_additional_aat_nl = get_rm_wm_difference(resource="aat", lang="nl")
wm_additional_aat_nl.to_csv(path_or_buf="cs/wm_additional_aat_nl.csv")

In [31]:
# PWN: entities in the WM Top-10 that are absent from the RM Top-10
# (lang is not used for "pwn": its input files have no language suffix)
wm_additional_pwn = get_rm_wm_difference(resource="pwn", lang="en")
wm_additional_pwn.to_csv(path_or_buf="cs/wm_additional_pwn.csv")

In [32]:
# ODWN: entities in the WM Top-10 that are absent from the RM Top-10
# (lang is not used for "odwn": its input files have no language suffix)
wm_additional_odwn = get_rm_wm_difference(resource="odwn", lang="nl")
wm_additional_odwn.to_csv(path_or_buf="cs/wm_additional_odwn.csv")