In [1]:
import json
import csv
import pandas as pd

### Importing related matches

In [10]:
with open('/Users/anesterov/reps/wordsmatter/related_matches/rm.json','r') as jf:
    rms = json.load(jf)

In [11]:
# related matches from ODWN are in a separate file
with open('/Users/anesterov/reps/LODlit/bg/related_matches_odwn.json','r') as jf:
    rms_odwn = json.load(jf)

In [12]:
# importing query terms with lemmas
with open('/Users/anesterov/reps/LODlit/query_terms.json','r') as jf:
    query_terms = json.load(jf)

In [13]:
def get_lemma_by_term(query_term:str, lang:str) -> str:
    '''
    Getting a lemma of a query term
    lang: str, 'en' or 'nl'
    Returns str, 'not found' if lemma was not found
    '''
    
    return_lemma = 'not found'
    
    # importing query terms with lemmas
    # change path to GitHub
    
    with open('/Users/anesterov/reps/LODlit/query_terms.json','r') as jf:
        query_terms = json.load(jf)
        
    for lemma, qt in query_terms[lang].items():
        if query_term in qt:
            return_lemma = lemma
            
    return return_lemma

### How many query terms are associated with related matches?

In [14]:
# importing bg info
with open('/Users/anesterov/reps/LODlit/bg/background_info_bows.json','r') as jf:
    bg_info = json.load(jf)

In [15]:
# EN 
no_rm = []
for query_term, bows in bg_info["en"].items():
    if bows.get("wm") != None and len(bows) == 1:
        no_rm.append(query_term)

In [16]:
len(no_rm)

11

In [19]:
set([get_lemma_by_term(term,'en') for term in no_rm])

{'baboo',
 'bush negro',
 'full blood',
 'half-blood',
 'low-income countries',
 'roots'}

In [20]:
# NL 
no_rm_nl = []
for query_term, bows in bg_info["nl"].items():
    if bows.get("wm") != None and len(bows) == 1:
        no_rm_nl.append(query_term)

In [21]:
len(no_rm_nl)

24

In [22]:
set([get_lemma_by_term(term,'nl') for term in no_rm_nl])

{'exotisch', 'oriëntaals', 'roots', 'traditioneel', 'volbloed'}

### Importing search results and generating a subset with related matches for every resource

In [61]:
def generate_rm_subset(resource_data:dict, rm_data:dict, resource:str, lang:str) -> dict:
    '''
    Retrieves info about entities that are related matches
    resource_data: dict, search results in a resource
    rm_data: dict, info about related matches ('rm.json')
    resource: str, 'wikidata', 'aat', 'pwn', 'odwn'
    lang: str, 'en' or 'nl'
    Returns dict 
    '''
    results = {}

    for query_term, value in resource_data.items():
        
        hits_by_query_term = []
    
        for rm_info in rm_data.values():
            
            if rm_info['lang'] == lang and query_term in rm_info['query_terms']:
                for hit in value:
                    
                    # checking resource
                    
                    if resource == 'wikidata':
                        
                        if hit.get('QID') == rm_info['related_matches']['wikidata'][0]:
                            hits_by_query_term.append(hit)
                            
                    if resource == 'aat':
                        
                        if hit.get('aat_uri') == rm_info['related_matches']['aat'][0]:
                            hits_by_query_term.append(hit)
                            
                    if resource == 'pwn':
                        
                        if hit.get('synset_id') in rm_info['related_matches']['pwn']:
                            hits_by_query_term.append(hit)
                            
                    # ODWN is handled separately
                            
        results[query_term] = hits_by_query_term
        
    return results

#### Wikidata EN

In [11]:
with open('/Users/anesterov/LODlit_local/wd/jan31/results_clean_en.json','r') as jf:
    wd_en = json.load(jf)

In [62]:
rm_wd_en = generate_rm_subset(wd_en,rms,"wikidata","en")

In [65]:
with open('/Users/anesterov/reps/LODlit/rm/rm_wd_en.json', 'w') as jf:
    json.dump(rm_wd_en, jf)

#### Wikidata NL

In [52]:
with open('/Users/anesterov/LODlit_local/wd/jan31/results_clean_nl.json','r') as jf:
    wd_nl = json.load(jf)

In [66]:
rm_wd_nl = generate_rm_subset(wd_nl,rms,"wikidata","nl")

In [67]:
with open('/Users/anesterov/reps/LODlit/rm/rm_wd_nl.json', 'w') as jf:
    json.dump(rm_wd_nl, jf)

#### AAT EN

In [55]:
with open('/Users/anesterov/reps/LODlit/AAT/aat_query_results_en.json','r') as jf:
    aat_en = json.load(jf)

In [68]:
rm_aat_en = generate_rm_subset(aat_en,rms,"aat","en")

In [69]:
with open('/Users/anesterov/reps/LODlit/rm/rm_aat_en.json', 'w') as jf:
    json.dump(rm_aat_en, jf)

#### AAT NL

In [70]:
with open('/Users/anesterov/reps/LODlit/AAT/aat_query_results_nl.json','r') as jf:
    aat_nl = json.load(jf)

In [71]:
rm_aat_nl = generate_rm_subset(aat_nl,rms,"aat","nl")

In [72]:
with open('/Users/anesterov/reps/LODlit/rm/rm_aat_nl.json', 'w') as jf:
    json.dump(rm_aat_nl, jf)

#### PWN

In [73]:
with open('/Users/anesterov/reps/LODlit/PWN/pwn31_query_results.json','r') as jf:
    pwn = json.load(jf)

In [74]:
rm_pwn = generate_rm_subset(pwn,rms,"pwn","en")

In [75]:
with open('/Users/anesterov/reps/LODlit/rm/rm_pwn.json', 'w') as jf:
    json.dump(rm_pwn, jf)

#### ODWN

In [76]:
with open('/Users/anesterov/reps/LODlit/ODWN/odwn_query_results.json','r') as jf:
    odwn = json.load(jf)

In [79]:
rm_odwn = {}

for query_term, value in odwn.items():
    
    hits_by_query_term = []
    
    for rm_info in rms_odwn.values():
        if query_term in rm_info['query_terms']:
            
            for hit in value:
                # all related matches in ODWN have LE_id
                if hit.get('le_id') in rm_info['odwn_le']:
                    hits_by_query_term.append(hit)
                    
    rm_odwn[query_term] = hits_by_query_term

In [81]:
with open('/Users/anesterov/reps/LODlit/rm/rm_odwn.json', 'w') as jf:
    json.dump(rm_odwn, jf)

### N unique related matches per resource

In [89]:
quids = []

for rm_info in rm_wd_en.values():
    quids.extend([hit['QID'] for hit in rm_info])
    
print("Wikidata EN:",len(set(quids)))

Wikidata EN: 58


In [90]:
quids = []

for rm_info in rm_wd_nl.values():
    quids.extend([hit['QID'] for hit in rm_info])
    
print("Wikidata NL:",len(set(quids)))

Wikidata NL: 63


In [92]:
aat_uris = []

for rm_info in rm_aat_en.values():
    aat_uris.extend([hit['aat_uri'] for hit in rm_info])
    
print("AAT EN:",len(set(aat_uris)))

AAT EN: 37


In [93]:
aat_uris = []

for rm_info in rm_aat_nl.values():
    aat_uris.extend([hit['aat_uri'] for hit in rm_info])
    
print("AAT NL:",len(set(aat_uris)))

AAT NL: 27


In [95]:
pwn_synsets = []

for rm_info in rm_pwn.values():
    pwn_synsets.extend([hit['synset_id'] for hit in rm_info])
    
print("PWN:",len(set(pwn_synsets)))

PWN: 81


In [99]:
odwn_le = []

for rm_info in rm_odwn.values():
    odwn_le.extend([hit['le_id'] for hit in rm_info])
    
print("ODWN:",len(set(odwn_le)))

ODWN: 65


### Overview by properties

In [143]:
import statistics
from statistics import Counter

In [146]:
all_occurences = []
for query_term, rm_info in rm_wd_en.items():
    for hit in rm_info:
        all_occurences.append(hit["found_in"])

In [147]:
Counter(all_occurences)

Counter({'prefLabel': 44, 'aliases': 67, 'description': 4})

In [148]:
total_hits = 0
for query_term, rm_info in rm_wd_en.items():
    total_hits += len(rm_info)

In [149]:
total_hits

115

In [150]:
for query_term, rm_info in rm_wd_en.items():
    for hit in rm_info:
        if hit["found_in"] == 'description':
            print(query_term)

descent
homosexual
indo
primitive


In [151]:
# NL wikidata
all_occurences_nl = []
for query_term, rm_info in rm_wd_nl.items():
    for hit in rm_info:
        all_occurences_nl.append(hit["found_in"])

In [152]:
Counter(all_occurences_nl)

Counter({'prefLabel': 51, 'aliases': 53})

In [153]:
# EN AAT
all_occurences_aat_en = []
for query_term, rm_info in rm_aat_en.items():
    for hit in rm_info:
        all_occurences_aat_en.append(hit["found_in"])
Counter(all_occurences_aat_en)

Counter({'prefLabel': 23, 'altLabel': 60, 'scopeNote': 10})

In [154]:
len(all_occurences_aat_en)

93

In [155]:
# NL AAT
all_occurences_aat_nl = []
for query_term, rm_info in rm_aat_nl.items():
    for hit in rm_info:
        all_occurences_aat_nl.append(hit["found_in"])
Counter(all_occurences_aat_nl)

Counter({'prefLabel': 25, 'scopeNote': 6, 'altLabel': 11})

In [156]:
len(all_occurences_aat_nl)

42

In [157]:
# PWN
all_occurences_pwn = []
for query_term, rm_info in rm_pwn.items():
    for hit in rm_info:
        all_occurences_pwn.append(hit["found_in"])
Counter(all_occurences_pwn)

Counter({'lemmata': 90, 'definition': 11, 'examples': 36})

In [158]:
len(all_occurences_pwn)

137

In [159]:
# ODWN
all_occurences_odwn = []
for query_term, rm_info in rm_odwn.items():
    for hit in rm_info:
        all_occurences_odwn.append(hit["found_in"])
Counter(all_occurences_odwn)

Counter({'le': 66, 'sense_examples': 38, 'sense_definition': 2})

In [160]:
len(all_occurences_odwn)

106

In [166]:
sum(Counter(all_occurences_odwn).values())

106

In [167]:
# make an overview for all resources
# EN
with open('rm_stats_en.csv','w') as csv_file:
    
    writer = csv.writer(csv_file)
    header = ["query_term","lang","wd_pref","wd_aliases","wd_descr","wd_total",\
              "aat_pref","aat_alt","aat_scopeNote","aat_total",\
              "pwn_le","pwn_def","pwn_examples","pwn_total","total_hits"]
    writer.writerow(header)
    
    for query_term in rm_wd_en.keys():
        
        # wikidata
        wd_hits = Counter([hit["found_in"] for hit in rm_wd_en[query_term]])
        wd_pref = wd_hits.get('prefLabel')
        wd_aliases = wd_hits.get('aliases')
        wd_descr = wd_hits.get('description')
        wd_total = sum(Counter(wd_hits).values())
        
        # aat
        aat_hits = Counter([hit["found_in"] for hit in rm_aat_en[query_term]])
        aat_pref = aat_hits.get('prefLabel')
        aat_alt = aat_hits.get('altLabel')
        aat_scopeNote = aat_hits.get('scopeNote')
        aat_total = sum(Counter(aat_hits).values())
        
        # pwn
        pwn_hits = Counter([hit["found_in"] for hit in rm_pwn[query_term]])
        pwn_le = pwn_hits.get('lemmata')
        pwn_def = pwn_hits.get('definition')
        pwn_examples = pwn_hits.get('examples')
        pwn_total = sum(Counter(pwn_hits).values())
        
        total_hits = wd_total + aat_total + pwn_total
        
        data = [query_term,"en",wd_pref,wd_aliases,wd_descr,wd_total,aat_pref,aat_alt,aat_scopeNote,aat_total,\
               pwn_le,pwn_def,pwn_examples,pwn_total,total_hits]
        
        writer.writerow(data)

In [174]:
# which terms and lemmas have no hits
# EN

no_hits = []

for query_term in rm_wd_en.keys():
        
    # wikidata
    wd_hits = Counter([hit["found_in"] for hit in rm_wd_en[query_term]])
    wd_pref = wd_hits.get('prefLabel')
    wd_aliases = wd_hits.get('aliases')
    wd_descr = wd_hits.get('description')
    wd_total = sum(Counter(wd_hits).values())

    # aat
    aat_hits = Counter([hit["found_in"] for hit in rm_aat_en[query_term]])
    aat_pref = aat_hits.get('prefLabel')
    aat_alt = aat_hits.get('altLabel')
    aat_scopeNote = aat_hits.get('scopeNote')
    aat_total = sum(Counter(aat_hits).values())

    # pwn
    pwn_hits = Counter([hit["found_in"] for hit in rm_pwn[query_term]])
    pwn_le = pwn_hits.get('lemmata')
    pwn_def = pwn_hits.get('definition')
    pwn_examples = pwn_hits.get('examples')
    pwn_total = sum(Counter(pwn_hits).values())

    total_hits = wd_total + aat_total + pwn_total

    if total_hits == 0:
        no_hits.append(query_term)

In [175]:
len(no_hits)

60

In [176]:
for lemma, wordforms in query_terms['en'].items():
    k = 0
    for w in wordforms:
        if w in no_hits:
            k += 1
            
    if k == len(wordforms):
        print(lemma)

low-income countries
baboo
bush negro
roots
half-blood
full blood


In [168]:
# NL
with open('rm_stats_nl.csv','w') as csv_file:
    
    writer = csv.writer(csv_file)
    header = ["query_term","lang","wd_pref","wd_aliases","wd_descr","wd_total",\
              "aat_pref","aat_alt","aat_scopeNote","aat_total",\
              "odwn_le","odwn_sense_examples","odwn_sense_definition","odwn_total","total_hits"]
    writer.writerow(header)
    
    for query_term in rm_wd_nl.keys():
        
        # wikidata
        wd_hits = Counter([hit["found_in"] for hit in rm_wd_nl[query_term]])
        wd_pref = wd_hits.get('prefLabel')
        wd_aliases = wd_hits.get('aliases')
        wd_descr = wd_hits.get('description')
        wd_total = sum(Counter(wd_hits).values())
        
        # aat
        aat_hits = Counter([hit["found_in"] for hit in rm_aat_nl[query_term]])
        aat_pref = aat_hits.get('prefLabel')
        aat_alt = aat_hits.get('altLabel')
        aat_scopeNote = aat_hits.get('scopeNote')
        aat_total = sum(Counter(aat_hits).values())
        
        # pwn
        odwn_hits = Counter([hit["found_in"] for hit in rm_odwn[query_term]])
        odwn_le = odwn_hits.get('le')
        odwn_sense_ex = odwn_hits.get('sense_examples')
        odwn_sense_def = odwn_hits.get('sense_definition')
        odwn_total = sum(Counter(odwn_hits).values())
        
        total_hits = wd_total + aat_total + odwn_total
        
        data = [query_term,"nl",wd_pref,wd_aliases,wd_descr,wd_total,aat_pref,aat_alt,aat_scopeNote,aat_total,\
               odwn_le,odwn_sense_ex,odwn_sense_def,odwn_total,total_hits]
        
        writer.writerow(data)

In [169]:
# which terms and lemmas have no hits
# NL
no_hits = []

for query_term in rm_wd_nl.keys():
        
    # wikidata
    wd_hits = Counter([hit["found_in"] for hit in rm_wd_nl[query_term]])
    wd_pref = wd_hits.get('prefLabel')
    wd_aliases = wd_hits.get('aliases')
    wd_descr = wd_hits.get('description')
    wd_total = sum(Counter(wd_hits).values())

    # aat
    aat_hits = Counter([hit["found_in"] for hit in rm_aat_nl[query_term]])
    aat_pref = aat_hits.get('prefLabel')
    aat_alt = aat_hits.get('altLabel')
    aat_scopeNote = aat_hits.get('scopeNote')
    aat_total = sum(Counter(aat_hits).values())

    # pwn
    odwn_hits = Counter([hit["found_in"] for hit in rm_odwn[query_term]])
    odwn_le = odwn_hits.get('le')
    odwn_sense_ex = odwn_hits.get('sense_examples')
    odwn_sense_def = odwn_hits.get('sense_definition')
    odwn_total = sum(Counter(odwn_hits).values())

    total_hits = wd_total + aat_total + odwn_total
    
    if total_hits == 0:
        no_hits.append(query_term)

In [170]:
len(no_hits)

125

In [173]:
for lemma, wordforms in query_terms['nl'].items():
    k = 0
    for w in wordforms:
        if w in no_hits:
            k += 1
            
    if k == len(wordforms):
        print(lemma)

exotisch
oriëntaals
traditioneel
roots
volbloed


### Overview by lemma

#### EN

In [4]:
stats_en = pd.read_csv('/Users/anesterov/reps/LODlit/rm/rm_stats_en.csv')

In [6]:
stats_en.fillna(0, inplace=True)

In [7]:
stats_en

Unnamed: 0,query_term,lang,wd_pref,wd_aliases,wd_descr,wd_total,aat_pref,aat_alt,aat_scopeNote,aat_total,pwn_le,pwn_def,pwn_examples,pwn_total,total_hits
0,batavias,en,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
1,batavia,en,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1
2,southern rhodesia,en,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1
3,madras,en,0.0,1.0,0.0,1,0.0,0.0,0.0,0,1.0,1.0,0.0,2,3
4,low-income country,en,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149,full blood,en,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
150,metis,en,0.0,1.0,0.0,1,1.0,0.0,1.0,2,0.0,0.0,0.0,0,3
151,métis,en,1.0,0.0,0.0,1,0.0,1.0,0.0,1,0.0,0.0,0.0,0,2
152,tribes,en,0.0,0.0,0.0,0,1.0,0.0,0.0,1,0.0,0.0,0.0,0,1


In [185]:
# get the lemmas column for each query term EN
lemmas = []
for row in stats_en.iterrows():
    for lemma, wordforms in query_terms['en'].items():
        if row[1]['query_term'] in wordforms:
            lemmas.append(lemma)

In [188]:
stats_en.insert(0,"lemma",lemmas)

In [197]:
# lemmas EN

with open('rm_stats_by_lemma_en.csv','w') as csv_file:
    
    writer = csv.writer(csv_file)
    header = ["lemma","lang","wd_pref","wd_aliases","wd_descr","wd_total",\
              "aat_pref","aat_alt","aat_scopeNote","aat_total",\
              "pwn_le","pwn_def","pwn_examples","pwn_total","total_hits"]
    writer.writerow(header)
    
    
    for lemma_group in stats_en.groupby("lemma"):
        data = [lemma_group[0],"en",sum(lemma_group[1]['wd_pref']),sum(lemma_group[1]['wd_aliases']),\
              sum(lemma_group[1]['wd_descr']),sum(lemma_group[1]['wd_total']),\
              sum(lemma_group[1]['aat_pref']),sum(lemma_group[1]['aat_alt']),\
              sum(lemma_group[1]['aat_scopeNote']),sum(lemma_group[1]['aat_total']),\
              sum(lemma_group[1]['pwn_le']),sum(lemma_group[1]['pwn_def']),\
              sum(lemma_group[1]['pwn_examples']),sum(lemma_group[1]['pwn_total']),\
              sum(lemma_group[1]['total_hits'])]
        
        writer.writerow(data)

#### NL

In [198]:
stats_nl = pd.read_csv('/Users/anesterov/reps/LODlit/rm/rm_stats_nl.csv')

In [199]:
stats_nl.fillna(0, inplace=True)

In [200]:
# get the lemmas column for each query term EN
lemmas = []
for row in stats_nl.iterrows():
    for lemma, wordforms in query_terms['nl'].items():
        if row[1]['query_term'] in wordforms:
            lemmas.append(lemma)

In [201]:
stats_nl.insert(0,"lemma",lemmas)

In [202]:
stats_nl

Unnamed: 0,lemma,query_term,lang,wd_pref,wd_aliases,wd_descr,wd_total,aat_pref,aat_alt,aat_scopeNote,aat_total,odwn_le,odwn_sense_examples,odwn_sense_definition,odwn_total,total_hits
0,politionele actie,politionele acties,nl,1.0,0.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1
1,politionele actie,politionele actie,nl,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
2,birma,birma,nl,0.0,1.0,0.0,1,0.0,0.0,0.0,0,1.0,0.0,0.0,1,2
3,exotisch,exotische,nl,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
4,exotisch,exotischere,nl,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,inheems,inheemst,nl,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
238,inheems,inheemser,nl,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0
239,inheems,inheems,nl,0.0,1.0,0.0,1,0.0,0.0,0.0,0,0.0,0.0,0.0,0,1
240,trans,transen,nl,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0


In [203]:
# lemmas NL

with open('rm_stats_by_lemma_nl.csv','w') as csv_file:
    
    writer = csv.writer(csv_file)
    header = ["lemma","lang","wd_pref","wd_aliases","wd_descr","wd_total",\
              "aat_pref","aat_alt","aat_scopeNote","aat_total",\
              "odwn_le","odwn_sense_examples","odwn_sense_definition","odwn_total","total_hits"]
    writer.writerow(header)
    
    
    for lemma_group in stats_nl.groupby("lemma"):
        data = [lemma_group[0],"nl",sum(lemma_group[1]['wd_pref']),sum(lemma_group[1]['wd_aliases']),\
              sum(lemma_group[1]['wd_descr']),sum(lemma_group[1]['wd_total']),\
              sum(lemma_group[1]['aat_pref']),sum(lemma_group[1]['aat_alt']),\
              sum(lemma_group[1]['aat_scopeNote']),sum(lemma_group[1]['aat_total']),\
              sum(lemma_group[1]['odwn_le']),sum(lemma_group[1]['odwn_sense_examples']),\
              sum(lemma_group[1]['odwn_sense_definition']),sum(lemma_group[1]['odwn_total']),\
              sum(lemma_group[1]['total_hits'])]
        
        writer.writerow(data)

In [None]:
### Generate a csv with related matches
# group by lemmas

In [244]:
with open('rm_entities_all.csv','w') as csv_file:
    
    writer = csv.writer(csv_file)
    header = ["lemma","lang","resource","entity_id"]
    writer.writerow(header)
        
    # EN wikidata
    for query_term, rms in rm_wd_en.items():
        for rm in rms:
            data = [get_lemma_by_term(query_term,'en'),'en',"wikidata",rm['QID']]
            writer.writerow(data)
            
    # NL wikidata
    for query_term, rms in rm_wd_nl.items():
        for rm in rms:
            data = [get_lemma_by_term(query_term,'nl'),'nl',"wikidata",rm['QID']]
            writer.writerow(data)
            
    # EN AAT
    for query_term, rms in rm_aat_en.items():
        for rm in rms:
            data = [get_lemma_by_term(query_term,'en'),'en',"aat",rm['aat_uri']]
            writer.writerow(data)
    
    # NL AAT
    for query_term, rms in rm_aat_nl.items():
        for rm in rms:
            data = [get_lemma_by_term(query_term,'nl'),'nl',"aat",rm['aat_uri']]
            writer.writerow(data)
            
    # PWN
    for query_term, rms in rm_pwn.items():
        for rm in rms:
            data = [get_lemma_by_term(query_term,'en'),'en',"pwn",rm['synset_id']] 
            writer.writerow(data)
            
    # ODWN
    for query_term, rms in rm_odwn.items():
        for rm in rms:
            if rm['synset_id'] == '':
                data = [get_lemma_by_term(query_term,'nl'),'nl',"odwn",rm['sense_id']]
                writer.writerow(data)
            else:
                data = [get_lemma_by_term(query_term,'nl'),'nl',"odwn",rm['synset_id']]
                writer.writerow(data)


In [245]:
all_entities = pd.read_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_all.csv")

In [246]:
all_entities.drop_duplicates(inplace=True)

In [247]:
all_entities.to_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv")

In [2]:
rm_entities = pd.read_csv("/Users/anesterov/reps/LODlit/rm/rm_entities_unique.csv")

In [4]:
entities_wd = rm_entities[rm_entities["resource"] == "wikidata"]

In [10]:
entities_aat = rm_entities[rm_entities["resource"] == "aat"]

In [22]:
entities_aat_nl = rm_entities[rm_entities["resource"] == "aat"][rm_entities["lang"] == "nl"]

  entities_aat_nl = rm_entities[rm_entities["resource"] == "aat"][rm_entities["lang"] == "nl"]


In [23]:
len(set(entities_aat_nl["entity_id"]))

27

In [9]:
len(set(list(entities_wd["entity_id"])))

79

In [11]:
len(set(list(entities_aat["entity_id"])))

41