### Generates two CSV files with background information for EN and NL terms:
"bg_en.csv" and "bg_nl.csv"

In [None]:
import json
import csv

In [None]:
#1. collect query terms
#2. collect lemmas
#3. get rm lit
#4. get wm text
#5. export csv

In [None]:
# parsed contents of the query terms file, filled on first use so the file is
# read and parsed once instead of once per looked-up term
_QUERY_TERMS_CACHE = {}


def _load_query_terms() -> dict:
    '''
    Loading query terms with lemmas
    The file is read from disk only on the first call; later calls reuse the
    parsed result (re-run this cell to pick up changes made to the file)
    Returns dict parsed from the JSON file
    '''
    
    # change path to GitHub
    path = '/query_terms.json'
    
    if path not in _QUERY_TERMS_CACHE:
        # explicit utf-8: JSON text is UTF-8, the locale default may not be
        with open(path,'r',encoding='utf-8') as jf:
            _QUERY_TERMS_CACHE[path] = json.load(jf)
            
    return _QUERY_TERMS_CACHE[path]


def _get_lemma_by_term(query_term:str, lang:str, query_terms:dict=None) -> str:
    '''
    Getting a lemma of a query term
    query_term: str, the term to look up
    lang: str, 'en' or 'nl'
    query_terms: dict, optional preloaded mapping {lang: {lemma: query terms}};
                 if None (default), it is loaded from the query terms file
    Returns str, 'not found' if lemma was not found
    Raises KeyError if lang is not a key of the mapping
    '''
    
    if query_terms is None:
        query_terms = _load_query_terms()
    
    return_lemma = 'not found'
    
    # no early exit on purpose: if a term is listed under several lemmas,
    # the last matching lemma is returned (unchanged behaviour)
    for lemma, qt in query_terms[lang].items():
        if query_term in qt:
            return_lemma = lemma
            
    return return_lemma

In [None]:
# importing subsets EN
# the encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding
with open('/Wikidata/wd_en_subset.json','r',encoding='utf-8') as jf:
    wd_en_subset = json.load(jf)
with open('/AAT/aat_en_subset.json','r',encoding='utf-8') as jf:
    aat_en_subset = json.load(jf)
with open('/PWN/pwn_subset.json','r',encoding='utf-8') as jf:
    pwn_subset = json.load(jf)

In [None]:
# all unique query terms
# each subset is unpacked by iteration (for a dict this yields its keys)
# presumably the subsets are keyed by query term; verify against the files
qt_en = [*wd_en_subset, *aat_en_subset, *pwn_subset]
# duplicates are dropped through a set, so the resulting order is arbitrary
qt_en_u = list(set(qt_en))

In [None]:
# all unique lemmas
# NOTE(review): a term without a lemma contributes the 'not found' sentinel,
# which then becomes an entry of en_lemmas; confirm this is intended
en_lemmas = set()
for qt in qt_en_u:
    en_lemmas.add(_get_lemma_by_term(qt,"en"))
en_lemmas = list(en_lemmas)

In [None]:
# importing rm 
# the encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding
with open('/rm/rm_wd_en.json','r',encoding='utf-8') as jf:
    rm_wd_en = json.load(jf)
with open('/rm/rm_aat_en.json','r',encoding='utf-8') as jf:
    rm_aat_en = json.load(jf)
with open('/rm/rm_pwn.json','r',encoding='utf-8') as jf:
    rm_pwn = json.load(jf)

In [None]:
# collecting, per EN lemma, the unique literals of its hits in every source:
# lemma_lit_en[lemma] = {"source_1": [...], "source_2": [...], "source_3": [...]}
# a lemma missing from one of the rm dicts raises KeyError
lemma_lit_en = {}

for lemma in en_lemmas:
    
    source1 = []
    source2 = []
    source3 = []
    
    # source1: wikidata
    for hit in rm_wd_en[lemma]:
        source1.append(hit['prefLabel'])
        # aliases and description used to be added to an undefined name
        # ('lits'), which raised NameError; they belong to source1
        if hit["aliases"]:
            source1.extend(hit["aliases"])
        if hit["description"]:
            # a str is appended whole (extend would split it into single
            # characters); any other iterable is extended as before
            if isinstance(hit["description"], str):
                source1.append(hit["description"])
            else:
                source1.extend(hit["description"])
        source1.extend(hit['instance_of'])
        source1.extend(hit['subclass_of'])
    
    # source2: aat
    for hit in rm_aat_en[lemma]:
        source2.append(hit['prefLabel'])
        source2.extend(hit['altLabel'])
        source2.append(hit['scopeNote'])
        source2.append(hit['prefLabel_comment'])
        source2.extend(hit['altLabel_comment'])
    
    # source3: pwn
    for hit in rm_pwn[lemma]:
        source3.extend(hit['lemmata'])
        source3.append(hit['definition'])
        source3.extend(hit['examples'])
        
    # duplicates are dropped through sets, so the order of literals is arbitrary
    lemma_lit_en[lemma] = {"source_1":list(set(source1)),
                           "source_2":list(set(source2)),
                           "source_3":list(set(source3))}

In [None]:
# adding wm text
# the encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding
with open("/bg/en_wm_bows.json",'r',encoding='utf-8') as jf:
    wm_en = json.load(jf)

In [None]:
# every lemma gets a fourth entry, "wm", taken from the "wm_text" field of wm_en
# a lemma missing from wm_en raises KeyError
for lemma in lemma_lit_en:
    bg = lemma_lit_en[lemma]
    bg.update(wm=wm_en[lemma]["wm_text"])

In [None]:
# remove empty str
# only '' is dropped; other values (None included) are kept
for lemma, bg in lemma_lit_en.items():
    # keys are snapshotted first; values are replaced one by one
    for source in list(bg):
        text_list = bg[source]
        bg[source] = list(filter(lambda t: t != '', text_list))

In [None]:
# export a csv
# newline='' is what the csv module asks for on files it writes to (otherwise
# extra blank rows appear on platforms that translate line endings);
# utf-8 keeps non-ASCII text independent of the locale encoding
with open('bg_en.csv','w',newline='',encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ["term","source_1","source_2","source_3","wm"]
    writer.writerow(header)
    
    # one row per lemma; non-string cells are stringified by csv.writer,
    # so list values end up as their Python list representation
    for lemma, bg in lemma_lit_en.items():
        data = [lemma, bg["source_1"], bg["source_2"], bg["source_3"], bg["wm"]]
        writer.writerow(data)

### NL

In [None]:
# importing subsets NL
# the encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding
with open('/Wikidata/wd_nl_subset.json','r',encoding='utf-8') as jf:
    wd_nl_subset = json.load(jf)
with open('/AAT/aat_nl_subset.json','r',encoding='utf-8') as jf:
    aat_nl_subset = json.load(jf)
with open('/ODWN/odwn_subset.json','r',encoding='utf-8') as jf:
    odwn_subset = json.load(jf)

In [None]:
# all unique query terms
# each subset is unpacked by iteration (for a dict this yields its keys)
# presumably the subsets are keyed by query term; verify against the files
qt_nl = [*wd_nl_subset, *aat_nl_subset, *odwn_subset]
# duplicates are dropped through a set, so the resulting order is arbitrary
qt_nl_u = list(set(qt_nl))

In [None]:
# all unique lemmas
# NOTE(review): a term without a lemma contributes the 'not found' sentinel,
# which then becomes an entry of nl_lemmas; confirm this is intended
nl_lemmas = set()
for qt in qt_nl_u:
    nl_lemmas.add(_get_lemma_by_term(qt,"nl"))
nl_lemmas = list(nl_lemmas)

In [None]:
# importing rm 
# the encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding
with open('/rm/rm_wd_nl.json','r',encoding='utf-8') as jf:
    rm_wd_nl = json.load(jf)
with open('/rm/rm_aat_nl.json','r',encoding='utf-8') as jf:
    rm_aat_nl = json.load(jf)
with open('/rm/rm_odwn.json','r',encoding='utf-8') as jf:
    rm_odwn = json.load(jf)

In [None]:
# collecting, per NL lemma, the unique literals of its hits in every source:
# lemma_lit_nl[lemma] = {"source_1": [...], "source_2": [...], "source_3": [...]}
# a lemma missing from one of the rm dicts raises KeyError
lemma_lit_nl = {}

for lemma in nl_lemmas:
    
    source1 = []
    source2 = []
    source3 = []
    
    # source1: wikidata
    for hit in rm_wd_nl[lemma]:
        source1.append(hit['prefLabel'])
        # aliases and description used to be added to an undefined name
        # ('lits'), which raised NameError; they belong to source1
        if hit["aliases"]:
            source1.extend(hit["aliases"])
        if hit["description"]:
            # a str is appended whole (extend would split it into single
            # characters); any other iterable is extended as before
            if isinstance(hit["description"], str):
                source1.append(hit["description"])
            else:
                source1.extend(hit["description"])
        source1.extend(hit['instance_of'])
        source1.extend(hit['subclass_of'])
    
    # source2: aat
    for hit in rm_aat_nl[lemma]:
        source2.append(hit['prefLabel'])
        source2.extend(hit['altLabel'])
        source2.append(hit['scopeNote'])
        source2.append(hit['prefLabel_comment'])
        source2.extend(hit['altLabel_comment'])
    
    # source3: odwn
    for hit in rm_odwn[lemma]:
        source3.append(hit.get('le_written_form'))
        source3.append(hit.get('sense_definition'))
        # .get() returns None for a missing key and extend(None) raises
        # TypeError, so a missing or empty field is treated as "no items"
        source3.extend(hit.get('sense_examples') or [])
        source3.extend(hit.get('synonyms') or [])
        source3.extend(hit.get('synset_definitions') or [])
        
    # duplicates are dropped through sets, so the order of literals is arbitrary
    lemma_lit_nl[lemma] = {"source_1":list(set(source1)),
                           "source_2":list(set(source2)),
                           "source_3":list(set(source3))}

In [None]:
# adding wm text
# the encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding
with open("/bg/nl_wm_bows.json",'r',encoding='utf-8') as jf:
    wm_nl = json.load(jf)

In [None]:
# every lemma gets a fourth entry, "wm", taken from the "wm_text" field of wm_nl
# a lemma missing from wm_nl raises KeyError
for lemma in lemma_lit_nl:
    bg = lemma_lit_nl[lemma]
    bg.update(wm=wm_nl[lemma]["wm_text"])

In [None]:
# remove empty str
# only '' is dropped; other values (None included) are kept
for lemma, bg in lemma_lit_nl.items():
    # keys are snapshotted first; values are replaced one by one
    for source in list(bg):
        text_list = bg[source]
        bg[source] = list(filter(lambda t: t != '', text_list))

In [None]:
# export a csv
# newline='' is what the csv module asks for on files it writes to (otherwise
# extra blank rows appear on platforms that translate line endings);
# utf-8 keeps non-ASCII text independent of the locale encoding
with open('bg_nl.csv','w',newline='',encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ["term","source_1","source_2","source_3","wm"]
    writer.writerow(header)
    
    # one row per lemma; non-string cells are stringified by csv.writer,
    # so list values end up as their Python list representation
    for lemma, bg in lemma_lit_nl.items():
        data = [lemma, bg["source_1"], bg["source_2"], bg["source_3"], bg["wm"]]
        writer.writerow(data)