In [None]:
import json
import pandas as pd
from LODlit import bows, aat, wd, pwn31, odwn

### Collecting BoWs with background info for every term
The background info consists of:
* Literals of related matches (resources from the knowledge graph)
* Words Matter text

This notebook generates the following files:
- (file names: {rm} = "related matches", {bows} = "bags of words", {dataset}, {language suffix})
- (1) rm_bows_wikidata_en.json
- (2) rm_bows_wikidata_nl.json
- (3) rm_bows_aat_en.json
- (4) rm_bows_aat_nl.json
- (5) rm_bows_pwn.json -- (PWN results are only in EN)
- (6) rm_bows_odwn.json -- (ODWN results are only in NL)
- (7) background_info_bows.json -- a joint file with all bows per query term

#### 1. Getting literals of related matches
(Wikidata, AAT, PWN, ODWN)

#### Wikidata

In [None]:
# EN
# Asks LODlit's Wikidata module for the related-match bags of words, passing
# the language code "en". Later cells index the result as result[term]["bow"].
# NOTE(review): presumably this hits external data and may be slow — confirm.
rm_wikidata_en = wd.get_lit_related_matches_bow("en")

In [None]:
# Persist the EN Wikidata related-match BoWs as JSON in the working directory
with open('rm_bows_wikidata_en.json', mode='w') as out_file:
    json.dump(rm_wikidata_en, fp=out_file)

In [None]:
# NL
# Same Wikidata call as for EN, but with the language code "nl".
# Later cells index the result as result[term]["bow"].
rm_wikidata_nl = wd.get_lit_related_matches_bow("nl")

In [None]:
# Persist the NL Wikidata related-match BoWs as JSON in the working directory
with open('rm_bows_wikidata_nl.json', mode='w') as out_file:
    json.dump(rm_wikidata_nl, fp=out_file)

#### AAT

In [None]:
# EN
# Asks LODlit's AAT module for the related-match bags of words, passing the
# language code "en". Later cells index the result as result[term]["bow"].
rm_aat_en = aat.get_lit_related_matches_bow("en")

In [None]:
# Persist the EN AAT related-match BoWs as JSON in the working directory
with open('rm_bows_aat_en.json', mode='w') as out_file:
    json.dump(rm_aat_en, fp=out_file)

In [None]:
# NL
# Same AAT call as for EN, but with the language code "nl".
# Later cells index the result as result[term]["bow"].
rm_aat_nl = aat.get_lit_related_matches_bow("nl")

In [None]:
# Persist the NL AAT related-match BoWs as JSON in the working directory
with open('rm_bows_aat_nl.json', mode='w') as out_file:
    json.dump(rm_aat_nl, fp=out_file)

#### PWN

In [None]:
# PWN takes no language argument (the notebook intro notes PWN is EN only).
# Later cells store result[term] directly, without a nested "bow" key.
rm_pwn = pwn31.get_lit_related_matches_bow()

In [None]:
# Persist the PWN related-match BoWs as JSON in the working directory
with open('rm_bows_pwn.json', mode='w') as out_file:
    json.dump(rm_pwn, fp=out_file)

#### ODWN

In [None]:
# ODWN takes no language argument (the notebook intro notes ODWN is NL only).
# Later cells store result[term] directly, without a nested "bow" key.
rm_odwn = odwn.get_lit_related_matches_bow()

In [None]:
# Persist the ODWN related-match BoWs as JSON in the working directory
with open('rm_bows_odwn.json', mode='w') as out_file:
    json.dump(rm_odwn, fp=out_file)

#### Making a common file with all related matches and Words Matter BoWs

In [None]:
# Reload every per-dataset related-match BoW file from the working directory
def _load_json(path):
    """Return the parsed content of the JSON file at `path`."""
    with open(path,'r') as jf:
        return json.load(jf)

# Wikidata (EN / NL)
rm_wikidata_en = _load_json('rm_bows_wikidata_en.json')
rm_wikidata_nl = _load_json('rm_bows_wikidata_nl.json')

# AAT (EN / NL)
rm_aat_en = _load_json('rm_bows_aat_en.json')
rm_aat_nl = _load_json('rm_bows_aat_nl.json')

# PWN and ODWN (one file each)
rm_pwn = _load_json('rm_bows_pwn.json')
rm_odwn = _load_json('rm_bows_odwn.json')

In [None]:
# Words Matter bags of words, one file per language.
# Later cells read them as wm_bows[lemma]["bow_tf_idf"].
with open('en_wm_bows_tf_idf.json', mode='r') as en_file:
    en_wm_bows = json.load(en_file)

with open('nl_wm_bows_tf_idf.json', mode='r') as nl_file:
    nl_wm_bows = json.load(nl_file)

In [None]:
# Load the query terms; later cells read query_terms["en"] and query_terms["nl"]
# as mappings whose values are collections of word forms.
# NOTE(review): the path is absolute (filesystem root) — confirm it matches the
# environment this notebook is meant to run in.
with open('/LODlit/query_terms.json', mode='r') as terms_file:
    query_terms = json.load(terms_file)

In [None]:
# Flatten all EN word-form lists into one list of terms (order and duplicates kept)
query_terms_en = [
    wordform
    for wordforms in query_terms["en"].values()
    for wordform in wordforms
]

In [None]:
# Flatten all NL word-form lists into one list of terms (order and duplicates kept)
query_terms_nl = [
    wordform
    for wordforms in query_terms["nl"].values()
    for wordform in wordforms
]

In [None]:
# shaping a common file with background info
# {lang:{term:{"wikidata":[], "aat":[], "pwn":[] (EN) / "odwn":[] (NL), "wm":[]}}
# A dataset key is only present when that dataset has an entry for the term.

def _collect_background_info(terms, wordforms_by_lemma, wm_bows, related_matches):
    """Build the per-term background-info dict for one language.

    Args:
        terms: iterable of query terms to create entries for.
        wordforms_by_lemma: mapping lemma -> collection of its word forms.
        wm_bows: mapping lemma -> dict that holds a "bow_tf_idf" entry.
        related_matches: mapping dataset name -> (bows_by_term, bow_key).
            If bow_key is None, bows_by_term[term] is stored as is;
            otherwise bows_by_term[term][bow_key] is stored.

    Returns:
        dict mapping every term to a dict with one key per dataset that has
        an entry for the term, plus "wm" when a lemma lists the term among
        its word forms.

    Raises:
        KeyError: if a lemma whose word forms contain the term is missing
            from wm_bows, or lacks the "bow_tf_idf" key.
    """
    info_by_term = {}
    for term in terms:
        term_info = {}

        # Related matches, in the insertion order of `related_matches`
        for dataset, (bows_by_term, bow_key) in related_matches.items():
            entry = bows_by_term.get(term)
            if entry is not None:
                term_info[dataset] = entry if bow_key is None else entry[bow_key]

        # WM bows: if several lemmas list the term, the last one wins
        for lemma, wordforms in wordforms_by_lemma.items():
            if term in wordforms:
                term_info["wm"] = wm_bows[lemma]["bow_tf_idf"]

        # A repeated term overwrites its entry but keeps its first position
        info_by_term[term] = term_info
    return info_by_term


all_rm = {
    "en": _collect_background_info(
        query_terms_en,
        query_terms["en"],
        en_wm_bows,
        {
            "wikidata": (rm_wikidata_en, "bow"),
            "aat": (rm_aat_en, "bow"),
            "pwn": (rm_pwn, None),
        },
    ),
    "nl": _collect_background_info(
        query_terms_nl,
        query_terms["nl"],
        nl_wm_bows,
        {
            "wikidata": (rm_wikidata_nl, "bow"),
            "aat": (rm_aat_nl, "bow"),
            "odwn": (rm_odwn, None),
        },
    ),
}

In [None]:
# Persist the joint per-language, per-term background info as JSON
with open('background_info_bows.json', mode='w') as out_file:
    json.dump(all_rm, fp=out_file)

#### Checking how many terms have background info from at least one dataset besides the Words Matter text

In [None]:
# Load the joint background-info file back from the working directory
with open("background_info_bows.json", mode="r") as bg_file:
    bg_info = json.load(bg_file)

In [None]:
# Counts over the NL entries only.
# A term is "WM only" when "wm" is its single key; every other term
# (including one with no keys at all) falls into the `has_rm` bucket.
n_term_only_wm = sum(
    1 for bg in bg_info["nl"].values() if len(bg) == 1 and "wm" in bg
)
has_rm = len(bg_info["nl"]) - n_term_only_wm

In [None]:
# The shared counters `has_rm` / `n_term_only_wm` are computed over
# bg_info["nl"], so the EN summary derives its own counts from bg_info["en"].
en_bg = bg_info["en"]
en_only_wm = sum(1 for bg in en_bg.values() if len(bg) == 1 and "wm" in bg)
en_has_rm = len(en_bg) - en_only_wm  # every term that is not WM-only
print(f"EN total terms: {len(en_bg)}, has rm: {en_has_rm}, has wm only: {en_only_wm}")

In [None]:
# NL summary; relies on `has_rm` / `n_term_only_wm` computed over bg_info["nl"]
nl_total = len(bg_info['nl'])
print(f"NL total terms: {nl_total}, has rm: {has_rm}, has wm only: {n_term_only_wm}")