In [None]:
import json
import csv
import pandas as pd
from LODlit import wd

### Getting the number of entities by term
* Sending requests to Wikidata to retrieve the number of entities found for each term with:
    * no filtering
    * filtering with keywords
    * filtering with keywords and statements
* Generating 2 CSV files with (1) the number of entities by term (n_entities_by_term.csv), (2) the number of entities by term, grouped by canonical form (n_entities_by_lemma.csv)

In [None]:
# importing query terms
# JSON is UTF-8 by specification; decoding explicitly keeps any non-ASCII characters
# intact regardless of the platform's default locale encoding
with open("/LODlit/query_terms.json","r", encoding="utf-8") as jf:
    query_terms = json.load(jf)

In [None]:
# flatten the wordforms of every English lemma into one list of query terms
en_query_terms = [
    wordform
    for wordforms in query_terms['en'].values()
    for wordform in wordforms
]
len(en_query_terms)

In [None]:
# flatten the wordforms of every Dutch lemma into one list of query terms
nl_query_terms = [
    wordform
    for wordforms in query_terms['nl'].values()
    for wordform in wordforms
]
len(nl_query_terms)

In [None]:
# identification string handed to every wd.get_search_hits call in this notebook
# NOTE(review): presumably sent as the HTTP User-Agent header to Wikidata — confirm in LODlit
user_agent = "Bot getting N of search hits (nesterov@cwi.nl)"

In [None]:
# hits with no filter EN
# one wd.get_search_hits call per English query term, merged into a single dict
# (assumes each call returns a {term: n_hits} mapping — confirm in LODlit)
hits_en_no_filter = {}
for term in en_query_terms:
    hits_en_no_filter.update(wd.get_search_hits(term, 'en', user_agent))

In [None]:
# hits with no filter NL
# one wd.get_search_hits call per Dutch query term, merged into a single dict
# (assumes each call returns a {term: n_hits} mapping — confirm in LODlit)
hits_nl_no_filter = {}
for term in nl_query_terms:
    hits_nl_no_filter.update(wd.get_search_hits(term, 'nl', user_agent))

In [None]:
# hits with keywords EN
# keywords passed to wd.get_search_hits as its filter argument
# NOTE(review): presumably entities matching these keywords are excluded from the count — confirm in LODlit
keywords_en = ['scientific','scholarly','article']
hits_en_filter_keywords = {}
for term in en_query_terms:
    hits_en_filter_keywords.update(wd.get_search_hits(term, 'en', user_agent, keywords_en))

In [None]:
# hits with keywords NL
# keywords passed to wd.get_search_hits as its filter argument
# NOTE(review): presumably entities matching these keywords are excluded from the count — confirm in LODlit
keywords_nl = ['wetenschappelijk','artikel']
hits_nl_filter_keywords = {}
for term in nl_query_terms:
    hits_nl_filter_keywords.update(wd.get_search_hits(term, 'nl', user_agent, keywords_nl))

In [None]:
# importing statements to filter out
# JSON is UTF-8 by specification; decoding explicitly avoids depending on the
# platform's default locale encoding
with open("statements_filter.json","r", encoding="utf-8") as jf:
    statements_filter = json.load(jf)

In [None]:
# creating a list of tuples with statements to filter out
# every key of statements_filter is paired with both properties, P31 first and P279 second
statements = [
    (prop, s)
    for s in statements_filter.keys()
    for prop in ('P31', 'P279')
]

In [None]:
# hits with keywords and statements EN
# same per-term loop as the keyword-only run, with the statement tuples passed as an additional filter
hits_en_filter_keywords_statements = {}
for term in en_query_terms:
    hits_en_filter_keywords_statements.update(
        wd.get_search_hits(term, 'en', user_agent, keywords_en, statements)
    )

In [None]:
# hits with keywords and statements NL
# same per-term loop as the keyword-only run, with the statement tuples passed as an additional filter
hits_nl_filter_keywords_statements = {}
for term in nl_query_terms:
    hits_nl_filter_keywords_statements.update(
        wd.get_search_hits(term, 'nl', user_agent, keywords_nl, statements)
    )

In [None]:
# exporting a resulting csv
# newline='' lets the csv module control line endings (without it, blank rows appear on Windows).
# UTF-8 is set explicitly because this file is read back with pd.read_csv, which decodes as UTF-8 by default.
with open('n_entities_by_term.csv','w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['term', 'lang', 'e_no_filter', 'e_filter_keywords', 'e_filter_keywords_statements']
    writer.writerow(header)

    # (language code, query terms, hits without filter, hits with keywords, hits with keywords + statements)
    per_language = [
        ('en', en_query_terms, hits_en_no_filter, hits_en_filter_keywords, hits_en_filter_keywords_statements),
        ('nl', nl_query_terms, hits_nl_no_filter, hits_nl_filter_keywords, hits_nl_filter_keywords_statements),
    ]
    # English rows are written first, then Dutch rows, each in query-term order
    for lang, terms, no_filter, with_keywords, with_keywords_statements in per_language:
        for query_term in terms:
            row = [query_term, lang, no_filter[query_term], with_keywords[query_term],
                   with_keywords_statements[query_term]]
            writer.writerow(row)

In [None]:
# N entities by lemma

In [None]:
df = pd.read_csv('n_entities_by_term.csv')
# .copy() gives each language its own independent frame: en_df and nl_df get a column inserted
# and values assigned via .loc further down, which should not operate on a slice of df
# (that is what triggers SettingWithCopyWarning)
en_df = df.loc[df['lang'] == 'en'].copy()
nl_df = df.loc[df['lang'] == 'nl'].copy()

In [None]:
# add an empty 'lemma' column as the first column; the guard keeps the cell re-runnable,
# because DataFrame.insert raises ValueError when the column already exists
if 'lemma' not in en_df.columns:
    en_df.insert(0, 'lemma','')

In [None]:
# add an empty 'lemma' column as the first column; the guard keeps the cell re-runnable,
# because DataFrame.insert raises ValueError when the column already exists
if 'lemma' not in nl_df.columns:
    nl_df.insert(0, 'lemma','')

In [None]:
# EN
# Build a wordform -> lemma lookup once, then label all rows in a single vectorized pass.
# Lemmas are visited in dict order, so if a wordform is listed under several lemmas
# the last one wins.
en_term_to_lemma = {
    wordform: lemma
    for lemma, wordforms in query_terms['en'].items()
    for wordform in wordforms
}
# terms that belong to no lemma get an empty-string lemma
en_df['lemma'] = en_df['term'].map(en_term_to_lemma).fillna('')

In [None]:
# NL
# Build a wordform -> lemma lookup once, then label all rows in a single vectorized pass.
# Lemmas are visited in dict order, so if a wordform is listed under several lemmas
# the last one wins.
nl_term_to_lemma = {
    wordform: lemma
    for lemma, wordforms in query_terms['nl'].items()
    for wordform in wordforms
}
# terms that belong to no lemma get an empty-string lemma
nl_df['lemma'] = nl_df['term'].map(nl_term_to_lemma).fillna('')

In [None]:
# N entities by lemma
# newline='' lets the csv module control line endings (without it, blank rows appear on Windows);
# UTF-8 is set explicitly so the output does not depend on the platform's locale encoding.
with open('n_entities_by_lemma.csv','w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    count_columns = ['e_no_filter','e_filter_keywords','e_filter_keywords_statements']
    header = ['lemma','lang'] + count_columns
    writer.writerow(header)

    # English lemmas are written first, then Dutch lemmas
    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        # groupby yields (lemma, sub-frame) pairs sorted by lemma
        for lemma, lemma_rows in lang_df.groupby('lemma'):
            # per-lemma total of each count column over all of its wordforms
            totals = [sum(lemma_rows[column]) for column in count_columns]
            writer.writerow([lemma, lang] + totals)