## Getting N of entities by term
* no filtering
* filtering with keywords
* filtering with keywords and statements

In [1]:
import json
import csv

In [2]:
# Importing LODlitParser Wikidata module 
# this code is taken from
# https://stackoverflow.com/questions/67631/how-can-i-import-a-module-dynamically-given-the-full-path
import importlib.util
import sys
spec = importlib.util.spec_from_file_location("LODlitParser.wd", "/Users/anesterov/reps/LODlit/LODlitParser/wd.py")
wd = importlib.util.module_from_spec(spec)
sys.modules["LODlitParser.wd"] = wd
spec.loader.exec_module(wd)

ModuleNotFoundError: No module named 'bows'

In [3]:
# importing query terms
with open("/Users/anesterov/reps/LODlit/query_terms.json","r") as jf:
    query_terms = json.load(jf)

In [None]:
en_query_terms = []
for lemma, wordforms in query_terms['en'].items():
    en_query_terms.extend(wordforms)
len(en_query_terms)

In [None]:
nl_query_terms = []
for lemma, wordforms in query_terms['nl'].items():
    nl_query_terms.extend(wordforms)
len(nl_query_terms)

In [None]:
user_agent = "Bot getting N of search hits (nesterov@cwi.nl)"

In [None]:
# hits with no filter EN
hits_en_no_filter = {}

for query_term in en_query_terms:
    hits_per_term = wd.get_search_hits(query_term,'en',user_agent)
    hits_en_no_filter.update(hits_per_term)

In [None]:
# hits with no filter NL
hits_nl_no_filter = {}

for query_term in nl_query_terms:
    hits_per_term = wd.get_search_hits(query_term,'nl',user_agent)
    hits_nl_no_filter.update(hits_per_term)

In [None]:
# hits with keywords EN
keywords_en = ['scientific','scholarly','article']
hits_en_filter_keywords = {}

for query_term in en_query_terms:
    hits_per_term = wd.get_search_hits(query_term,'en',user_agent,keywords_en)
    hits_en_filter_keywords.update(hits_per_term)

In [None]:
# hits with keywords NL
keywords_nl = ['wetenschappelijk','artikel']
hits_nl_filter_keywords = {}

for query_term in nl_query_terms:
    hits_per_term = wd.get_search_hits(query_term,'nl',user_agent,keywords_nl)
    hits_nl_filter_keywords.update(hits_per_term)

In [None]:
# importing statements to filter out
with open("/Users/anesterov/reps/LODlit/Wikidata/statements_filter.json","r") as jf:
    statements_filter = json.load(jf)

In [None]:
# creating a list of tuples with statements to filter out
statements = []
for s in statements_filter.keys():
    statements.extend([('P31',s),('P279',s)])

In [None]:
# hits with keywords and statements EN
hits_en_filter_keywords_statements = {}

for query_term in en_query_terms:
    hits_per_term = wd.get_search_hits(query_term,'en',user_agent,keywords_en,statements)
    hits_en_filter_keywords_statements.update(hits_per_term)

In [None]:
# hits with keywords and statements NL
hits_nl_filter_keywords_statements = {}

for query_term in nl_query_terms:
    hits_per_term = wd.get_search_hits(query_term,'nl',user_agent,keywords_nl,statements)
    hits_nl_filter_keywords_statements.update(hits_per_term)

In [None]:
# exporting a resulting csv
with open('n_entities_by_term.csv','w') as csv_file:
    writer = csv.writer(csv_file)
    header = ['term', 'lang', 'e_no_filter', 'e_filter_keywords', 'e_filter_keywords_statements']
    writer.writerow(header)
    for query_term in en_query_terms:
        row = [query_term,'en',hits_en_no_filter[query_term],hits_en_filter_keywords[query_term],hits_en_filter_keywords_statements[query_term]]
        writer.writerow(row)
    for query_term in nl_query_terms:
        row = [query_term,'nl',hits_nl_no_filter[query_term],hits_nl_filter_keywords[query_term],hits_nl_filter_keywords_statements[query_term]]
        writer.writerow(row)

In [4]:
# N entities by lemma
import pandas as pd

In [47]:
df = pd.read_csv('/Users/anesterov/reps/LODlit/Wikidata/n_entities_by_term.csv')
en_df = df.loc[df['lang'] == 'en']
nl_df = df.loc[df['lang'] == 'nl']

In [48]:
en_df.insert(0, 'lemma','')

In [64]:
nl_df.insert(0, 'lemma','')

In [49]:
# EN
for lemma, wordforms in query_terms['en'].items():
    for row in en_df.iterrows():
        if row[1]['term'] in wordforms:
            en_df.loc[row[0],'lemma'] = lemma

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [65]:
# NL
for lemma, wordforms in query_terms['nl'].items():
    for row in nl_df.iterrows():
        if row[1]['term'] in wordforms:
            nl_df.loc[row[0],'lemma'] = lemma

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [67]:
# N entities by lemma
with open('n_entities_by_lemma.csv','w') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma','lang','e_no_filter','e_filter_keywords','e_filter_keywords_statements']
    writer.writerow(header)
    
    for group in en_df.groupby('lemma'):
        row = [group[0], 'en', sum(list(group[1]['e_no_filter'])), sum(list(group[1]['e_filter_keywords'])),\
               sum(list(group[1]['e_filter_keywords_statements']))]
        writer.writerow(row)
        
    for group in nl_df.groupby('lemma'):
        row = [group[0], 'nl', sum(list(group[1]['e_no_filter'])), sum(list(group[1]['e_filter_keywords'])),\
               sum(list(group[1]['e_filter_keywords_statements']))]
        writer.writerow(row)