In [None]:
import json
import csv
import pandas as pd
from LODlit import wd

### This notebook generates CSV files with statistics on the query terms found in Wikidata

In [None]:
# Load the raw ("retrieved") search results for English and Dutch.
# On GitHub these files are stored gzipped with the prefix "gzip_"; this cell
# reads plain JSON, so they have to be decompressed first.
# The encoding is given explicitly so the files are decoded as UTF-8 instead of
# with the platform's locale default (which is not UTF-8 everywhere).
with open('search_results_en.json', 'r', encoding='utf-8') as jf:
    retrieved_en = json.load(jf)
with open('search_results_nl.json', 'r', encoding='utf-8') as jf:
    retrieved_nl = json.load(jf)

In [None]:
# Load the clean search results (entities already filtered out) for English
# and Dutch.
# On GitHub these files are stored gzipped with the prefix "gzip_"; this cell
# reads plain JSON, so they have to be decompressed first.
# The encoding is given explicitly so the files are decoded as UTF-8 instead of
# with the platform's locale default (which is not UTF-8 everywhere).
with open('results_clean_en.json', 'r', encoding='utf-8') as jf:
    results_en = json.load(jf)
with open('results_clean_nl.json', 'r', encoding='utf-8') as jf:
    results_nl = json.load(jf)

In [None]:
# Load the query terms. Later cells index this mapping by language code
# ('en' / 'nl') and iterate over its lemma -> word forms items.
# The encoding is given explicitly so the file is decoded as UTF-8 instead of
# with the platform's locale default.
with open('LODlit/query_terms.json', 'r', encoding='utf-8') as jf:
    query_terms = json.load(jf)

#### 1. N entities retrieved by all terms

In [None]:
# Number of entities retrieved per English query term.
# A response that contains the key 'searchinfo' is counted as zero entities;
# any other response is counted by its length.
n_entities_retrieved_en = {
    term: 0 if 'searchinfo' in hits else len(hits)
    for term, hits in retrieved_en.items()
}

In [None]:
# Number of entities retrieved per Dutch query term.
# A response that contains the key 'searchinfo' is counted as zero entities;
# any other response is counted by its length.
n_entities_retrieved_nl = {
    term: 0 if 'searchinfo' in hits else len(hits)
    for term, hits in retrieved_nl.items()
}

In [None]:
# Write one CSV row per query term: lemma, term, language, number of retrieved
# entities.
# newline='' is what the csv module asks for when a file is handed to
# csv.writer (otherwise platforms that translate line endings emit an extra
# '\r' per row). UTF-8 is set explicitly because the file is read back with
# pd.read_csv, whose default encoding is UTF-8.
with open('n_entities_retrieved_by_term.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'term', 'lang', 'n_e_retrieved']
    writer.writerow(header)

    for lang, counts_by_term in (('en', n_entities_retrieved_en),
                                 ('nl', n_entities_retrieved_nl)):
        for query_term, e in counts_by_term.items():
            # Reset the lemma for every term: a term that appears in no
            # word-form collection gets an empty lemma cell instead of silently
            # inheriting the lemma of the previous row (or raising NameError
            # when it is the very first row).
            lemma = None
            for candidate, wordforms in query_terms[lang].items():
                if query_term in wordforms:
                    # No break on purpose: if several lemmas list the term,
                    # the last matching lemma wins, as in the original loop.
                    lemma = candidate
            # csv.writer writes None as an empty string.
            writer.writerow([lemma, query_term, lang, e])

#### 2. N entities retrieved by terms' canonical forms

In [None]:
# Read the per-term counts back in. Lemmas are not unique across the two
# languages, so the frame is split into one frame per language.
df = pd.read_csv('n_entities_retrieved_by_term.csv')
en_df = df[df['lang'].eq('en')]
nl_df = df[df['lang'].eq('nl')]

In [None]:
# Write one CSV row per lemma and language with the summed number of retrieved
# entities over all terms of that lemma.
# newline='' is what the csv module asks for when a file is handed to
# csv.writer; the encoding is explicit so the output does not depend on the
# platform's locale default.
with open('n_entities_retrieved_by_lemma.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'lang', 'n_e_retrieved']
    writer.writerow(header)

    # English rows first, then Dutch; within a language groupby orders the
    # lemmas by key.
    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        totals = lang_df.groupby('lemma')['n_e_retrieved'].sum()
        for lemma, total in totals.items():
            writer.writerow([lemma, lang, total])

#### 3. N entities after proper names filtering by all terms

In [None]:
# Number of distinct entities (unique 'QID' values) per English term in the
# clean results.
n_entities_filtered_en = {
    term: len({hit['QID'] for hit in hits})
    for term, hits in results_en.items()
}

In [None]:
# Number of distinct entities (unique 'QID' values) per Dutch term in the
# clean results.
n_entities_filtered_nl = {
    term: len({hit['QID'] for hit in hits})
    for term, hits in results_nl.items()
}

In [None]:
# Write one CSV row per query term: lemma, term, language, number of distinct
# entities in the clean results.
# newline='' is what the csv module asks for when a file is handed to
# csv.writer (otherwise platforms that translate line endings emit an extra
# '\r' per row). UTF-8 is set explicitly because the file is read back with
# pd.read_csv, whose default encoding is UTF-8.
with open('n_entities_clean_by_term.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'term', 'lang', 'e_clean']
    writer.writerow(header)

    for lang, counts_by_term in (('en', n_entities_filtered_en),
                                 ('nl', n_entities_filtered_nl)):
        for query_term, e in counts_by_term.items():
            # Reset the lemma for every term: a term that appears in no
            # word-form collection gets an empty lemma cell instead of silently
            # inheriting the lemma of the previous row (or raising NameError
            # when it is the very first row).
            lemma = None
            for candidate, wordforms in query_terms[lang].items():
                if query_term in wordforms:
                    # No break on purpose: if several lemmas list the term,
                    # the last matching lemma wins, as in the original loop.
                    lemma = candidate
            # csv.writer writes None as an empty string.
            writer.writerow([lemma, query_term, lang, e])

#### 4. N entities after proper names filtering by terms' canonical forms

In [None]:
# Read the per-term clean counts back in. Lemmas are not unique across the two
# languages, so the frame is split into one frame per language.
df = pd.read_csv('n_entities_clean_by_term.csv')
en_df = df[df['lang'].eq('en')]
nl_df = df[df['lang'].eq('nl')]

In [None]:
# Write one CSV row per lemma and language with the summed 'e_clean' count
# over all terms of that lemma.
# newline='' is what the csv module asks for when a file is handed to
# csv.writer; the encoding is explicit so the output does not depend on the
# platform's locale default.
with open('n_entities_clean_by_lemma.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    header = ['lemma', 'lang', 'e_clean']
    writer.writerow(header)

    # English rows first, then Dutch; within a language groupby orders the
    # lemmas by key.
    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        totals = lang_df.groupby('lemma')['e_clean'].sum()
        for lemma, total in totals.items():
            writer.writerow([lemma, lang, total])

#### 5. N hits (occurrences) by all terms
Count the number of occurrences of query terms in prefLabel, aliases, and description.

The json files with results are gzipped on GitHub with the prefix "gzip_"

In [None]:
# English: hit counts per query term, computed from the clean results file and
# saved as CSV.
# NOTE(review): the result is written with .to_csv, so it is presumably a
# pandas DataFrame; confirm in LODlit.wd.
n_query_terms = wd.get_n_hits_by_properties(
    'results_clean_en.json',
    'en',
)
n_query_terms.to_csv("n_hits_by_term_en.csv")

In [None]:
# Dutch: hit counts per query term, computed from the clean results file and
# saved as CSV.
# NOTE(review): the result is written with .to_csv, so it is presumably a
# pandas DataFrame; confirm in LODlit.wd.
n_query_terms_nl = wd.get_n_hits_by_properties(
    'results_clean_nl.json',
    'nl',
)
n_query_terms_nl.to_csv("n_hits_by_term_nl.csv")

#### 6. N hits (occurrences) by terms' canonical forms

In [None]:
# English: same helper as for the per-term counts, called with
# group_by_lemma=True, and saved as CSV.
# NOTE(review): presumably this aggregates the hit counts per lemma; confirm
# in LODlit.wd.
n_lemmas = wd.get_n_hits_by_properties(
    'results_clean_en.json',
    'en',
    group_by_lemma=True,
)
n_lemmas.to_csv("n_hits_by_lemma_en.csv")

In [None]:
# Dutch: same helper as for the per-term counts, called with
# group_by_lemma=True, and saved as CSV.
# NOTE(review): presumably this aggregates the hit counts per lemma; confirm
# in LODlit.wd.
n_lemmas_nl = wd.get_n_hits_by_properties(
    'results_clean_nl.json',
    'nl',
    group_by_lemma=True,
)
n_lemmas_nl.to_csv("n_hits_by_lemma_nl.csv")