### This notebook generates CSV files with statistics on the query terms (and their lemmas) found in Wikidata

In [None]:
import json
import csv
import pandas as pd

In [None]:
# Load the raw search results (as retrieved), one JSON file per language.
# The encoding is passed explicitly: JSON is UTF-8 text, and without it open()
# falls back to the platform's locale encoding.
with open('/Users/anesterov/wd/jan31/search_results_en.json', 'r', encoding='utf-8') as jf:
    retrieved_en = json.load(jf)
with open('/Users/anesterov/wd/jan31/search_results_nl.json', 'r', encoding='utf-8') as jf:
    retrieved_nl = json.load(jf)

In [None]:
# Load the cleaned search results, one JSON file per language.
# The encoding is passed explicitly: JSON is UTF-8 text, and without it open()
# falls back to the platform's locale encoding.
with open('/Users/anesterov/wd/jan31/results_clean_en.json', 'r', encoding='utf-8') as jf:
    results_en = json.load(jf)
with open('/Users/anesterov/wd/jan31/results_clean_nl.json', 'r', encoding='utf-8') as jf:
    results_nl = json.load(jf)

In [None]:
# Load the query terms with their lemmas. Later cells index this object as
# query_terms[lang][lemma] and test whether a term is among the word forms.
# The encoding is passed explicitly so reading does not depend on the locale.
with open('/Users/anesterov/reps/LODlit/query_terms.json', 'r', encoding='utf-8') as jf:
    query_terms = json.load(jf)

### 1. N entities retrieved by query terms

In [None]:
# Number of entities retrieved per English query term.
# A result object that contains a 'searchinfo' key counts as 0 entities;
# otherwise the count is the length of the result container.
n_entities_retrieved_en = {
    term: (0 if 'searchinfo' in hits else len(hits))
    for term, hits in retrieved_en.items()
}

In [None]:
# Number of entities retrieved per Dutch query term.
# A result object that contains a 'searchinfo' key counts as 0 entities;
# otherwise the count is the length of the result container.
n_entities_retrieved_nl = {
    term: (0 if 'searchinfo' in hits else len(hits))
    for term, hits in retrieved_nl.items()
}

In [None]:
# Export the number of retrieved entities per query term, together with the
# lemma of that term.
# newline='' is what the csv module expects for a file handed to csv.writer;
# the explicit encoding keeps the file readable by pd.read_csv (UTF-8 by
# default) whatever the platform locale is.
with open('n_entities_retrieved_by_term.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['lemma', 'term', 'lang', 'n_e_retrieved'])

    counts_by_lang = (('en', n_entities_retrieved_en), ('nl', n_entities_retrieved_nl))
    for lang, counts in counts_by_lang:
        for query_term, e in counts.items():
            # Resolve the lemma afresh for every term, so a term that is missing
            # from query_terms is never paired with the previous row's lemma.
            # If several lemmas list the term, the last one wins (same outcome
            # as a full scan without break); unmatched terms get an empty lemma.
            matches = [lem for lem, wordforms in query_terms[lang].items()
                       if query_term in wordforms]
            lemma = matches[-1] if matches else ''
            writer.writerow([lemma, query_term, lang, e])

### 2. N entities retrieved by lemmas

In [None]:
# Read the per-term counts back and split them by language: lemmas are not
# unique across the two languages, so each language gets its own frame.
df = pd.read_csv('n_entities_retrieved_by_term.csv')
en_df = df[df['lang'] == 'en']
nl_df = df[df['lang'] == 'nl']

In [None]:
# Export the number of retrieved entities summed over all terms of a lemma,
# English rows first, then Dutch rows.
# newline='' is what the csv module expects for a file handed to csv.writer;
# the encoding is explicit so the output does not depend on the locale.
with open('n_entities_retrieved_by_lemma.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['lemma', 'lang', 'n_e_retrieved'])

    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        # groupby yields (lemma, rows-of-that-lemma) pairs
        for lemma, rows in lang_df.groupby('lemma'):
            writer.writerow([lemma, lang, rows['n_e_retrieved'].sum()])

### 3. N entities after proper names filtering by query terms

In [None]:
# Number of distinct entities (unique 'QID' values among the hits) per
# English term in the cleaned results.
n_entities_filtered_en = {
    term: len({hit['QID'] for hit in hits})
    for term, hits in results_en.items()
}

In [None]:
# Number of distinct entities (unique 'QID' values among the hits) per
# Dutch term in the cleaned results.
n_entities_filtered_nl = {
    term: len({hit['QID'] for hit in hits})
    for term, hits in results_nl.items()
}

In [None]:
# Export the number of entities left after filtering, per query term,
# together with the lemma of that term.
# newline='' is what the csv module expects for a file handed to csv.writer;
# the explicit encoding keeps the file readable by pd.read_csv (UTF-8 by
# default) whatever the platform locale is.
with open('n_entities_clean_by_term.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['lemma', 'term', 'lang', 'e_clean'])

    counts_by_lang = (('en', n_entities_filtered_en), ('nl', n_entities_filtered_nl))
    for lang, counts in counts_by_lang:
        for query_term, e in counts.items():
            # Resolve the lemma afresh for every term, so a term that is missing
            # from query_terms is never paired with the previous row's lemma.
            # If several lemmas list the term, the last one wins (same outcome
            # as a full scan without break); unmatched terms get an empty lemma.
            matches = [lem for lem, wordforms in query_terms[lang].items()
                       if query_term in wordforms]
            lemma = matches[-1] if matches else ''
            writer.writerow([lemma, query_term, lang, e])

### 4. N entities after proper names filtering by lemmas

In [None]:
# Read the per-term counts back and split them by language: lemmas are not
# unique across the two languages, so each language gets its own frame.
df = pd.read_csv('n_entities_clean_by_term.csv')
en_df = df[df['lang'] == 'en']
nl_df = df[df['lang'] == 'nl']

In [None]:
# Export the number of entities left after filtering, summed over all terms
# of a lemma, English rows first, then Dutch rows.
# newline='' is what the csv module expects for a file handed to csv.writer;
# the encoding is explicit so the output does not depend on the locale.
with open('n_entities_clean_by_lemma.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['lemma', 'lang', 'e_clean'])

    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        # groupby yields (lemma, rows-of-that-lemma) pairs
        for lemma, rows in lang_df.groupby('lemma'):
            writer.writerow([lemma, lang, rows['e_clean'].sum()])

### 5. N hits (occurrences) by query terms
Count the number of occurrences of query terms in prefLabel, aliases, and description

In [None]:
# EN
# For every term, count the hits whose 'found_in' value is 'prefLabel',
# 'aliases' or 'description'; stored as [prefLabel, aliases, description].
wd_where_terms_found_en = {}

for term, hits in results_en.items():
    locations = [hit['found_in'] for hit in hits]
    wd_where_terms_found_en[term] = [
        locations.count('prefLabel'),
        locations.count('aliases'),
        locations.count('description'),
    ]

In [None]:
# NL
# For every term, count the hits whose 'found_in' value is 'prefLabel',
# 'aliases' or 'description'; stored as [prefLabel, aliases, description].
wd_where_terms_found_nl = {}

for term, hits in results_nl.items():
    locations = [hit['found_in'] for hit in hits]
    wd_where_terms_found_nl[term] = [
        locations.count('prefLabel'),
        locations.count('aliases'),
        locations.count('description'),
    ]

In [None]:
# Export the number of hits per query term, broken down by where the term was
# found (prefLabel, aliases, description) plus their total.
# newline='' is what the csv module expects for a file handed to csv.writer;
# the explicit encoding keeps the file readable by pd.read_csv (UTF-8 by
# default) whatever the platform locale is.
with open('n_hits_by_term.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['lemma', 'term', 'lang', 'wd_prefLabel', 'wd_aliases', 'wd_description', 'total'])

    stats_by_lang = (('en', wd_where_terms_found_en), ('nl', wd_where_terms_found_nl))
    for lang, term_stats in stats_by_lang:
        for term, stats in term_stats.items():
            # Resolve the lemma afresh for every term, so a term that is missing
            # from query_terms is never paired with the previous row's lemma.
            # If several lemmas list the term, the last one wins (same outcome
            # as a full scan without break); unmatched terms get an empty lemma.
            matches = [lem for lem, wordforms in query_terms[lang].items()
                       if term in wordforms]
            lemma = matches[-1] if matches else ''
            writer.writerow([lemma, term, lang, stats[0], stats[1], stats[2], sum(stats)])

### 6. N hits (occurrences) by lemmas

In [None]:
# Prepare grouping of the hit counts by lemma: read the per-term file back and
# split it by language, because lemmas are not unique across the two
# languages and each language therefore needs its own frame.
df = pd.read_csv('n_hits_by_term.csv')
en_df = df[df['lang'] == 'en']
nl_df = df[df['lang'] == 'nl']

In [None]:
# exporting a csv with stats by lemma

# Export the hit counts summed over all terms of a lemma, English rows first,
# then Dutch rows.
# newline='' is what the csv module expects for a file handed to csv.writer;
# the encoding is explicit so the output does not depend on the locale.
with open('n_hits_by_lemma.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['lemma', 'lang', 'wd_pref_lemma', 'wd_aliases_lemma', 'wd_desc_lemma', 'total_lemma'])

    # Input columns, in the order of the four *_lemma output columns above.
    count_columns = ['wd_prefLabel', 'wd_aliases', 'wd_description', 'total']

    for lang, lang_df in (('en', en_df), ('nl', nl_df)):
        # groupby yields (lemma, rows-of-that-lemma) pairs
        for lemma, rows in lang_df.groupby('lemma'):
            totals = [rows[column].sum() for column in count_columns]
            writer.writerow([lemma, lang, *totals])