In [1]:
import json
import csv
import requests
import io
import time
import zipfile
import re

In [None]:
# importing EN query terms
# the encoding is pinned: without it open() decodes the file with the
# platform default, which is not UTF-8 everywhere
with open('query_terms_cont_en.json', 'r', encoding='utf-8') as jf:
    query_terms_cont_en = json.load(jf)

In [None]:
# flattening the EN query terms into one list:
# every key (stem) is followed directly by the items of its value (forms)
list_of_terms_en = [
    term
    for stem, forms in query_terms_cont_en.items()
    for term in (stem, *forms)
]

In [None]:
# total number of EN query terms (stems plus their forms)
len(list_of_terms_en)

In [None]:
# importing NL query terms
# the encoding is pinned: without it open() decodes the file with the
# platform default, which is not UTF-8 everywhere
with open('query_terms_cont_nl.json', 'r', encoding='utf-8') as jf:
    query_terms_cont_nl = json.load(jf)

In [None]:
# flattening the NL query terms into one list:
# every key (stem) is followed directly by the items of its value (forms)
list_of_terms_nl = [
    term
    for stem, forms in query_terms_cont_nl.items()
    for term in (stem, *forms)
]

In [None]:
# total number of NL query terms (stems plus their forms)
len(list_of_terms_nl)

### Calculating total hits per term (without filtering on keywords and properties)

In [None]:
# Constant parameters for an 'action=query' request that uses the 'search'
# generator; only 'gsrsearch' is filled in per term by the cells below.
url = "https://www.wikidata.org/w/api.php"
params_en = {
    "action": "query",
    "prop": "entityterms",
    "wbetlanguage": "en",      # English
    "generator": "search",
    "gsrsearch": "",           # term goes here (quotes for stemming off)
    "gsrlimit": "1",           # one result is enough: only the hit count is read
    "gsrinfo": "totalhits",
    "format": "json",
}
headers = {
    "user-agent": "bot getting labels aliases and descriptions of the requested pages (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)",
}

###### EN

In [None]:
# Iterating over the EN terms and recording the total number of hits per term
# (no filtering). One CSV row per term: query_term, total_hits.
#
# newline="" is what the csv module expects for files it writes (otherwise
# blank rows can appear on Windows); utf-8 keeps non-ASCII terms intact
# regardless of the platform default encoding.
with open("total_hits_no_filter_en.csv", "w", newline="", encoding="utf-8") as csv_file:
    header = ['query_term', 'total_hits']
    writer = csv.writer(csv_file)
    writer.writerow(header)

    for term in list_of_terms_en:
        # updating the 'gsrsearch' param (quotes for stemming off)
        params_en["gsrsearch"] = f'"{term}"'
        # the timeout keeps a stalled connection from hanging the whole run
        r = requests.get(url, params=params_en, headers=headers, timeout=60)
        # fail on an HTTP error right away instead of on a JSON/KeyError below
        r.raise_for_status()
        hits = r.json()['query']['searchinfo']['totalhits']

        writer.writerow([term, hits])

###### NL

In [None]:
# Constant parameters for an 'action=query' request that uses the 'search'
# generator; only 'gsrsearch' is filled in per term by the cells below.
url = "https://www.wikidata.org/w/api.php"
params_nl = {
    "action": "query",
    "prop": "entityterms",
    "wbetlanguage": "nl",      # Dutch
    "generator": "search",
    "gsrsearch": "",           # term goes here (quotes for stemming off)
    "gsrlimit": "1",           # one result is enough: only the hit count is read
    "gsrinfo": "totalhits",
    "format": "json",
}
headers = {
    "user-agent": "bot getting labels aliases and descriptions of the requested pages (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)",
}

In [None]:
# Iterating over the NL terms and recording the total number of hits per term
# (no filtering). One CSV row per term: query_term, total_hits.
#
# newline="" is what the csv module expects for files it writes (otherwise
# blank rows can appear on Windows); utf-8 keeps non-ASCII terms intact
# regardless of the platform default encoding.
with open("total_hits_no_filter_nl.csv", "w", newline="", encoding="utf-8") as csv_file:
    header = ['query_term', 'total_hits']
    writer = csv.writer(csv_file)
    writer.writerow(header)

    for term in list_of_terms_nl:
        # updating the 'gsrsearch' param (quotes for stemming off)
        params_nl["gsrsearch"] = f'"{term}"'
        # the timeout keeps a stalled connection from hanging the whole run
        r = requests.get(url, params=params_nl, headers=headers, timeout=60)
        # fail on an HTTP error right away instead of on a JSON/KeyError below
        r.raise_for_status()
        hits = r.json()['query']['searchinfo']['totalhits']

        writer.writerow([term, hits])

### Calculating total hits per term with filtering on keywords and properties

In [None]:
# generating the search strings (the 'gsrsearch' param) with filters

In [None]:
# JSON file whose keys are used below as the values to exclude
# NOTE(review): absolute, machine-specific path; consider making it relative
# to the repository so the notebook runs elsewhere
with open("/Users/anesterov/reps/LODlit/Wikidata/to_exlude.json", "r") as json_file:
    to_exclude = json.load(json_file)

property_list = ['P31', 'P279']

# one " -haswbstatement:<property>=<key>" exclusion for every
# (property, key) pair, properties in the outer position
exclusions = "".join(
    f" -haswbstatement:{p}={k}"
    for p in property_list
    for k in to_exclude.keys()
)

# keyword exclusions per language, followed by the shared statement exclusions
filter_str_en = "-scientific -scholarly -article" + exclusions
filter_str_nl = "-wetenschappelijk -artikel" + exclusions

###### EN

In [None]:
# Iterating over the EN terms and recording the total number of hits per term
# with the keyword/property filters applied.
# One CSV row per term: query_term, total_hits.
#
# newline="" is what the csv module expects for files it writes (otherwise
# blank rows can appear on Windows); utf-8 keeps non-ASCII terms intact
# regardless of the platform default encoding.
with open("total_hits_filter_en.csv", "w", newline="", encoding="utf-8") as csv_file:
    header = ['query_term', 'total_hits']
    writer = csv.writer(csv_file)
    writer.writerow(header)

    for term in list_of_terms_en:
        # updating the 'gsrsearch' param: quoted term (stemming off) + filters
        params_en["gsrsearch"] = f'"{term}" ' + filter_str_en
        # the timeout keeps a stalled connection from hanging the whole run
        r = requests.get(url, params=params_en, headers=headers, timeout=60)
        # fail on an HTTP error right away instead of on a JSON/KeyError below
        r.raise_for_status()
        hits = r.json()['query']['searchinfo']['totalhits']

        writer.writerow([term, hits])

###### NL

In [None]:
# Iterating over the NL terms and recording the total number of hits per term
# with the keyword/property filters applied.
# One CSV row per term: query_term, total_hits.
#
# newline="" is what the csv module expects for files it writes (otherwise
# blank rows can appear on Windows); utf-8 keeps non-ASCII terms intact
# regardless of the platform default encoding.
with open("total_hits_filter_nl.csv", "w", newline="", encoding="utf-8") as csv_file:
    header = ['query_term', 'total_hits']
    writer = csv.writer(csv_file)
    writer.writerow(header)

    for term in list_of_terms_nl:
        # updating the 'gsrsearch' param: quoted term (stemming off) + filters
        params_nl["gsrsearch"] = f'"{term}" ' + filter_str_nl
        # the timeout keeps a stalled connection from hanging the whole run
        r = requests.get(url, params=params_nl, headers=headers, timeout=60)
        # fail on an HTTP error right away instead of on a JSON/KeyError below
        r.raise_for_status()
        hits = r.json()['query']['searchinfo']['totalhits']

        writer.writerow([term, hits])

### Searching EN terms and getting labels, aliases, and descriptions for every found entity

In [None]:
# 'query' with 'search' generator: constant params
url = "https://www.wikidata.org/w/api.php"
params = {
    "action": "query",
    "prop": "entityterms",
    "wbetlanguage": "en",  # English
    "generator": "search",
    "gsrsearch": "",  # term goes here (quotes for stemming off)
    "gsrlimit": "max",  # largest page the API allows; the paging below assumes 500
    "gsroffset": "0",  # offset
    "gsrinfo": "totalhits",
    "gsrsort": "incoming_links_desc",  # sorting results by incoming links
    "format": "json",
}
headers = {"user-agent":"bot getting labels aliases and descriptions of the requested pages (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"}

page_size = 500      # results per request; the offset advances by this much
max_results = 10000  # 10K is max: nothing beyond it is requested

results = {}  # dict to store the results: term -> pages found for it

for term in list_of_terms_en:
    # every term starts from the first page
    gsroffset = 0
    params["gsroffset"] = gsroffset
    # filtering the search results
    params["gsrsearch"] = f'"{term}" ' + filter_str_en  # quotes for stemming off
    # sending a request; the timeout keeps a stalled connection from hanging the run
    w = requests.get(url, params=params, headers=headers, timeout=60)
    w.raise_for_status()
    wikidata_json = w.json()
    time.sleep(2)  # to prevent 502

    # checking the number of hits
    hits = wikidata_json['query']['searchinfo']['totalhits']
    print("term:",term,"|","hits:",hits)

    if hits == 0:
        # no results: keep the whole 'query' object as returned
        results[term] = wikidata_json['query']
    else:
        # saving the results of the first request
        results[term] = wikidata_json['query'].get('pages', {})

    # if there are less than 500 hits for a term, this is the resulting dataset
    if hits < page_size:
        print("saved")

    # Number of follow-up requests after the first one. It is computed fresh
    # for every term, so hits == 500 and hits == 10000 can no longer reuse
    # the value left over from the previous term.
    #   hits <= 500     -> 0
    #   501 .. 10000    -> (hits - 1) // 500
    #   hits > 10000    -> 19 (the first request is already done)
    loops = (min(hits, max_results) - 1) // page_size if hits > 0 else 0

    for i in range(loops):
        gsroffset = gsroffset + page_size

        # setting the offset and sending a new request
        params["gsroffset"] = gsroffset
        w_i = requests.get(url, params=params, headers=headers, timeout=60)
        w_i.raise_for_status()
        wikidata_json_i = w_i.json()
        time.sleep(2)

        # defensive: stop paging if a follow-up response carries no pages
        pages_i = wikidata_json_i.get('query', {}).get('pages')
        if not pages_i:
            print("offset:",gsroffset,"no more results")
            break

        # saving the results
        results[term].update(pages_i)
        print("offset:",gsroffset,"saved")

# The file is opened only once all terms are done, so an interrupted run
# does not leave a truncated, empty results file behind.
with open("wikidata_search_results_en.json", 'w', encoding="utf-8") as results_file:
    json.dump(results, results_file)

### Searching NL terms and getting labels, aliases, and descriptions for every found entity

In [None]:
# 'query' with 'search' generator: constant params
url = "https://www.wikidata.org/w/api.php"
params = {
    "action": "query",
    "prop": "entityterms",
    "wbetlanguage": "nl",  # Dutch
    "generator": "search",
    "gsrsearch": "",  # term goes here (quotes for stemming off)
    "gsrlimit": "max",  # largest page the API allows; the paging below assumes 500
    "gsroffset": "0",  # offset
    "gsrinfo": "totalhits",
    "gsrsort": "incoming_links_desc",  # sorting results by incoming links
    "format": "json",
}
headers = {"user-agent":"bot getting labels aliases and descriptions of the requested pages (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"}

page_size = 500      # results per request; the offset advances by this much
max_results = 10000  # 10K is max: nothing beyond it is requested

results = {}  # dict to store the results: term -> pages found for it

for term in list_of_terms_nl:
    # every term starts from the first page
    gsroffset = 0
    params["gsroffset"] = gsroffset
    # filtering the search results
    params["gsrsearch"] = f'"{term}" ' + filter_str_nl  # quotes for stemming off
    # sending a request; the timeout keeps a stalled connection from hanging the run
    w = requests.get(url, params=params, headers=headers, timeout=60)
    w.raise_for_status()
    wikidata_json = w.json()
    time.sleep(2)  # to prevent 502

    # checking the number of hits
    hits = wikidata_json['query']['searchinfo']['totalhits']
    print("term:",term,"|","hits:",hits)

    if hits == 0:
        # no results: keep the whole 'query' object as returned
        results[term] = wikidata_json['query']
    else:
        # saving the results of the first request
        results[term] = wikidata_json['query'].get('pages', {})

    # if there are less than 500 hits for a term, this is the resulting dataset
    if hits < page_size:
        print("saved")

    # Number of follow-up requests after the first one. It is computed fresh
    # for every term, so hits == 500 and hits == 10000 can no longer reuse
    # the value left over from the previous term.
    #   hits <= 500     -> 0
    #   501 .. 10000    -> (hits - 1) // 500
    #   hits > 10000    -> 19 (the first request is already done)
    loops = (min(hits, max_results) - 1) // page_size if hits > 0 else 0

    for i in range(loops):
        gsroffset = gsroffset + page_size

        # setting the offset and sending a new request
        params["gsroffset"] = gsroffset
        w_i = requests.get(url, params=params, headers=headers, timeout=60)
        w_i.raise_for_status()
        wikidata_json_i = w_i.json()
        time.sleep(2)

        # defensive: stop paging if a follow-up response carries no pages
        pages_i = wikidata_json_i.get('query', {}).get('pages')
        if not pages_i:
            print("offset:",gsroffset,"no more results")
            break

        # saving the results
        results[term].update(pages_i)
        print("offset:",gsroffset,"saved")

# The file is opened only once all terms are done, so an interrupted run
# does not leave a truncated, empty results file behind.
with open("wikidata_search_results_nl.json", 'w', encoding="utf-8") as results_file:
    json.dump(results, results_file)