In [None]:
import ast
import csv
import json
import time
import zipfile

import requests

# English

### Making a list of query terms
Exports "wikidata_query_terms_en.txt"

In [None]:
# Reading all the word forms of the EN query terms.
# The imported file is the result of 'labels_in_LOD/getting_word_forms.ipynb'
# (see README in labels_in_LOD).

path_en_wordforms = 'https://raw.githubusercontent.com/cultural-ai/LODlit/main/en_wordforms.json'

parse = requests.get(path_en_wordforms)
# fail with the HTTP error itself (e.g. 404) rather than with a confusing
# JSON decoding error on the error page
parse.raise_for_status()
wordforms_en = json.loads(parse.text)

In [None]:
# MediaWiki full text search captures word forms for English, so:
#   * one-word terms and compound adjectives contribute only their lemma;
#   * compound nouns (lemma contains a space) contribute all their word forms,
#     because they will be queried in quotes ("") and stemming is not applied then.

all_lemmas_en = []

for initial_term, entry in wordforms_en.items():

    # no lemmata at all: fall back to the initial term itself
    if entry['lemmata'] == []:
        all_lemmas_en.append(initial_term)
        continue

    # one lemma record per part of speech
    for lemma_record in entry['lemmata']:
        is_compound_noun = ' ' in lemma_record['lemma'] and lemma_record['pos'] == 'noun'

        if is_compound_noun:
            # every word form of the compound noun (adjective forms are not added)
            all_lemmas_en.extend(lemma_record['wordforms'])
        else:
            all_lemmas_en.append(lemma_record['lemma'])

In [None]:
# Lowercase every collected form, then keep each form once.
# The unique forms are sorted so that the order of the query terms is the same on
# every run: the iteration order of a set of strings can change between kernel
# restarts, which makes processing the term list in slices unreliable.
all_lemmas_en = [word.lower() for word in all_lemmas_en]
unique_lemmas_en = sorted(set(all_lemmas_en))

In [None]:
# Compound terms (containing a space) are wrapped in double quotes;
# one-word terms are left without quotes.
en_query_terms = []
for lemma in unique_lemmas_en:
    en_query_terms.append(f'"{lemma}"' if ' ' in lemma else lemma)

In [None]:
# number of English query terms (the original run recorded 104)
len(en_query_terms)

In [None]:
# saving the query terms to a txt file, as the string form of a Python list
with open("wikidata_query_terms_en.txt", "w") as terms_file:
    terms_file.write(str(list(en_query_terms)))

### Getting total hits for every term
Complex search that excludes results containing the words "scientific", "scholarly", or "article".

In [None]:
# Constant request parameters of the query that returns the total hits per term.
url = "https://www.wikidata.org/w/api.php"
params = dict(
    action="query",
    list="search",
    srsearch="",            # term goes here
    srlimit="1",            # 1 result per term is enough to get meta on totalhits
    srinfo="totalhits",
    srprop="titlesnippet",
    format="json",
)

# adjust header
user_agent = "bot getting totalhits for the search terms (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"
headers = {"user-agent": user_agent}

In [None]:
# Iterating over the list of terms and updating the 'srsearch' param;
# one CSV row (query_term, total_hits) is written per term.
# newline="" is how the csv module expects its output file to be opened;
# without it, blank rows appear on platforms that translate "\n" into "\r\n".
with open("total_hits_en.csv", "w", newline="") as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['query_term', 'total_hits'])

    for term in en_query_terms:
        # excluding the words "scientific", "scholarly", and "article" from the search results
        params["srsearch"] = f"{term} -scientific -scholarly -article"
        r = requests.get(url, params=params, headers=headers)
        # stop on an HTTP error instead of failing later on a missing JSON key
        r.raise_for_status()
        hits = r.json()['query']['searchinfo']['totalhits']

        writer.writerow([term, hits])

### Searching en terms and getting labels, aliases, and descriptions for every matched entity

In [None]:
# 'query' with 'search' generator: constant params
# MediaWiki API documentation: https://www.wikidata.org/w/api.php?action=help&modules=main

url = "https://www.wikidata.org/w/api.php"
params = {"action":"query",
          "prop":"entityterms",
          "wbetlanguage":"en", #English
          "generator":"search",
          "gsrsearch":"", # term goes here
          "gsrlimit":"max", # getting all results
          "gsroffset":"0", # offset
          "gsrinfo":"totalhits",
          "gsrsort":"incoming_links_desc", # sorting results by incoming links
          "format":"json",}
# adjust header
headers = {"user-agent":"bot getting labels aliases and descriptions of the requested pages (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"}

# paging limits: a request returns at most 500 results, and 10K results per query is the max
RESULTS_PER_REQUEST = 500
MAX_RESULTS_PER_QUERY = 10000

results = {} # dict to store the results


# the resulting file is zipped on GitHub
with open("wikidata_search_results_en.json", 'w') as results_file:

    for term in en_query_terms:
        # counter for offset
        gsroffset = 0
        params["gsroffset"] = gsroffset
        # excluding the words "scientific", "scholarly", and "article" from the search results
        params["gsrsearch"] = f"{term} -scientific -scholarly -article"
        # sending a request
        w = requests.get(url,params=params,headers=headers)
        wikidata_json = w.json()
        time.sleep(2) # to prevent 502

        # checking the number of hits
        hits = wikidata_json['query']['searchinfo']['totalhits']
        print("term:",term,"|","hits:",hits)

        if hits == 0:
            # no results: keep the 'query' part of the response as it is
            results[term] = wikidata_json['query']
        else:
            # saving results for every term from the first query (the first loop)
            results[term] = wikidata_json['query']['pages']

        # - CONDITIONS - #

        # Number of follow-up requests: ceil(reachable hits / 500) minus the request
        # already made. Computed for every term, so that exactly 500 or exactly 10000
        # hits cannot reuse the value left over from the previous term.
        reachable_hits = min(hits, MAX_RESULTS_PER_QUERY)
        loops = max(0, -(-reachable_hits // RESULTS_PER_REQUEST) - 1)

        # no offset needed: the first response is the resulting dataset
        if loops == 0:
            print("saved")

        # - REQUEST LOOPS - #

        for i in range(loops):
            gsroffset = gsroffset + RESULTS_PER_REQUEST

            # setting the offset and sending a new request
            params["gsroffset"] = gsroffset
            w_i = requests.get(url,params=params,headers=headers)
            wikidata_json_i = w_i.json()
            time.sleep(2)

            # a response without pages is reported and skipped instead of raising
            # a KeyError and losing everything collected so far
            pages_i = wikidata_json_i.get('query', {}).get('pages')
            if pages_i is None:
                print("offset:",gsroffset,"no pages in the response, skipped")
                continue

            # saving the results
            results[term].update(pages_i)
            print("offset:",gsroffset,"saved")

    json.dump(results, results_file)

### Getting claims for every entity to retrieve info about properties

#### Reading the results file
The results file was zipped; unzip it before reading.

In [None]:
# unzipping the archive with the English search results
with zipfile.ZipFile("wikidata_search_results_en.json.zip", mode="r") as archive:
    archive.extractall(path="") # set your path

In [None]:
# Path of the unzipped results file. open("") always fails, so a default is given:
# it assumes the archive holds "wikidata_search_results_en.json" and was extracted
# into the working directory -- adjust it if your path differs.
path_en_results = "wikidata_search_results_en.json"

with open(path_en_results) as jf:
    wd_results = json.load(jf)

#### Querying claims

In [None]:
# Constant request parameters for 'wbgetentities'.
url = "https://www.wikidata.org/w/api.php"
params = dict(
    action="wbgetentities",
    ids="",              # string of entities (max=50) goes here
    props="claims",
    languages="en",
    format="json",
)
# adjust header
user_agent = "bot getting claims of the requested entities (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"
headers = {"user-agent": user_agent}

In [None]:
# entities requested so far; run this cell once before querying
list_of_requested_entities = list()

In [None]:
# Requesting the claims of every entity found for every English term, max 50 entities
# per request, and saving the claims of each term in a separate JSON file.

# set copy of the requested entities for fast membership tests;
# 'list_of_requested_entities' itself stays a list and keeps the request order
already_requested = set(list_of_requested_entities)

for term in en_query_terms[:]: # use slice to prevent 502 (~50 terms at a time)

    # entities per term
    list_of_entities = [v['title'] for v in wd_results[term].values() if 'title' in v]

    params["ids"] = ""
    all_claims_per_term = {}

    # for debugging
    print("term:",term,"|","entities:",len(list_of_entities))

    # - CONDITIONS - #

    # number of requests = ceil(len / 50), at least 1; computed for every term, so that
    # a list of exactly 50 entities cannot reuse the value left over from the previous term
    loops = max(1, -(-len(list_of_entities) // 50))

    # for debugging
    print("loops:",loops)

    # - REQUEST LOOPS - #

    query_result_list = []

    for i in range(loops):
        # the i-th slice of 50 entities, without the entities requested before
        new_ids = []
        for q in list_of_entities[i * 50:(i + 1) * 50]:
            if q not in already_requested:
                new_ids.append(q)
                already_requested.add(q)
                list_of_requested_entities.append(q) # remembering requested entities to prevent duplicates

        # nothing new in this slice: a request with empty 'ids' returns no entities anyway
        if not new_ids:
            continue

        # updating params: putting Qs in one string
        params["ids"] = "|".join(new_ids)

        # sending a request
        d = requests.get(url,params=params,headers=headers)
        claims = d.json() # claims per request
        time.sleep(2) # pause between requests, as in the other request loops, to prevent 502

        if 'entities' in claims:
            query_result_list.append(claims['entities']) # saving all claims

    # for debugging
    print("actual_results:",len(query_result_list),[len(batch_claims) for batch_claims in query_result_list])

    # - SAVING RESULTS - #

    all_claims_per_term['entities'] = query_result_list

    # set your path
    # saving all the claims per term in a separate file
    # there will be as many files as query terms
    with open(f'/claims_en/{term}_claims.json', 'w') as json_file:
        json.dump(all_claims_per_term, json_file)

    # for debugging
    print(len(list_of_requested_entities),"SAVED","\n\n")

#### saving the 'list_of_requested_entities' in a file

In [None]:
# the list is stored as its Python string form, e.g. "['Q1', 'Q2']"
with open("requested_entities_en.txt","w") as entities_file:
    entities_file.write(repr(list_of_requested_entities))

# Dutch

### Making a list of query terms
Exports "wikidata_query_terms_nl_all.txt"

In [None]:
# Reading all the word forms of the NL query terms.
# The imported file is the result of 'labels_in_LOD/getting_word_forms.ipynb'
# (see README in labels_in_LOD).

path_nl_wordforms = 'https://raw.githubusercontent.com/cultural-ai/LODlit/main/nl_wordforms.json'

parse = requests.get(path_nl_wordforms)
# fail with the HTTP error itself (e.g. 404) rather than with a confusing
# JSON decoding error on the error page
parse.raise_for_status()
wordforms_nl = json.loads(parse.text)

In [None]:
# MediaWiki full text search doesn't capture word forms in Dutch,
# so every Dutch word form is collected.

all_lemmas_nl = []

for initial_term, entry in wordforms_nl.items():

    # no lemmata: fall back to the initial term itself
    if entry['lemmata'] == []:
        all_lemmas_nl.append(initial_term)
        continue

    # one lemma record per part of speech; all its word forms are taken
    for lemma_record in entry['lemmata']:
        all_lemmas_nl.extend(lemma_record['wordforms'])

In [None]:
# Lowercase every collected form, then keep each form once.
# The unique forms are sorted so that the order of the query terms is the same on
# every run: the iteration order of a set of strings can change between kernel
# restarts, which makes processing the term list in slices unreliable.
all_lemmas_nl = [word.lower() for word in all_lemmas_nl]
unique_lemmas_nl = sorted(set(all_lemmas_nl))

In [None]:
# Compound terms (containing a space) are wrapped in double quotes;
# one-word terms are left without quotes.
nl_query_terms = []
for lemma in unique_lemmas_nl:
    nl_query_terms.append(f'"{lemma}"' if ' ' in lemma else lemma)

In [None]:
# number of Dutch query terms (the original run recorded 280)
len(nl_query_terms)

In [None]:
# saving the query terms to a txt file, as the string form of a Python list
with open("wikidata_query_terms_nl_all.txt", "w") as terms_file:
    terms_file.write(str(list(nl_query_terms)))

### Getting total hits for every term
Complex search that excludes results containing the words "wetenschappelijk" or "artikel".

In [None]:
# Constant request parameters of the query that returns the total hits per term;
# the search generator is used here to request the nl language.
url = "https://www.wikidata.org/w/api.php"
params = dict(
    format="json",
    action="query",
    prop="entityterms",
    wbetlanguage="nl",
    generator="search",
    gsrsearch="",         # term goes here
    gsrlimit="1",         # just 1 hit
    gsrinfo="totalhits",
)
# adjust header
user_agent = "bot getting totalhits for the Dutch search terms (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"
headers = {"user-agent": user_agent}

In [None]:
# Iterating over the list of terms and updating the 'gsrsearch' param;
# 'nl_hits' maps every query term to its total hits.
nl_hits = {}
for term in nl_query_terms:
    # excluding the words "wetenschappelijk" and "artikel" from the search results
    params["gsrsearch"] = f"{term} -wetenschappelijk -artikel"
    r = requests.get(url,params=params,headers=headers)
    # stop on an HTTP error instead of failing later on a missing JSON key
    r.raise_for_status()
    hits = r.json()['query']['searchinfo']['totalhits']
    nl_hits[term] = hits

In [None]:
# only the terms with at least one hit are queried; 74 terms were not included
terms_with_hits = list(filter(lambda query_term: nl_hits[query_term] > 0, nl_hits))

In [None]:
# saving the terms with hits to a txt file, as the string form of a Python list
with open("wikidata_query_terms_nl_with_hits.txt", "w") as terms_file:
    terms_file.write(str(list(terms_with_hits)))

### Searching nl terms and getting labels, aliases, and descriptions for every matched entity

In [None]:
# 'query' with 'search' generator: constant params
url = "https://www.wikidata.org/w/api.php"
params = {"action":"query",
          "prop":"entityterms",
          "wbetlanguage":"nl", # Dutch
          "generator":"search",
          "gsrsearch":"", # term goes here
          "gsrlimit":"max", # getting all results
          "gsroffset":"0", # offset
          "gsrinfo":"totalhits",
          "gsrsort":"incoming_links_desc", # sorting results by incoming links
          "format":"json"}
# adjust header
headers = {"user-agent":"bot getting labels aliases and descriptions of the requested pages (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"}

# paging limits: a request returns at most 500 results, and 10K results per query is the max
RESULTS_PER_REQUEST = 500
MAX_RESULTS_PER_QUERY = 10000

results = {} # dict to store the results

# the resulting file is zipped on GitHub
with open("wikidata_search_results_nl.json", 'w') as results_file:

    for term in terms_with_hits:
        # counter for offset
        gsroffset = 0
        params["gsroffset"] = gsroffset
        # excluding the words "wetenschappelijk" and "artikel" from the search results
        params["gsrsearch"] = f"{term} -wetenschappelijk -artikel"
        w = requests.get(url,params=params,headers=headers)
        wikidata_json = w.json()
        time.sleep(2) # to prevent 502

        # checking the number of hits
        hits = wikidata_json['query']['searchinfo']['totalhits']
        print("term:",term,"|","hits:",hits)

        # saving results for every term from the first query (the first loop);
        # a term that returns no pages this time gets an empty dict instead of a KeyError
        results[term] = wikidata_json['query'].get('pages', {})

        # - CONDITIONS - #

        # Number of follow-up requests: ceil(reachable hits / 500) minus the request
        # already made. Computed for every term, so that exactly 500 or exactly 10000
        # hits cannot reuse the value left over from the previous term.
        reachable_hits = min(hits, MAX_RESULTS_PER_QUERY)
        loops = max(0, -(-reachable_hits // RESULTS_PER_REQUEST) - 1)

        # no offset needed: the first response is the resulting dataset
        if loops == 0:
            print("saved")

        # - REQUEST LOOPS - #

        for i in range(loops):
            gsroffset = gsroffset + RESULTS_PER_REQUEST

            # setting the offset and sending a new request
            params["gsroffset"] = gsroffset
            w_i = requests.get(url,params=params,headers=headers)
            wikidata_json_i = w_i.json()
            time.sleep(2)

            # a response without pages is reported and skipped instead of raising
            # a KeyError and losing everything collected so far
            pages_i = wikidata_json_i.get('query', {}).get('pages')
            if pages_i is None:
                print("offset:",gsroffset,"no pages in the response, skipped")
                continue

            # saving the results
            results[term].update(pages_i)
            print("offset:",gsroffset,"saved")

    json.dump(results, results_file)

### How many (unique) entities are there in the search results?

In [None]:
# unzipping the archive with the Dutch search results (the file from GitHub)
with zipfile.ZipFile("wikidata_search_results_nl.json.zip", mode="r") as archive:
    archive.extractall(path="") # set your path

In [None]:
# Path of the unzipped results file. open("") always fails, so a default is given:
# it assumes the archive holds "wikidata_search_results_nl.json" and was extracted
# into the working directory -- adjust it if your path differs.
path_nl_results = "wikidata_search_results_nl.json"

with open(path_nl_results) as jf:
    wd_results_nl = json.load(jf)

In [None]:
# Every entity title from the results of every term (duplicates included).

all_entities_nl = [
    page['title']
    for pages_of_term in wd_results_nl.values()
    for page in pages_of_term.values()
    if 'title' in page
]
len(all_entities_nl)

In [None]:
# every entity once
unique_entities_nl = [*set(all_entities_nl)]
len(unique_entities_nl)

### How many entities were not already queried in the requests for English terms?

In [None]:
# Loading the list of requested entities; the file holds the string form of a list.

with open("requested_entities_en.txt","r") as entities_file:
    txt = entities_file.read()

# drop the enclosing brackets, split on ", " and strip the quotes around every item
raw_items_en = txt.lstrip("[").rstrip("]").split(", ")
requested_entities_en = list(map(lambda item: item.strip('"\''), raw_items_en))
len(requested_entities_en)

In [None]:
# Getting only the entities that were not queried in the English requests.
# The English entities are put in a set first: testing membership in a list for
# every Dutch entity is quadratic and slow for long lists; the result is the same.

requested_entities_en_lookup = set(requested_entities_en)
nl_to_query = [nl_entity for nl_entity in unique_entities_nl if nl_entity not in requested_entities_en_lookup]
len(nl_to_query)

In [None]:
# Reloading the Dutch query terms that returned hits.
with open("wikidata_query_terms_nl_with_hits.txt", "r") as txt_file:
    txt = txt_file.read()

if txt.lstrip().startswith("["):
    # the file holds the string form of a Python list: parse it back into a list.
    # Splitting such text on newlines would give a single item (the whole text),
    # which is not a key of the search results.
    nl_query_terms = ast.literal_eval(txt.strip())
else:
    # a file with one term per line is read as before, without empty lines
    nl_query_terms = [t for t in txt.split('\n') if t]

#### Querying the claims for Dutch entities

In [None]:
# Constant request parameters for 'wbgetentities'.
url = "https://www.wikidata.org/w/api.php"
params = dict(
    action="wbgetentities",
    ids="",              # string of entities (max=50) goes here
    props="claims",
    languages="nl",      # Dutch
    format="json",
)
# adjust header
user_agent = "bot getting claims of the requested entities (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)"
headers = {"user-agent": user_agent}

In [None]:
# entities requested so far in the Dutch queries; run this cell once before querying
list_of_requested_entities_nl = list()

In [None]:
# Requesting the claims of the entities found for every Dutch term, max 50 entities
# per request, and saving the claims of each term in a separate JSON file.

# set copies for fast membership tests; the lists themselves keep their order
nl_to_query_lookup = set(nl_to_query)
already_requested_nl = set(list_of_requested_entities_nl)

for term in nl_query_terms[:]: # use slice to prevent 502 (~50 terms at a time)

    # entities per term
    list_of_entities = [v['title'] for v in wd_results_nl[term].values() if 'title' in v]

    # take only the entities that were queried neither in the English queries
    # nor earlier in the Dutch queries
    to_request = []

    for q in list_of_entities:
        if q in nl_to_query_lookup and q not in already_requested_nl:
            already_requested_nl.add(q)
            list_of_requested_entities_nl.append(q) # remembering requested entities to prevent duplicates
            to_request.append(q)

    params["ids"] = ""
    all_claims_per_term = {}

    # for debugging
    print("term:",term,"|","entities:",len(to_request))

    # - CONDITIONS - #

    # number of requests = ceil(len / 50), at least 1; computed for every term, so that
    # a list of exactly 50 entities cannot reuse the value left over from the previous term
    loops = max(1, -(-len(to_request) // 50))

    # for debugging
    print("loops:",loops)

    # - REQUEST LOOPS - #

    query_result_list = []

    for i in range(loops):
        # the i-th slice of 50 entities
        batch = to_request[i * 50:(i + 1) * 50]

        # nothing to request for this term: a request with empty 'ids' returns no entities anyway
        if not batch:
            continue

        # updating params: putting Qs in one string
        params["ids"] = "|".join(batch)

        # sending a request
        d = requests.get(url,params=params,headers=headers)
        print(i+1,d.ok)
        claims = d.json() # claims per request
        time.sleep(2)

        if 'entities' in claims:
            query_result_list.append(claims['entities']) # saving all claims

    # for debugging
    print("actual_results:",len(query_result_list),[len(batch_claims) for batch_claims in query_result_list])

    # - SAVING RESULTS - #

    all_claims_per_term['entities'] = query_result_list

    # set your path
    # saving all the claims per term in a separate file
    # there will be as many files as query terms
    with open(f'/claims_nl/{term}_claims.json', 'w') as json_file:
        json.dump(all_claims_per_term, json_file)

    # for debugging
    print(len(list_of_requested_entities_nl),"SAVED","\n\n")

print("COMPLETED")

#### saving the 'list_of_requested_entities_nl' in a file

In [None]:
# the list is stored as its Python string form, e.g. "['Q1', 'Q2']"
with open("requested_entities_nl.txt","w") as entities_file:
    entities_file.write(repr(list_of_requested_entities_nl))