In [1]:
import json
import csv
import pandas as pd
import requests
import io
import time

In [50]:
# semicolon-separated file, decoded with the "latin" (Latin-1) codec
wiki_csv = pd.read_csv(
    'wikihyper.csv',
    delimiter=';',
    encoding="latin",
)

In [53]:
# keep only the text after the last '/' of every value in the 'wikidata' column
qids_depicted = [
    entity_url.rsplit('/', 1)[-1]
    for entity_url in wiki_csv['wikidata']
]

In [68]:
# dict.fromkeys de-duplicates while keeping first-seen order, so the list (and
# the request batches later built from it) is identical on every run;
# list(set(...)) orders strings differently from one kernel start to the next
qids_depicted_unique = list(dict.fromkeys(qids_depicted))

In [69]:
# number of distinct entity ids that will be requested
len(qids_depicted_unique)

2691

In [17]:
# HTTP headers sent with every API request; only the user-agent is set
headers = {
    "user-agent": "bot getting labels of the requested entities (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)",
}

In [74]:
def request_labels_of_property_values(qids:list, headers:dict, path='', index=1) -> list:
    """
    Getting labels and aliases of entities in EN and NL.

    Entities are requested from the 'wbgetentities' API action in batches of
    at most 50 ids. After every successful batch everything collected so far
    is written to '{path}entity_labels.json', so the file on disk always holds
    the labels fetched up to that point.

    qids: list of entities (str) to work on;
    headers: HTTP headers of the request (used to define "user-agent");
    path: optional; str path where to save the json file (including '/');
    index: optional; kept for backward compatibility, currently unused
        (the output file name does not contain an index);

    Saves results in json.
    Returns a list of entities whose batch could not be fetched: the request
    raised an error, the reply was not valid JSON, or the reply had no
    'entities' key (empty if everything was successfully requested).
    """

    # 'wbgetentities' constant params
    url = "https://www.wikidata.org/w/api.php"
    params = {"action":"wbgetentities",
              "ids":"", # filled in per batch below
              "props":"labels|aliases",
              "languages":"en|nl", # requesting 2 languages at the same time
              "format":"json"}

    batch_size = 50       # max number of entities per request
    pause_seconds = 3     # wait between requests to prevent server errors
    timeout_seconds = 60  # never let a single stalled request block the run

    results = {}
    failed_entities = []

    # stepping by batch_size covers a trailing partial batch and makes an
    # empty `qids` a no-op
    for start in range(0, len(qids), batch_size):
        # keep the slice itself so the failure branch reports exactly the ids
        # that were sent
        batch = qids[start:start + batch_size]
        params["ids"] = "|".join(str(q) for q in batch)

        try:
            response = requests.get(url, params=params, headers=headers,
                                    timeout=timeout_seconds)
            labels = response.json()
        except (requests.RequestException, ValueError):
            # connection problem, timeout or non-JSON body: the batch failed
            labels = None

        if isinstance(labels, dict) and 'entities' in labels:

            for entity, entity_data in labels['entities'].items():
                results[entity] = {'labels': entity_data.get('labels', {}),
                                   'aliases': entity_data.get('aliases', {})}

            # - SAVING RESULTS - # (rewritten after every successful batch)
            with open(f'{path}entity_labels.json', 'w') as json_file:
                json.dump(results, json_file)

        else:
            failed_entities.extend(batch)

        # pause after every request, including failed ones
        time.sleep(pause_seconds)

    return failed_entities

In [76]:
# run the download; the return value (failed entities, per the function's
# docstring) is displayed as the cell output
request_labels_of_property_values(qids=qids_depicted_unique, headers=headers)

[]