In [None]:
import json
import zipfile

import requests

## Simplifying entities' claims
The claims of the retrieved entities contain a lot of extra information that is unnecessary for this research. They were simplified for more efficient processing: the simplified claims include only the Qids and labels of the values of the P31 and P279 properties for every entity.

## EN

### Importing files

In [None]:
# entities from the clean results (EN)
# The zipped search results are extracted first and then loaded into `wd_en`.

with zipfile.ZipFile('wikidata_search_results_en_clean.json.zip','r') as unzip:
    unzip.extractall('') # set your path (~47 MB)

# set your path (it has to point to the file extracted above)
# The encoding is given explicitly: without it open() falls back to the
# platform default, which is not UTF-8 everywhere.
with open('/wikidata_search_results_en_clean.json','r',encoding='utf-8') as json_file:
    wd_en = json.load(json_file)

In [None]:
# claims (EN)
# set your path to the merged json file from the previous step

# The encoding is given explicitly: without it open() falls back to the
# platform default, which is not UTF-8 everywhere.
with open('/claims_en/merged_claims_en.json','r',encoding='utf-8') as json_file:
    claims_en = json.load(json_file)

### Collecting P31 and P279 for every EN entity from the local files

In [None]:
# Flat list with the 'title' of every search hit of every target word.
# (The titles are compared with the entity ids of the claims in the next step,
# so they are presumably Qids.)
list_of_all_entities_en = [
    hit['title']
    for hits_per_word in wd_en.values()
    for hit in hits_per_word.values()
]

In [None]:
# Simplify the raw EN claims: for every entity that occurs in the clean search
# results keep only the ids of the values of P31 and P279.
# Resulting shape: {entity_id: {'P31': {value_id: ""}, 'P279': {value_id: ""}}}
# The empty strings are placeholders that are replaced by labels in a later step.

# Membership tests against a set are O(1); testing against the list itself
# rescanned the whole list for every entity.
# (assumes the collected titles are hashable, i.e. plain strings)
known_entities_en = set(list_of_all_entities_en)

claims_dict = {}

for target_word, claims_list in claims_en.items():
    for claims_per_entity in claims_list:
        for e_id, claims in claims_per_entity.items():

            # skip entities outside the clean results and entries without claims
            if e_id not in known_entities_en or 'claims' not in claims:
                continue

            subdict = {'P31': {}, 'P279': {}}

            for statements in claims['claims'].values():
                for statement in statements:
                    mainsnak = statement['mainsnak']

                    # statements without a datavalue have no value id to collect
                    if 'datavalue' not in mainsnak:
                        continue

                    # only P31 and P279 are kept, every other property is dropped
                    if mainsnak['property'] in subdict:
                        subdict[mainsnak['property']][mainsnak['datavalue']['value']['id']] = ""

            claims_dict[e_id] = subdict

In [None]:
# Collect every P31 / P279 value id of every entity in one flat list
# (per entity: first the P31 values, then the P279 values; duplicates are kept).

all_values_en = [
    value_id
    for simplified in claims_dict.values()
    for prop in ('P31', 'P279')
    for value_id in simplified[prop]
]

In [None]:
# only unique values
# The ids are sorted so that the list (and therefore the request batches built
# from it) is identical on every run; the order of a plain set is not.

all_values_en_unique = sorted(set(all_values_en))

### Getting the labels of values of P31 and P279 from Wikidata
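Additional requests to the Wikidata API are needed, because the simplified claims collected above contain only the Qids of the values, not their labels.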

In [None]:
# Constant settings of the 'wbgetentities' requests (EN labels)
url = "https://www.wikidata.org/w/api.php"

params = dict(
    action="wbgetentities",
    ids="",  # string of entities (max=50) goes here
    props="labels",
    languages="en",
    format="json",
)

# set your header
headers = {
    "user-agent": "bot getting labels of the requested entities (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)",
}

In [None]:
# Request the EN labels of all P31 / P279 values from Wikidata in batches.
# Result: values_dict = {value_id: EN label}; ids without an EN label are left out.

BATCH_SIZE = 50  # ids per request (see the 'ids' note in params: max=50)

values_dict = {}

# Stepping over the real start offsets never produces an empty batch.
# The previous `len // 50 + 1` loop count sent one request with an empty 'ids'
# string whenever the number of ids was 0 or an exact multiple of 50, and the
# lookup of 'entities' in that response then failed.
for start in range(0, len(all_values_en_unique), BATCH_SIZE):
    batch = all_values_en_unique[start:start + BATCH_SIZE]

    params["ids"] = "|".join(batch)
    d = requests.get(url, params=params, headers=headers, timeout=60)
    d.raise_for_status()  # stop on HTTP errors instead of parsing an error page
    labels = d.json() # labels per request

    if 'entities' not in labels:
        # the API answered, but not with entities (presumably an error payload)
        raise RuntimeError(
            f"wbgetentities returned no entities for the batch starting at {start}: {labels.get('error')}"
        )

    for k, l in labels['entities'].items():
        if 'labels' in l and 'en' in l['labels']:
            values_dict[k] = l['labels']['en']['value']

In [None]:
# Replace the "" placeholder of every P31 / P279 value id with its label.
# Ids that are missing from values_dict keep the placeholder.

for simplified in claims_dict.values():
    for prop in ('P31', 'P279'):
        prop_values = simplified[prop]
        for value_id in prop_values:
            if value_id in values_dict:
                prop_values[value_id] = values_dict[value_id]

In [None]:
# Write the simplified EN claims to a JSON file
# this file is zipped on GitHub

with open('/simple_claims_en.json','w') as out_file:
    json.dump(claims_dict, out_file)

## NL

### Importing files

In [None]:
# entities from the clean results NL
# The zipped search results are extracted first and then loaded into `wd_nl`.

with zipfile.ZipFile('wikidata_search_results_nl_clean.json.zip','r') as unzip:
    unzip.extractall('') # set your path (~28,7 MB)

# set your path (it has to point to the file extracted above)
# The encoding is given explicitly: without it open() falls back to the
# platform default, which is not UTF-8 everywhere.
with open('/wikidata_search_results_nl_clean.json','r',encoding='utf-8') as json_file:
    wd_nl = json.load(json_file)

In [None]:
# claims NL
# set your path to the merged json file from the previous step

# The encoding is given explicitly: without it open() falls back to the
# platform default, which is not UTF-8 everywhere.
with open('/claims_nl/merged_claims_nl.json','r',encoding='utf-8') as json_file:
    claims_nl = json.load(json_file)

### Collecting P31 and P279 for every NL entity from the local files

In [None]:
# Flat list with the 'title' of every NL search hit of every target word.
# (The titles are compared with the entity ids of the claims in the next step,
# so they are presumably Qids.)
list_of_all_entities_nl = [
    hit['title']
    for hits_per_word in wd_nl.values()
    for hit in hits_per_word.values()
]

In [None]:
# Simplify the raw NL claims: for every entity that occurs in the clean search
# results keep only the ids of the values of P31 and P279.
# Resulting shape: {entity_id: {'P31': {value_id: ""}, 'P279': {value_id: ""}}}
# The empty strings are placeholders that are replaced by labels in a later step.

# Membership tests against a set are O(1); testing against the list itself
# rescanned the whole list for every entity.
# (assumes the collected titles are hashable, i.e. plain strings)
known_entities_nl = set(list_of_all_entities_nl)

claims_dict_nl = {}

for target_word, claims_list in claims_nl.items():
    for claims_per_entity in claims_list:
        for e_id, claims in claims_per_entity.items():

            # skip entities outside the clean results and entries without claims
            if e_id not in known_entities_nl or 'claims' not in claims:
                continue

            subdict = {'P31': {}, 'P279': {}}

            for statements in claims['claims'].values():
                for statement in statements:
                    mainsnak = statement['mainsnak']

                    # statements without a datavalue have no value id to collect
                    if 'datavalue' not in mainsnak:
                        continue

                    # only P31 and P279 are kept, every other property is dropped
                    if mainsnak['property'] in subdict:
                        subdict[mainsnak['property']][mainsnak['datavalue']['value']['id']] = ""

            claims_dict_nl[e_id] = subdict

In [None]:
# Collect every P31 / P279 value id of every NL entity in one flat list
# (per entity: first the P31 values, then the P279 values; duplicates are kept).

all_values_nl = [
    value_id
    for simplified in claims_dict_nl.values()
    for prop in ('P31', 'P279')
    for value_id in simplified[prop]
]

In [None]:
# only unique values
# The ids are sorted so that the list (and therefore the request batches built
# from it) is identical on every run; the order of a plain set is not.

all_values_nl_unique = sorted(set(all_values_nl))

### Getting the labels of values of P31 and P279 from Wikidata

In [None]:
# Constant settings of the 'wbgetentities' requests (NL labels)
url = "https://www.wikidata.org/w/api.php"

params = dict(
    action="wbgetentities",
    ids="",  # string of entities (max=50) goes here
    props="labels",
    languages="nl",  # Dutch
    format="json",
)

# set your header
headers = {
    "user-agent": "bot getting labels of the requested entities (CWI; Human-Centered Data Analytics; nesterov@cwi.nl)",
}

In [None]:
# Request the NL labels of all P31 / P279 values from Wikidata in batches.
# Result: values_dict = {value_id: NL label}; ids without an NL label are left out.

BATCH_SIZE = 50  # ids per request (see the 'ids' note in params: max=50)

values_dict = {}

# Stepping over the real start offsets never produces an empty batch.
# The previous `len // 50 + 1` loop count sent one request with an empty 'ids'
# string whenever the number of ids was 0 or an exact multiple of 50, and the
# lookup of 'entities' in that response then failed.
for start in range(0, len(all_values_nl_unique), BATCH_SIZE):
    batch = all_values_nl_unique[start:start + BATCH_SIZE]

    params["ids"] = "|".join(batch)
    d = requests.get(url, params=params, headers=headers, timeout=60)
    d.raise_for_status()  # stop on HTTP errors instead of parsing an error page
    labels = d.json() # labels per request

    if 'entities' not in labels:
        # the API answered, but not with entities (presumably an error payload)
        raise RuntimeError(
            f"wbgetentities returned no entities for the batch starting at {start}: {labels.get('error')}"
        )

    for k, l in labels['entities'].items():
        if 'labels' in l and 'nl' in l['labels']:
            values_dict[k] = l['labels']['nl']['value']
        

In [None]:
# Replace the "" placeholder of every P31 / P279 value id with its label.
# Ids that are missing from values_dict keep the placeholder.

for simplified in claims_dict_nl.values():
    for prop in ('P31', 'P279'):
        prop_values = simplified[prop]
        for value_id in prop_values:
            if value_id in values_dict:
                prop_values[value_id] = values_dict[value_id]

In [None]:
# exporting simplified claims NL
# (writes claims_dict_nl, i.e. the NL claims with the labels filled in above)
# this file is zipped on GitHub

with open('/simple_claims_nl.json','w') as json_write:
    json.dump(claims_dict_nl,json_write)