In [2]:
import sys
sys.path.append('../src')

from tqdm import tqdm
import requests
from wikidataEmbed import WikidataTextifier
from wikidataDB import WikidataEntity
import pandas as pd
import json
import re

### Extract and clean up entities from the Wikidata API

In [None]:
def remove_keys(data, keys_to_remove):
    """Recursively drop unwanted keys from nested dictionaries and lists.

    Dictionaries are rebuilt without any key listed in ``keys_to_remove``;
    lists are rebuilt element by element. Anything else is returned as-is.
    The input structure is not modified.

    Args:
        data (dict | list | Any): Structure to process.
        keys_to_remove (list): Keys that must not appear in the result.

    Returns:
        dict | list | Any: A cleaned-up copy of ``data``.
    """
    if isinstance(data, list):
        return [remove_keys(element, keys_to_remove) for element in data]
    if not isinstance(data, dict):
        # Scalars (and any other container type) pass through untouched.
        return data

    pruned = {}
    for name, child in data.items():
        if name in keys_to_remove:
            # The whole subtree under a removed key is discarded.
            continue
        pruned[name] = remove_keys(child, keys_to_remove)
    return pruned

def clean_datavalue(data):
    """Collapse single-key dictionaries into their value, recursively.

    A dictionary with exactly one key is replaced by the (cleaned) value of
    that key, unless the key looks like a Wikidata property or entity ID
    (``P123`` / ``Q123``), in which case the key is kept. Lists are cleaned
    element by element; other values are returned unchanged.

    Args:
        data (dict | list | Any): Structure to process.

    Returns:
        dict | list | Any: The cleaned-up structure.
    """
    if isinstance(data, list):
        return [clean_datavalue(element) for element in data]
    if not isinstance(data, dict):
        return data

    if len(data) == 1:
        (only_key, only_value), = data.items()
        if re.match(r"^[PQ]\d+$", only_key) is None:
            # Wrapper level that carries no information: unwrap it.
            return clean_datavalue(only_value)

    return {name: clean_datavalue(child) for name, child in data.items()}

def get_labels(qid):
    """Get the labels of a Wikidata property or entity from the API.

    This is a best-effort lookup: when the request fails, the response is not
    valid JSON, or the expected keys are missing, the ID and the available
    diagnostics are printed and ``None`` is returned instead of raising.

    Args:
        qid (str): QID or PID of a Wikidata property or entity.

    Returns:
        dict | None: A dictionary of labels in different languages, or
        ``None`` if the labels could not be retrieved.
    """
    # Pre-bind so the error path can tell whether a response was ever received;
    # otherwise a connection failure would raise NameError inside the handler.
    r = None
    try:
        r = requests.get(
            "https://www.wikidata.org/w/api.php",
            # Let requests build and escape the query string.
            params={"action": "wbgetentities", "ids": qid, "format": "json"},
            timeout=30,  # never hang the notebook on a stalled connection
        )
        entity = r.json()
        return entity['entities'][qid]['labels']
    except (requests.RequestException, ValueError, KeyError) as error:
        # RequestException: network/HTTP-layer failure; ValueError: body is not
        # JSON; KeyError: the payload lacks the entity or its labels.
        # KeyboardInterrupt is deliberately NOT caught so the loop can be stopped.
        print(qid)
        print(r.text if r is not None else error)
        return None

def add_labels(data):
    """Add the labels in the entity dictionary where they are missing. For example, for properties, and entities in claims...

    For every dictionary found in the (nested) structure:
      * a ``'property'`` key gets a sibling ``'property-labels'`` entry;
      * a ``'unit'`` key other than ``'1'`` gets a sibling ``'unit-labels'``
        entry, looked up by the last path segment of the unit value;
      * when ``'datatype'`` is ``'wikibase-item'`` or ``'wikibase-property'``,
        ``'datavalue'`` is replaced by ``{'id': <old value>, 'labels': ...}``.

    Labels come from ``get_labels``. The input is never modified; a new
    structure is returned.

    Args:
        data (dict | list | Any): Structure to process.

    Returns:
        dict | list | Any: The structure with the added labels.
    """
    if isinstance(data, dict):
        # Work on a shallow copy so the caller's dictionary is left untouched
        # by every branch below (including the 'datavalue' replacement).
        enriched = dict(data)

        if 'property' in data:
            enriched['property-labels'] = get_labels(data['property'])

        if ('unit' in data) and (data['unit'] != '1'):
            # The unit is looked up by whatever follows the last '/'.
            enriched['unit-labels'] = get_labels(data['unit'].split('/')[-1])

        if ('datatype' in data) and ('datavalue' in data) and (
            (data['datatype'] == 'wikibase-item') or (data['datatype'] == 'wikibase-property')
        ):
            enriched['datavalue'] = {
                'id': data['datavalue'],
                'labels': get_labels(data['datavalue'])
            }

        return {key: add_labels(value) for key, value in enriched.items()}

    if isinstance(data, list):
        return [add_labels(item) for item in data]

    return data

In [None]:
# Download a handful of sample entities and reduce each one to the fields used
# later in the notebook.
data_perentity = []
for QID in tqdm(['Q2', 'Q42', 'Q90', 'Q5588', 'Q95']):
    r = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{QID}.json", timeout=30)
    # Fail with a clear HTTP error instead of a confusing JSON/KeyError below.
    r.raise_for_status()
    entity = r.json()['entities'][QID]

    clean_claims = remove_keys(entity['claims'], ['hash', 'snaktype', 'type', 'entity-type', 'numeric-id', 'qualifiers-order', 'snaks-order'])
    clean_claims = clean_datavalue(clean_claims)
    # 'id' is removed only after clean_datavalue, so that single-key
    # {'id': ...} dictionaries have already been collapsed to the bare ID.
    clean_claims = remove_keys(clean_claims, ['id'])
    clean_claims = add_labels(clean_claims)

    clean_entity = {
        'id': entity['id'],
        'labels': entity['labels'],
        'descriptions': entity['descriptions'],
        'aliases': entity['aliases'],
        'sitelinks': remove_keys(entity['sitelinks'], ['badges']),
        'claims': clean_claims
    }
    data_perentity.append(clean_entity)

100%|██████████| 5/5 [00:04<00:00,  1.19it/s]


### Turn the JSON into a table and split the data by language

In [None]:
def get_value(value):
    """Extract the plain value stored under ``'datavalue'``.

    Starts from ``value['datavalue']`` (or ``None`` when that key is absent)
    and then, while the current value is a dictionary, unwraps the keys
    ``'id'``, ``'time'`` and ``'amount'`` in that order.

    Args:
        value (dict): Mapping that may contain a ``'datavalue'`` entry.

    Returns:
        Any: The unwrapped value, or ``None`` if there is no ``'datavalue'``.
    """
    extracted = None
    if 'datavalue' in value:
        extracted = value['datavalue']

    # Each step sees the result of the previous one, so the order matters.
    for field in ('id', 'time', 'amount'):
        if isinstance(extracted, dict) and (field in extracted):
            extracted = extracted[field]
    return extracted

def get_triplets(data):
    """Flatten an entity's claims into property/value records.

    Each statement under ``data['claims']`` yields one record holding the
    claim's property ID and the value of its main snak (via ``get_value``),
    plus the flattened ``references`` and ``qualifiers`` of that statement in
    the same Property/Value form (empty lists when absent).

    Args:
        data (dict): The extracted and cleaned-up entity.

    Returns:
        list: One dictionary per statement with the keys ``Property``,
        ``Value``, ``references`` and ``qualifiers``.
    """
    def as_pairs(snaks_by_property):
        # Turn {property ID: [snak, ...]} into a flat list of Property/Value records.
        return [
            {"Property": prop_id, "Value": get_value(snak)}
            for prop_id, snaks in snaks_by_property.items()
            for snak in snaks
        ]

    rows = []
    for pid, statements in data['claims'].items():
        for statement in statements:
            row = {"Property": pid, "Value": get_value(statement['mainsnak'])}

            # References are a list of property->snaks mappings; merge them all.
            reference_pairs = []
            for reference in statement.get('references', []):
                reference_pairs.extend(as_pairs(reference))
            row['references'] = reference_pairs

            # Qualifiers are a single property->snaks mapping.
            row['qualifiers'] = as_pairs(statement.get('qualifiers', {}))

            rows.append(row)
    return rows


In [16]:
# Build one record per entity for a single target language.
language = 'de'
# Created once instead of once per entity: it is configured by the language
# only, which does not change inside the loop.
textifier = WikidataTextifier(language=language)

data_perlang = []
for d in data_perentity:
    qid = d['id']
    entity = WikidataEntity.get_entity(qid)
    content = textifier.entity_to_text(entity)
    triplets = get_triplets(d)
    data_perlang.append({
        'id': qid,
        'content': content,
        'triplets': triplets,
        'language': language,
        # Fall back to the 'mul' entry, then to an empty string, when the
        # target language has no label/description.
        'label': d['labels'].get(language, d['labels'].get('mul', {'value': ''}))['value'],
        'description': d['descriptions'].get(language, d['descriptions'].get('mul', {'value': ''}))['value'],
        'aliases': d['aliases'].get(language, []) + d['aliases'].get('mul', []),
        'sitelinks': d['sitelinks']
    })

In [None]:
# json.dump writes text, so the file must be opened for writing in text mode;
# the context manager guarantees the handle is flushed and closed.
with open("wikidata_sample_perentity.json", "w", encoding="utf-8") as json_file:
    json.dump(data_perentity, json_file)

# data_perlang is a plain list of records; wrap it in a DataFrame, which is
# what provides to_excel.
pd.DataFrame(data_perlang).to_excel('wikidata_sample_perlang.xlsx')