In [None]:
def get_value(value):
    """Return the simplified payload stored under ``value['datavalue']``.

    Args:
        value: A snak-like mapping that may contain a ``'datavalue'`` entry.

    Returns:
        ``None`` when ``'datavalue'`` is absent. Otherwise the datavalue,
        successively unwrapped through the keys ``'id'``, ``'time'`` and
        ``'amount'`` (in that order) for as long as the current value is a
        dict containing the key. Anything else is returned unchanged.
    """
    if 'datavalue' not in value:
        return None

    extracted = value['datavalue']
    # Each step looks at the result of the previous one, so a nested
    # {'id': {'time': ...}} is unwrapped twice.
    for field in ('id', 'time', 'amount'):
        if isinstance(extracted, dict) and field in extracted:
            extracted = extracted[field]
    return extracted

def _iter_snak_pairs(snaks):
    """Yield ``(property_id, snak)`` pairs from a reference/qualifier group.

    Two shapes are accepted:

    * a dict mapping property id -> list of snaks, and
    * a bare list of snaks, where each snak carries its own ``'property'``
      key (the shape left behind when a single-key dict has been collapsed
      to its only value).
    """
    if isinstance(snaks, dict):
        for prop_id, prop_snaks in snaks.items():
            for snak in prop_snaks:
                yield prop_id, snak
    else:
        for snak in snaks:
            yield snak['property'], snak


def get_triplets(data):
    """Flatten an entity's claims into a list of property/value records.

    Args:
        data: Mapping with a ``'claims'`` dict of property id -> list of
            statements. Each statement has a ``'mainsnak'`` and optionally
            ``'references'`` (a list of snak groups) and ``'qualifiers'``
            (one snak group). A snak group may be a dict keyed by property
            id or a bare list of snaks.

    Returns:
        One dict per statement with the keys ``'Property'``, ``'Value'``,
        ``'references'`` and ``'qualifiers'``; the last two are lists of
        ``{'Property': ..., 'Value': ...}`` dicts (empty when absent).
        Values are extracted with ``get_value``. The input is not modified.
    """
    triplet_data = []
    for pid, statements in data['claims'].items():
        for statement in statements:
            triplet = {"Property": pid, "Value": get_value(statement['mainsnak'])}

            # All references of a statement are merged into one flat list.
            triplet['references'] = [
                {"Property": ref_id, "Value": get_value(ref_snak)}
                for reference in statement.get('references', [])
                for ref_id, ref_snak in _iter_snak_pairs(reference)
            ]

            triplet['qualifiers'] = [
                {"Property": qual_id, "Value": get_value(qual_snak)}
                for qual_id, qual_snak in _iter_snak_pairs(statement.get('qualifiers', []))
            ]

            triplet_data.append(triplet)
    return triplet_data

In [None]:
from tqdm import tqdm
import requests

def remove_keys(data, keys_to_remove):
    """Return a copy of ``data`` with the given dict keys stripped at every depth.

    Args:
        data: Arbitrary nesting of dicts and lists; any other value is
            returned as-is.
        keys_to_remove: Container of keys to drop from every dict encountered.

    Returns:
        New dicts/lists mirroring ``data`` without the listed keys. A dropped
        key's value is not descended into.
    """
    if isinstance(data, dict):
        pruned = {}
        for key, value in data.items():
            if key in keys_to_remove:
                continue
            pruned[key] = remove_keys(value, keys_to_remove)
        return pruned

    if isinstance(data, list):
        return [remove_keys(element, keys_to_remove) for element in data]

    # Scalars, tuples and other objects pass through untouched.
    return data

def clean_datavalue(data):
    """Recursively replace every single-key dict with its (cleaned) only value.

    Args:
        data: Arbitrary nesting of dicts and lists; other values pass through.

    Returns:
        A new structure in which dicts with exactly one key have been
        collapsed to that key's value, applied at every depth. Dicts with
        zero or several keys keep their keys; lists keep their length.
    """
    if isinstance(data, list):
        return [clean_datavalue(element) for element in data]

    if not isinstance(data, dict):
        return data

    if len(data) == 1:
        # Unwrap the lone value, then keep collapsing it.
        (only_value,) = data.values()
        return clean_datavalue(only_value)

    return {key: clean_datavalue(value) for key, value in data.items()}

def get_labels(qid, timeout=30):
    """Fetch the ``labels`` mapping of one Wikidata entity via ``wbgetentities``.

    Best-effort: on failure the id and whatever diagnostic is available are
    printed and ``None`` is returned, so callers keep running.

    Args:
        qid: Entity id interpolated into the ``ids`` query parameter and used
            to index the response (e.g. ``'Q42'``).
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook indefinitely.

    Returns:
        ``response['entities'][qid]['labels']`` on success, otherwise ``None``.
    """
    response = None
    try:
        response = requests.get(
            f"https://www.wikidata.org/w/api.php?action=wbgetentities&ids={qid}&format=json",
            timeout=timeout,
        )
        return response.json()['entities'][qid]['labels']
    except (requests.RequestException, ValueError, KeyError) as error:
        # RequestException: network/HTTP-layer failure; ValueError: body is
        # not JSON; KeyError: JSON lacks entities/<qid>/labels.
        print(qid)
        # The request itself may have failed, in which case there is no
        # response body to show - print the exception instead.
        print(response.text if response is not None else error)
        return None

def add_labels(data):
    """Recursively enrich a cleaned claims structure with entity labels.

    For every dict encountered:

    * a ``'property'`` key adds ``'property-labels'``;
    * a ``'unit'`` key other than ``'1'`` adds ``'unit-labels'``, looked up
      by the last ``/``-separated segment of the unit;
    * when ``'datatype'`` is ``'wikibase-item'`` or ``'wikibase-property'``
      and ``'datavalue'`` is present, the datavalue is replaced by
      ``{'id': <datavalue>, 'labels': ...}``.

    Labels come from ``get_labels`` (one call per lookup). All values,
    including the freshly added ones, are then processed recursively.

    Args:
        data: Arbitrary nesting of dicts and lists; other values pass through.

    Returns:
        A new structure; the input is never modified.
    """
    if isinstance(data, list):
        return [add_labels(item) for item in data]
    if not isinstance(data, dict):
        return data

    # Always work on a shallow copy so that replacing 'datavalue' below
    # cannot write through to the caller's dict.
    labelled = dict(data)

    if 'property' in labelled:
        labelled['property-labels'] = get_labels(labelled['property'])

    if 'unit' in labelled and labelled['unit'] != '1':
        labelled['unit-labels'] = get_labels(labelled['unit'].split('/')[-1])

    if 'datavalue' in labelled and labelled.get('datatype') in ('wikibase-item', 'wikibase-property'):
        labelled['datavalue'] = {
            'id': labelled['datavalue'],
            'labels': get_labels(labelled['datavalue']),
        }

    return {key: add_labels(value) for key, value in labelled.items()}

import requests

# Download a fixed sample of entities and reduce each one to the fields used
# by the later cells. `data` ends up with one cleaned dict per entity.
data = []
for QID in tqdm(['Q2', 'Q42', 'Q90', 'Q5588', 'Q95']):
    # One HTTP request per entity against the Special:EntityData JSON endpoint.
    r = requests.get(f"https://www.wikidata.org/wiki/Special:EntityData/{QID}.json")
    entity = r.json()['entities'][QID]

    # Claim clean-up runs in four passes; the order matters:
    # 1. drop bookkeeping keys at every depth,
    # 2. collapse dicts that are left with a single key to that key's value,
    # 3. drop the remaining 'id' keys,
    # 4. attach labels (add_labels looks them up through get_labels, which
    #    issues an HTTP request per lookup).
    # NOTE(review): 'id' is presumably removed only after step 2 so that
    # single-key {'id': ...} values have already been collapsed to the bare id
    # and survive — confirm.
    clean_claims = remove_keys(entity['claims'], ['hash', 'snaktype', 'type', 'entity-type', 'numeric-id', 'qualifiers-order', 'snaks-order'])
    clean_claims = clean_datavalue(clean_claims)
    clean_claims = remove_keys(clean_claims, ['id'])
    clean_claims = add_labels(clean_claims)

    # Keep only the top-level fields read later; sitelinks lose their 'badges'.
    clean_entity = {
        'id': entity['id'],
        'labels': entity['labels'],
        'descriptions': entity['descriptions'],
        'aliases': entity['aliases'],
        'sitelinks': remove_keys(entity['sitelinks'], ['badges']),
        'claims': clean_claims
    }
    data.append(clean_entity)

In [None]:
from wikidataEmbed import WikidataTextifier
import pandas as pd

# Build one flat record per entity for a single language.
# NOTE(review): `WikidataEntity` is used below, but no visible cell imports it
# (only `WikidataTextifier` is imported) — confirm its module and add the import.
language = 'de'

# Start from an empty list: `data2` was never initialised in the visible cells
# (NameError on a fresh kernel), and re-running the cell would otherwise append
# duplicate rows.
data2 = []
for d in data:
    textifier = WikidataTextifier(language=language)
    qid = d['id']
    entity = WikidataEntity.get_entity(qid)
    content = textifier.entity_to_text(entity)
    triplets = get_triplets(d)
    data2.append({
        'id': qid,
        'content': content,
        'triplets': triplets,
        'language': language,
        # Prefer the requested language, then the 'mul' entry, else an empty string.
        'label': d['labels'].get(language, d['labels'].get('mul', {'value': ''}))['value'],
        'description': d['descriptions'].get(language, d['descriptions'].get('mul', {'value': ''}))['value'],
        # Language-specific aliases first, followed by the 'mul' ones.
        'aliases': d['aliases'].get(language, []) + d['aliases'].get('mul', []),
        'sitelinks': d['sitelinks']
    })

In [None]:
import json
import pandas as pd

# Write the per-entity data as JSON. json.dump needs a file opened for text
# writing ("rb" is read-only binary and fails if the file does not exist yet);
# the context manager closes the handle so the content is flushed to disk.
with open("wikidata_sample_perentity.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file)

# `data2` is filled with list-style `.append(record)`, and a list has no
# `.to_excel`; wrap it in a DataFrame first (this also accepts a DataFrame).
pd.DataFrame(data2).to_excel('wikidata_sample_perlang.xlsx')