# How to read and process the Wikidata dump file.

In [None]:
import sys
sys.path.append('../src')

from wikidataDumpReader import WikidataDumpReader
from multiprocessing import Manager

In [None]:
# Location of the bz2-compressed Wikidata JSON dump given to WikidataDumpReader.
FILEPATH = '../data/Wikidata/latest-all.json.bz2'
# Forwarded to WikidataDumpReader as `queue_size`.
# NOTE(review): presumably the capacity of the reader's internal queue - confirm in wikidataDumpReader.
QUEUE_SIZE = 15000
# Forwarded to WikidataDumpReader as `num_processes`.
# NOTE(review): presumably the number of worker processes - confirm in wikidataDumpReader.
NUM_PROCESSES = 4
# Forwarded to WikidataDumpReader as `skiplines`.
# NOTE(review): presumably the number of leading dump lines to skip (0 = none) - confirm in wikidataDumpReader.
SKIPLINES = 0

In [None]:
# All counters below are dict proxies created by one multiprocessing Manager
# and are filled in by calculate_stats.
multiprocess_manager = Manager()

wikipedialang_counts = multiprocess_manager.dict() # Per language, count the items that are connected to the Wikipedia page of the language; the extra key 'total' counts items with at least one such page

wikidatalang_counts = multiprocess_manager.dict() # Per language, count the items that have both a label and a description in the language
wikidatalang_counts_wikionly = multiprocess_manager.dict() # Same as wikidatalang_counts but for items that are connected to a Wikipedia page

wikidata_wikipedia_lang_counts = multiprocess_manager.dict() # Per language, count the items that have a Wikipedia page AND a label and description in that same language

claim_counts = multiprocess_manager.dict() # Per property ID, sum of the per-item counts returned by get_claims_pids
claim_counts_wikionly = multiprocess_manager.dict() # Same as claim_counts but for items that are connected to a Wikipedia page

instanceof_counts = multiprocess_manager.dict() # Per QID used as an "instance of" (P31) value, count the items that carry it
instanceof_counts_wikionly = multiprocess_manager.dict() # Same as instanceof_counts but for items that are connected to a Wikipedia page

item_type_count = multiprocess_manager.dict() # Per first character of the entity ID, count the entities (number of QIDs vs PIDs vs LIDs...)

In [None]:
def get_wikipedia_lang(entity):
    """
    Return the languages of all Wikipedia pages connected to the Wikidata entity.

    A sitelink key is considered when it ends with 'wiki'; the language code
    is that key with only the trailing 'wiki' removed (e.g. 'enwiki' -> 'en').
    Keys such as 'enwikiquote' do not end with 'wiki' and are ignored.

    :param entity: dict for one Wikidata entity; 'sitelinks' is optional.
    :return: set of language-code strings (empty when there are no sitelinks).

    NOTE(review): any key ending in 'wiki' is accepted, so site keys that are
    not language editions of Wikipedia would be returned too - confirm
    whether those should be filtered out.
    """
    suffix = 'wiki'
    langs = set()
    for site in entity.get('sitelinks', {}):
        if site.endswith(suffix):
            # Slice off the suffix instead of splitting on 'wiki': splitting
            # cuts at the first occurrence, which truncates keys that contain
            # 'wiki' more than once (e.g. 'mediawikiwiki').
            langs.add(site[:-len(suffix)])
    return langs

def get_wikidata_lang(entity):
    """
    Return the set of language codes for which this Wikidata entity has
    both a label and a description.

    Missing 'labels' or 'descriptions' entries are treated as empty.
    """
    labels = entity.get('labels', {})
    descriptions = entity.get('descriptions', {})
    # Key views support set algebra directly; `&` keeps the shared languages.
    return set(labels.keys() & descriptions.keys())

def get_claims_pids(entity):
    """
    Count the property IDs used by the Wikidata entity.

    Returns a dict mapping property ID -> count, where a property gets
    +1 for appearing as a key of the entity's 'claims' (once, regardless of
    how many statements it has) and +1 for every statement in which it
    appears as a qualifier key.
    """
    tally = {}

    def bump(prop_id):
        tally[prop_id] = tally.get(prop_id, 0) + 1

    for claim_pid, statements in entity.get('claims', {}).items():
        bump(claim_pid)

        for statement in statements:
            if 'qualifiers' not in statement:
                continue
            for qualifier_pid, _snaks in statement['qualifiers'].items():
                bump(qualifier_pid)
    return tally

def get_instanceof(entity):
    """
    Return the set of QIDs that appear as values of the entity's P31
    ("instance of") statements.

    Only statements whose main snak has a 'datavalue' and whose 'datatype'
    is 'wikibase-item' contribute.
    """
    claims = entity.get('claims', {})
    if 'P31' not in claims:
        return set()

    main_snaks = (
        statement['mainsnak']
        for statement in claims['P31']
        if 'mainsnak' in statement
    )
    return {
        snak['datavalue']['value']['id']
        for snak in main_snaks
        if 'datavalue' in snak and snak.get('datatype', '') == 'wikibase-item'
    }

In [None]:
def calculate_stats(item, item_type_count, wikipedialang_counts, wikidatalang_counts, wikidatalang_counts_wikionly, claim_counts, claim_counts_wikionly, instanceof_counts, instanceof_counts_wikionly, wikidata_wikipedia_lang_counts):
    """
    Fold one Wikidata entity into the running statistics.

    `item` is one entity dict (or None, in which case nothing happens). Every
    other argument is a dict-like counter that is updated in place:

    - item_type_count: keyed by the first character of the entity ID.
    - wikipedialang_counts: keyed by Wikipedia language, plus the key
      'total' for entities with at least one Wikipedia language.
    - wikidatalang_counts: keyed by language having label and description.
    - claim_counts: keyed by property ID, incremented by the per-entity
      count from get_claims_pids.
    - instanceof_counts: keyed by "instance of" QID.
    - *_wikionly: same as their counterparts, but only updated for
      entities with at least one Wikipedia language.
    - wikidata_wikipedia_lang_counts: keyed by languages present in both
      the Wikipedia and the Wikidata language sets.

    Only entities whose ID starts with 'Q' contribute to anything other
    than item_type_count. Returns None.

    NOTE(review): each increment is a separate read followed by a write; if
    the counters are shared between concurrently running callers, confirm
    that updates cannot interleave and get lost.
    """
    if item is None:
        return

    def bump(counter, key, amount=1):
        counter[key] = counter.get(key, 0) + amount

    entity_kind = item['id'][0]
    wiki_langs = get_wikipedia_lang(item)
    data_langs = get_wikidata_lang(item)
    pid_tally = get_claims_pids(item)
    classes = get_instanceof(item)

    bump(item_type_count, entity_kind)

    if entity_kind != 'Q':
        return

    # Statistics over every Q-item.
    for lang in wiki_langs:
        bump(wikipedialang_counts, lang)

    for lang in data_langs:
        bump(wikidatalang_counts, lang)

    for pid, occurrences in pid_tally.items():
        bump(claim_counts, pid, occurrences)

    for qid in classes:
        bump(instanceof_counts, qid)

    # Statistics restricted to Q-items linked to at least one Wikipedia.
    if wiki_langs:
        bump(wikipedialang_counts, 'total')

        for lang in data_langs:
            bump(wikidatalang_counts_wikionly, lang)

        for pid, occurrences in pid_tally.items():
            bump(claim_counts_wikionly, pid, occurrences)

        for qid in classes:
            bump(instanceof_counts_wikionly, qid)

    for lang in data_langs.intersection(wiki_langs):
        bump(wikidata_wikipedia_lang_counts, lang)


# The shared counters, in the positional order calculate_stats expects
# them after `item`.
stat_dicts = (
    item_type_count,
    wikipedialang_counts,
    wikidatalang_counts,
    wikidatalang_counts_wikionly,
    claim_counts,
    claim_counts_wikionly,
    instanceof_counts,
    instanceof_counts_wikionly,
    wikidata_wikipedia_lang_counts,
)

wikidata = WikidataDumpReader(
    FILEPATH,
    num_processes=NUM_PROCESSES,
    queue_size=QUEUE_SIZE,
    skiplines=SKIPLINES,
)
# NOTE(review): max_iterations=1000 presumably caps how much of the dump is
# processed - confirm in wikidataDumpReader before relying on full-dump totals.
wikidata.run(
    lambda item: calculate_stats(item, *stat_dicts),
    max_iterations=1000,
    verbose=True,
)