In [2]:
import sys
sys.path.append('../src')

import os
os.environ["LANGUAGE"] = 'en'

from wikidata_dumpreader import WikidataDumpReader
from wikidataDB import WikidataID, WikidataEntity, Session
from sqlalchemy import select
from multiprocessing import Manager
from tqdm import tqdm
import asyncio
import gc
import os
import json

# Run-time configuration. Each value is read from the environment variable of
# the same name; the literal next to it is the fallback used when it is unset.
FILEPATH = os.environ.get("FILEPATH", '../data/Wikidata/latest-all.json.bz2')  # dump handed to WikidataDumpReader
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "10000"))  # passed as batch_size= to WikidataDumpReader
PUSH_SIZE = int(os.environ.get("PUSH_SIZE", "20000"))  # buffered rows above which a bulk insert is attempted
QUEUE_SIZE = int(os.environ.get("QUEUE_SIZE", "15000"))  # passed as queue_size= to WikidataDumpReader
NUM_PROCESSES = int(os.environ.get("NUM_PROCESSES", "4"))  # passed as num_processes= to WikidataDumpReader
SKIPLINES = int(os.environ.get("SKIPLINES", "0"))  # passed as skiplines= to WikidataDumpReader
LANGUAGE = os.environ.get("LANGUAGE", 'en')  # passed as language= to the WikidataID / WikidataEntity helpers

In [None]:
# Running totals updated by get_counts(): summed per-item language counts,
# items seen, and items with at least one counted language.
count_languages = count_entities = count_entities_in_wikipedia = 0

def get_language_count(item):
    """Count the sitelink languages of ``item`` that also have a label and a description.

    A sitelink key counts when it ends with ``'wiki'``; the language code is
    the key with that 4-character suffix removed. The code is kept only if
    ``item['labels']`` contains it or ``'mul'``, and ``item['descriptions']``
    contains it or ``'mul'``.

    Args:
        item: Mapping with an optional ``'sitelinks'`` entry; ``'labels'`` and
            ``'descriptions'`` are read only when a matching sitelink exists.

    Returns:
        int: Number of qualifying language codes (0 when there are no sitelinks).
    """
    if 'sitelinks' not in item:
        return 0

    total = 0
    for site in item['sitelinks']:
        if not site.endswith('wiki'):
            continue
        code = site[:-4]
        has_label = code in item['labels'] or 'mul' in item['labels']
        # Descriptions are consulted only after the label check succeeded.
        if has_label and (code in item['descriptions'] or 'mul' in item['descriptions']):
            total += 1
    return total

def get_counts(item, sqlitDBlock):
    """Fold one dump item into the module-level counters.

    Does nothing when ``item`` is ``None``. Otherwise the language count of the
    item is computed and, while holding ``sqlitDBlock``, the three global
    counters are updated. Every 50000 counted entities a summary line is
    printed and a garbage collection is triggered.

    Args:
        item: Item mapping accepted by ``get_language_count``, or ``None``.
        sqlitDBlock: Context-manager lock guarding the counter updates.
    """
    global count_languages, count_entities, count_entities_in_wikipedia

    if item is None:
        return

    n_langs = get_language_count(item)

    with sqlitDBlock:
        count_entities += 1
        count_languages += n_langs
        if n_langs > 0:
            count_entities_in_wikipedia += 1

        report_due = (count_entities % 50000 == 0)
        if report_due:
            print(f"Entites: {count_entities} Average Languages: {count_languages/count_entities} Average In Wiki: {count_entities_in_wikipedia/count_entities}")
            gc.collect()

async def run_processor(wikidata, sqlitDBlock):
    """Run ``wikidata`` with ``get_counts`` as the per-item handler.

    Args:
        wikidata: Object exposing an awaitable ``run(handler, max_iterations=, verbose=)``.
        sqlitDBlock: Lock forwarded to ``get_counts`` for every item.
    """
    def handle(item):
        return get_counts(item, sqlitDBlock)

    await wikidata.run(handle, max_iterations=None, verbose=True)

# Manager-backed lock; get_counts() holds it while updating the global counters.
multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()

# Reader over the dump at FILEPATH, configured from the constants defined above.
wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, batch_size=BATCH_SIZE, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)

# Top-level await: this statement only works where the front end supports it
# (e.g. an IPython/Jupyter cell); it is not valid in a plain Python script.
await run_processor(wikidata, sqlitDBlock)

Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 297.27 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 482.52 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 3915.31 MB
Items Processes: 9999 	 Line Process Avg: 791 items/sec 	 Memory Usage Avg: 3310.36 MB
Items Processes: 9999 	 Line Process Avg: 600 items/sec 	 Memory Usage Avg: 5181.00 MB
Items Processes: 19999 	 Line Process Avg: 1017 items/sec 	 Memory Usage Avg: 4777.81 MB
Items Processes: 19999 	 Line Process Avg: 794 items/sec 	 Memory Usage Avg: 6510.66 MB
Items Processes: 29999 	 Line Process Avg: 1065 items/sec 	 Memory Usage Avg: 6417.62 MB
Items Processes: 29999 	 Line Process Avg: 951 items/sec 	 Memory Usage Avg: 6332.20 MB
Items Processes: 39999 	 Line Process Avg: 1158 items/sec 	 Memory Usage Avg: 6278.12 MB
Items Processes: 49999 	 Line Process Avg: 1332 items/sec 	 Memory Usage Avg: 5548.50 MB
Items Processes: 49999 	 Line Process Avg: 1233 it

#### Reading the Wikidata dump (bz2-compressed JSON) and saving the IDs of entities and properties to SQLite (only the ones connected to the English Wikipedia)

In [None]:
def save_ids_to_sqlite(item, bulk_ids, sqlitDBlock):
    """Buffer the IDs extracted from ``item`` and push a full buffer slice to the database.

    Items that are ``None`` or for which ``WikidataID.is_in_wikipedia`` is
    falsy are ignored. For the rest, the IDs returned by
    ``WikidataID.extract_entity_ids`` are appended to ``bulk_ids``. Then, under
    ``sqlitDBlock``, if the buffer holds more than ``PUSH_SIZE`` entries its
    first ``PUSH_SIZE`` entries are handed to ``WikidataID.add_bulk_ids`` and
    removed from the buffer only when that call reports success.

    Args:
        item: Dump item, or ``None``.
        bulk_ids: Shared mutable list used as the pending-ID buffer.
        sqlitDBlock: Context-manager lock guarding the flush.
    """
    if item is None or not WikidataID.is_in_wikipedia(item, language=LANGUAGE):
        return

    bulk_ids.extend(WikidataID.extract_entity_ids(item, language=LANGUAGE))

    with sqlitDBlock:
        if len(bulk_ids) <= PUSH_SIZE:
            return
        head = list(bulk_ids[:PUSH_SIZE])
        if WikidataID.add_bulk_ids(head):
            # Drop only what was written; a failed push stays buffered for a later attempt.
            del bulk_ids[:PUSH_SIZE]
            gc.collect()

async def run_processor(wikidata, bulk_ids, sqlitDBlock):
    """Run ``wikidata`` with ``save_ids_to_sqlite`` as the per-item handler.

    Args:
        wikidata: Object exposing an awaitable ``run(handler, max_iterations=, verbose=)``.
        bulk_ids: Shared ID buffer forwarded to ``save_ids_to_sqlite``.
        sqlitDBlock: Lock forwarded to ``save_ids_to_sqlite``.
    """
    def handle(item):
        return save_ids_to_sqlite(item, bulk_ids, sqlitDBlock)

    await wikidata.run(handle, max_iterations=None, verbose=True)

multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
bulk_ids = multiprocess_manager.list()

wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, batch_size=BATCH_SIZE, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)

await run_processor(wikidata, bulk_ids, sqlitDBlock)

while len(bulk_ids) > 0:
    worked = WikidataID.add_bulk_ids(list(bulk_ids))
    if worked:
        bulk_ids[:] = []
    else:
        asyncio.sleep(1)

### Adding entities (label, description, claims, and aliases) of IDs found in WikidataID to WikidataEntity

In [None]:
# Configuration for the entity-import step. Each value is read from the
# environment variable of the same name; the literal is the fallback.
FILEPATH = os.environ.get("FILEPATH", '../data/Wikidata/latest-all.json.bz2')  # dump handed to WikidataDumpReader
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", "1000"))  # passed as batch_size= to WikidataDumpReader
PUSH_SIZE = int(os.environ.get("PUSH_SIZE", "2000"))  # buffered rows above which a bulk insert is attempted
QUEUE_SIZE = int(os.environ.get("QUEUE_SIZE", "1500"))  # passed as queue_size= to WikidataDumpReader
NUM_PROCESSES = int(os.environ.get("NUM_PROCESSES", "8"))  # passed as num_processes= to WikidataDumpReader
SKIPLINES = int(os.environ.get("SKIPLINES", "0"))  # passed as skiplines= to WikidataDumpReader
# NOTE(review): the first cell assigns os.environ["LANGUAGE"] = 'en', so the
# 'de' fallback below only applies if that assignment has not been executed.
LANGUAGE = os.environ.get("LANGUAGE", 'de')

In [None]:
def save_entities_to_sqlite(item, data_batch, sqlitDBlock):
    """Buffer the normalised form of ``item`` and push a full buffer slice to the database.

    Items that are ``None`` or whose ``item['id']`` yields a falsy result from
    ``WikidataID.get_id`` are ignored. For the rest, the result of
    ``WikidataEntity.normalise_item`` is appended to ``data_batch``. Then,
    under ``sqlitDBlock``, if the buffer holds more than ``PUSH_SIZE`` entries
    its first ``PUSH_SIZE`` entries are handed to
    ``WikidataEntity.add_bulk_entities`` and removed only when that call
    reports success.

    Args:
        item: Dump item, or ``None``.
        data_batch: Shared mutable list used as the pending-entity buffer.
        sqlitDBlock: Context-manager lock guarding the flush.
    """
    if item is None or not WikidataID.get_id(item['id']):
        return

    data_batch.append(WikidataEntity.normalise_item(item, language=LANGUAGE))

    with sqlitDBlock:
        if len(data_batch) <= PUSH_SIZE:
            return
        head = list(data_batch[:PUSH_SIZE])
        if WikidataEntity.add_bulk_entities(head):
            # Drop only what was written; a failed push stays buffered for a later attempt.
            del data_batch[:PUSH_SIZE]
            gc.collect()

async def run_processor(wikidata, bulk_ids, sqlitDBlock):
    """Run ``wikidata`` with ``save_entities_to_sqlite`` as the per-item handler.

    Args:
        wikidata: Object exposing an awaitable ``run(handler, max_iterations=, verbose=)``.
        bulk_ids: Shared buffer forwarded to ``save_entities_to_sqlite`` as its
            ``data_batch`` argument (despite the name, it holds entities here).
        sqlitDBlock: Lock forwarded to ``save_entities_to_sqlite``.
    """
    def handle(item):
        return save_entities_to_sqlite(item, bulk_ids, sqlitDBlock)

    await wikidata.run(handle, max_iterations=None, verbose=True)

multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
data_batch = multiprocess_manager.list()

wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, batch_size=BATCH_SIZE, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)

await run_processor(wikidata, data_batch, sqlitDBlock)

while len(data_batch) > 0:
    worked = WikidataEntity.add_bulk_entities(list(data_batch))
    if worked:
        data_batch[:] = []
    else:
        asyncio.sleep(1)

### Find IDs that are in WikidataID but not in WikidataEntity

In [None]:
# Anti-join: keep WikidataID rows flagged in_wikipedia that have no matching
# WikidataEntity row (the outer join leaves WikidataEntity.id NULL for those).
with Session() as session:
    result = session.execute(
        select(WikidataID.id)
        .outerjoin(WikidataEntity, WikidataID.id == WikidataEntity.id)
        .where(
            WikidataEntity.id.is_(None),
            WikidataID.in_wikipedia == True,
        )
    )
    missing_ids = set(result.scalars())

print(len(missing_ids))

### Find IDs that are not in WikidataEntity but are in the claims, qualifiers, and quantity units of entities connected to Wikipedia

In [None]:
def get_missing_entities(session, ids, chunk_size=900):
    """Return the subset of ``ids`` that has no row in ``WikidataEntity``.

    The lookup is issued in chunks so that a single ``IN (...)`` clause never
    carries more than ``chunk_size`` bound values; older SQLite builds cap the
    number of bound parameters per statement at 999, and the caller may pass
    well over 10000 IDs at once. An empty ``ids`` returns an empty set without
    querying.

    Args:
        session: SQLAlchemy session used for the lookups.
        ids: Collection of candidate ID strings.
        chunk_size: Maximum number of IDs per query; must be at least 1.

    Returns:
        set: IDs from ``ids`` for which no ``WikidataEntity`` row was found.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    wanted = set(ids)
    pending = list(wanted)
    existing_ids = set()
    for start in range(0, len(pending), chunk_size):
        chunk = pending[start:start + chunk_size]
        rows = session.query(WikidataEntity.id).filter(WikidataEntity.id.in_(chunk)).all()
        existing_ids.update(row.id for row in rows)
    return wanted - existing_ids

def _referenced_ids(snak):
    """Yield the IDs referenced by the value of one snak (main snak or qualifier).

    Snaks without a ``'datavalue'`` yield nothing. ``wikibase-item`` and
    ``wikibase-property`` values yield their ``'id'``; ``quantity`` values
    whose unit is not ``'1'`` yield the last ``/``-separated segment of the
    unit string.
    """
    if 'datavalue' not in snak:
        return
    datatype = snak['datatype']
    if datatype in ('wikibase-item', 'wikibase-property'):
        yield snak['datavalue']['value']['id']
    elif datatype == 'quantity':
        unit = snak['datavalue']['value']['unit']
        if unit != '1':
            yield unit.rsplit('/', 1)[1]


with Session() as session:
    entities = (
        session.query(WikidataEntity)
        .join(WikidataID, WikidataEntity.id == WikidataID.id)
        .filter(WikidataID.in_wikipedia == True)
        .yield_per(100000)
    )

    # NOTE(review): hard-coded total, presumably the entity count of an earlier
    # run; it only affects how the progress bar is rendered.
    progressbar = tqdm(total=9203531)
    found = False  # not read in this cell; kept in case other cells rely on it
    missing_ids = set()

    # Collected IDs are looked up once at least this many have accumulated.
    batch_size = 10000
    ids_to_check = set()

    try:
        for entity in entities:
            progressbar.update(1)
            for pid, statements in entity.claims.items():
                ids_to_check.add(pid)
                for statement in statements:
                    ids_to_check.update(_referenced_ids(statement['mainsnak']))

                    if 'qualifiers' in statement:
                        for qualifier_pid, qualifier_snaks in statement['qualifiers'].items():
                            ids_to_check.add(qualifier_pid)
                            for qualifier_snak in qualifier_snaks:
                                ids_to_check.update(_referenced_ids(qualifier_snak))

            if len(ids_to_check) >= batch_size:
                missing_ids.update(get_missing_entities(session, ids_to_check))
                ids_to_check.clear()

            if progressbar.n % 1000 == 0:
                progressbar.set_description(f"Missing IDs: {len(missing_ids)}")

        # Look up the IDs left over from the last, partially filled batch.
        if ids_to_check:
            missing_ids.update(get_missing_entities(session, ids_to_check))
    finally:
        # Close the bar even when the scan is interrupted or raises.
        progressbar.close()