In [2]:
import sys
sys.path.append('../src')

from wikidata_dumpreader import WikidataDumpReader
from wikidataDB import WikidataID, WikidataEntity, Session
from multiprocessing import Manager
from sqlalchemy import select
from tqdm import tqdm

# Path to the bz2-compressed Wikidata JSON dump handed to WikidataDumpReader.
FILEPATH = '../data/Wikidata/latest-all.json.bz2'
# Passed to the reader as batch_size and reused below as the flush threshold for the database writes.
BATCH_SIZE = 1000
# Passed to the reader as num_processes.
NUM_PROCESSES = 4
# Passed to the reader as skiplines; presumably the number of dump lines to skip before processing (TODO confirm).
SKIPLINES = 0
# Language code forwarded to the WikidataID / WikidataEntity helper methods.
LANGUAGE = 'en'

#### Reading the bz2-compressed Wikidata JSON dump and saving the IDs of entities and properties to the SQLite database (only the ones connected to the English Wikipedia)

In [3]:
# Manager-backed objects: a lock and a list proxy that save_ids_to_sqlite uses
# to buffer IDs and to serialise the database flush.
multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
bulk_ids = multiprocess_manager.list()

# Reader over the dump file; the callback defined below is handed to wikidata.run().
wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, batch_size=BATCH_SIZE, skiplines=SKIPLINES)

def save_ids_to_sqlite(item):
    """Buffer the IDs extracted from one dump item and flush the buffer in bulk.

    Items that are ``None`` or that ``WikidataID.is_in_wikipedia`` rejects for
    ``LANGUAGE`` are ignored. For the rest, the IDs returned by
    ``WikidataID.extract_entity_ids`` are appended to the shared ``bulk_ids``
    buffer; once the buffer holds more than ``BATCH_SIZE`` entries it is written
    with ``WikidataID.add_bulk_ids`` and cleared if that call reports success.

    Args:
        item: One parsed dump item, or ``None``.
    """
    if item is None or not WikidataID.is_in_wikipedia(item, language=LANGUAGE):
        return

    ids = WikidataID.extract_entity_ids(item, language=LANGUAGE)

    with sqlitDBlock:
        # The append must happen under the same lock as the flush: otherwise
        # IDs added by another worker between the snapshot (list(bulk_ids)) and
        # the reset (bulk_ids[:] = []) would be cleared without being written.
        bulk_ids.extend(ids)

        if len(bulk_ids) > BATCH_SIZE:
            worked = WikidataID.add_bulk_ids(list(bulk_ids))
            # Keep the buffer on failure so the next flush retries these IDs.
            if worked:
                bulk_ids[:] = []

async def run_processor():
    await wikidata.run(save_ids_to_sqlite, max_iterations=None, verbose=True)

await run_processor()

if len(bulk_ids) > 0:
    worked = WikidataID.add_bulk_ids(list(bulk_ids))

0it [00:00, ?it/s]0:00, ?it/s]
Line Process Avg: 406 items/sec 	 Memory Usage Avg: 2306.29 MB: : 15999it [00:39, 405.66it/s]

### Adding entities (label, description, claims, and aliases) of IDs found in WikidataID to WikidataEntity

In [3]:
# Manager-backed objects: a lock and a list proxy that save_entities_to_sqlite
# uses to buffer normalised entities and to serialise the database flush.
multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
data_batch = multiprocess_manager.list()

# Fresh reader over the same dump file; the callback defined below is handed to wikidata.run().
wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, batch_size=BATCH_SIZE, skiplines=SKIPLINES)

def save_entities_to_sqlite(item):
    """Buffer one normalised entity and flush the buffer in bulk.

    Items that are ``None`` or whose ``item['id']`` is not found by
    ``WikidataID.get_id`` are ignored. For the rest, the result of
    ``WikidataEntity.normalise_item`` is appended to the shared ``data_batch``
    buffer; once the buffer holds at least ``BATCH_SIZE`` entries it is written
    with ``WikidataEntity.add_bulk_entities`` and cleared if that call reports
    success.

    Args:
        item: One parsed dump item (a mapping with an ``'id'`` key), or ``None``.
    """
    if item is None or not WikidataID.get_id(item['id']):
        return

    norm_item = WikidataEntity.normalise_item(item, language=LANGUAGE)

    with sqlitDBlock:
        # The append must happen under the same lock as the flush: otherwise an
        # entity added by another worker between the snapshot (list(data_batch))
        # and the reset (data_batch[:] = []) would be cleared without being written.
        data_batch.append(norm_item)

        if len(data_batch) >= BATCH_SIZE:
            worked = WikidataEntity.add_bulk_entities(list(data_batch))
            # Keep the buffer on failure so the next flush retries these entities.
            if worked:
                data_batch[:] = []

async def run_processor():
    await wikidata.run(save_entities_to_sqlite, max_iterations=None, verbose=True)

await run_processor()

if len(data_batch) > 0:
    WikidataEntity.add_bulk_entities(list(data_batch))

0it [00:00, ?it/s]


Line Process Avg: 599 items/sec 	 Memory Usage Avg: 1468.22 MB: : 68999it [01:55, 599.02it/s]

### Find IDs that are in WikidataID but not in WikidataEntity

In [None]:
# IDs flagged as in_wikipedia that have no matching row in WikidataEntity.
# The LEFT OUTER JOIN leaves WikidataEntity.id NULL for IDs without an entity
# row, so filtering on IS NULL keeps exactly those.
stmt = (
    select(WikidataID.id)
    .outerjoin(WikidataEntity, WikidataID.id == WikidataEntity.id)
    .where(WikidataEntity.id.is_(None))
    .where(WikidataID.in_wikipedia == True)  # noqa: E712 - SQL expression, not a Python comparison
)

with Session() as session:
    result = session.execute(stmt)
    missing_ids = set(result.scalars())

print(len(missing_ids))

### Find IDs that are not in WikidataEntity but are in the claims, qualifiers, and quantity units of entities connected to Wikipedia

In [None]:
def get_missing_entities(session, ids, chunk_size=900):
    """Return the subset of ``ids`` that has no row in ``WikidataEntity``.

    The lookup is split into several ``IN (...)`` queries of at most
    ``chunk_size`` values each, so a large ID set cannot exceed the database's
    bound-parameter limit (older SQLite builds allow only 999 per statement).

    Args:
        session: Session used to query ``WikidataEntity.id``.
        ids: Iterable of hashable IDs to look up; duplicates are ignored.
        chunk_size: Maximum number of IDs per query. Must be at least 1.

    Returns:
        set: The IDs from ``ids`` that were not found in ``WikidataEntity.id``.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    wanted = set(ids)
    if not wanted:
        # Nothing to look up; avoid issuing a query with an empty IN list.
        return set()

    # in_() is given a list slice rather than the set itself.
    pending = list(wanted)
    existing_ids = set()
    for start in range(0, len(pending), chunk_size):
        chunk = pending[start:start + chunk_size]
        rows = session.query(WikidataEntity.id).filter(WikidataEntity.id.in_(chunk)).all()
        existing_ids.update(row.id for row in rows)

    return wanted - existing_ids

def _referenced_ids(snak):
    """Yield the ID a single snak refers to, if any.

    A snak without a ``'datavalue'`` yields nothing. ``wikibase-item`` and
    ``wikibase-property`` snaks yield the value's ``'id'``. ``quantity`` snaks
    whose unit is not ``'1'`` yield the last path segment of the unit string.
    """
    if 'datavalue' not in snak:
        return
    datatype = snak['datatype']
    if datatype in ('wikibase-item', 'wikibase-property'):
        yield snak['datavalue']['value']['id']
    elif datatype == 'quantity':
        unit = snak['datavalue']['value']['unit']
        if unit != '1':
            yield unit.rsplit('/', 1)[1]


with Session() as session:
    entities = (
        session.query(WikidataEntity)
        .join(WikidataID, WikidataEntity.id == WikidataID.id)
        .filter(WikidataID.in_wikipedia == True)  # noqa: E712 - SQL expression, not a Python comparison
        .yield_per(100000)
    )

    # Hard-coded total for the progress bar; presumably the entity count at the
    # time of writing (TODO confirm or derive from the database).
    progressbar = tqdm(total=9203531)
    found = False  # not used in this cell; left in place in case a later cell reads it
    missing_ids = set()

    batch_size = 10000
    ids_to_check = set()

    try:
        for entity in entities:
            progressbar.update(1)
            for pid, claims in entity.claims.items():
                ids_to_check.add(pid)
                for claim in claims:
                    ids_to_check.update(_referenced_ids(claim['mainsnak']))

                    if 'qualifiers' in claim:
                        # Distinct name so the claim's own property ID is not shadowed.
                        for qualifier_pid, qualifier_snaks in claim['qualifiers'].items():
                            ids_to_check.add(qualifier_pid)
                            for qualifier_snak in qualifier_snaks:
                                ids_to_check.update(_referenced_ids(qualifier_snak))

            # Look the collected IDs up in batches instead of once per entity.
            if len(ids_to_check) >= batch_size:
                missing_ids.update(get_missing_entities(session, ids_to_check))
                ids_to_check.clear()

            if progressbar.n % 1000 == 0:
                progressbar.set_description(f"Missing IDs: {len(missing_ids)}")

        # Check the remainder that never reached batch_size.
        if ids_to_check:
            missing_ids.update(get_missing_entities(session, ids_to_check))
    finally:
        # Close the bar even if the scan raises, so no stale bar is left behind.
        progressbar.close()