#### Reading the Wikidata dump file and saving the IDs of entities and properties to a JSON file (only the ones connected to the English Wikipedia)

In [1]:
import sys
sys.path.append('../src')

from wikidata_dumpreader import WikidataDumpReader
import json

# Input dump, output file and the language code used for filtering.
FILEPATH = '../data/Wikidata/latest-all.json.bz2'
outputfile = '../data/Wikidata/En_Wikidata_EnWiki_Connections.json'
language = 'en'

BATCH_SIZE = 1000
NUM_PROCESSES = 4

# NOTE(review): `skiplines` presumably resumes a previous run by skipping
# that many dump lines -- confirm against WikidataDumpReader.
wikidata = WikidataDumpReader(
    FILEPATH,
    num_processes=NUM_PROCESSES,
    batch_size=BATCH_SIZE,
    skiplines=94244000,
)

# Module-level state declared `global` inside count_types.
total = 0
Wikitypes = {}
isEnglish = {}
def _linked_item_ids(statements):
    """Return the target entity IDs of the 'wikibase-item' statements.

    Statements without a main snak, without a data value, with another
    datatype, or whose value carries no 'id' are skipped.
    """
    ids = []
    for statement in statements:
        mainsnak = statement.get('mainsnak', {})
        if mainsnak.get('datatype', '') != 'wikibase-item':
            continue
        value = mainsnak.get('datavalue', {}).get('value')
        # Only dict values can carry an 'id'; on a string, `'id' in value`
        # would be a substring test and indexing it would raise.
        if isinstance(value, dict) and 'id' in value:
            ids.append(value['id'])
    return ids


def count_types(item):
    """Append one record for `item` to `outputfile` if it qualifies.

    An entity qualifies when it has a label and a description in
    `language` and a sitelink to '<language>wiki'. The record written is
    ``{"id": ..., "claims": {pid: [entity ids]}}`` followed by ",\\n", so the
    surrounding code has to open the JSON array and strip the last separator.

    Every non-None item increments the global `total`, whether or not it
    was written. `None` items are ignored.
    """
    global total

    if item is None:
        return

    # .get() so that entities lacking one of these sections are simply
    # treated as non-matching instead of raising KeyError.
    if (
        language in item.get('descriptions', {})
        and language in item.get('labels', {})
        and f'{language}wiki' in item.get('sitelinks', {})
    ):
        claims = {
            pid: _linked_item_ids(statements)
            for pid, statements in item.get('claims', {}).items()
        }
        record = {
            'id': item.get('id', ''),
            'claims': claims
        }

        # Re-opened in append mode for each record, as before.
        with open(outputfile, mode="a+") as file:
            file.write(json.dumps(record)+",\n")

    total += 1

async def run_processor() -> None:
    """Await ``wikidata.run`` with ``count_types`` as the callback.

    ``max_iterations=None`` and ``verbose=True`` are passed through as-is;
    their meaning is defined by ``WikidataDumpReader.run`` (not shown here) --
    presumably "no iteration cap" and "print progress statistics".
    """
    await wikidata.run(count_types, max_iterations=None, verbose=True)

with open(outputfile, 'w+') as file:
    file.write("[")

await run_processor()

with open(outputfile, 'a+') as file:
    # Remove the last comma and new line
    file.seek(0, 2)
    file_size = file.tell()
    file.seek(file_size - 2)
    file.truncate()

    file.write("]")

  0%|          | 0/94244000 [00:00<?, ?it/s]

  0%|          | 4966/94244000 [00:02<11:58:12, 2186.92it/s]0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 73.75 MB
  0%|          | 11306/94244000 [00:05<10:30:39, 2490.32it/s]0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 73.75 MB
  0%|          | 18951/94244000 [00:08<12:54:31, 2027.59it/s]0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 73.75 MB
  0%|          | 25377/94244000 [00:11<13:27:06, 1945.59it/s]0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 73.75 MB
  0%|          | 33614/94244000 [00:14<8:48:59, 2968.23it/s] 0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 73.75 MB
  0%|          | 42313/94244000 [00:17<7:50:59, 3333.49it/s] 0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 73.75 MB
  0%|          | 52214/94244000 [00:20<8:27:24, 3093.85it/s]0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 73.75 MB
  0%|          | 60952

#### Reading the JSON file of IDs and saving the IDs to an SQLite Database

In [7]:
import sys
sys.path.append('../src')

from wikidata_dumpreader import WikidataDumpReader
from wikidataDB import WikidataID
import json

# The JSON file of Wikipedia-connected entities written by the previous step.
FILEPATH = '../data/Wikidata/En_Wikidata_EnWiki_Connections.json'
BATCH_SIZE = 10000
NUM_PROCESSES = 1

wikidata = WikidataDumpReader(
    FILEPATH,
    num_processes=NUM_PROCESSES,
    batch_size=BATCH_SIZE,
)

# Rows waiting to be handed to WikidataID.add_bulk_ids.
bulk_ids = []
def save_main_ids_to_sqlite(item):
    """Queue the ID of `item` and flush the queue once it is large enough.

    Each queued row marks the ID as being in Wikipedia and not a property.
    When the global `bulk_ids` holds more than BATCH_SIZE rows it is passed
    to ``WikidataID.add_bulk_ids`` and replaced by a fresh list. `None`
    items are ignored.
    """
    global bulk_ids

    if item is None:
        return

    row = {'id': item['id'], 'in_wikipedia': True, 'is_property': False}
    bulk_ids.append(row)

    if len(bulk_ids) <= BATCH_SIZE:
        return

    WikidataID.add_bulk_ids(bulk_ids)
    bulk_ids = []

async def run_processor() -> None:
    """Await ``wikidata.run`` with ``save_main_ids_to_sqlite`` as the callback.

    ``max_iterations=None`` and ``verbose=True`` are passed through as-is;
    their meaning is defined by ``WikidataDumpReader.run`` (not shown here).
    """
    await wikidata.run(save_main_ids_to_sqlite, max_iterations=None, verbose=True)

await run_processor()

if len(bulk_ids) > 0:
    WikidataID.add_bulk_ids(bulk_ids)

0it [00:00, ?it/s]
202500 Lines Processed 	 Line Process Avg: 67158 items/sec 	 Memory Usage Avg: 5318.93 MB
462585 Lines Processed 	 Line Process Avg: 76414 items/sec 	 Memory Usage Avg: 6148.29 MB
890103 Lines Processed 	 Line Process Avg: 97176 items/sec 	 Memory Usage Avg: 6148.29 MB
1389459 Lines Processed 	 Line Process Avg: 113943 items/sec 	 Memory Usage Avg: 6151.59 MB
1908326 Lines Processed 	 Line Process Avg: 125110 items/sec 	 Memory Usage Avg: 6168.00 MB
2302483 Lines Processed 	 Line Process Avg: 124957 items/sec 	 Memory Usage Avg: 6158.59 MB
2605617 Lines Processed 	 Line Process Avg: 121242 items/sec 	 Memory Usage Avg: 6158.59 MB
3004248 Lines Processed 	 Line Process Avg: 122292 items/sec 	 Memory Usage Avg: 6158.59 MB
3481967 Lines Processed 	 Line Process Avg: 125888 items/sec 	 Memory Usage Avg: 6160.47 MB
3938108 Lines Processed 	 Line Process Avg: 127996 items/sec 	 Memory Usage Avg: 6159.49 MB
4423031 Lines Processed 	 Line Process Avg: 130984 items/sec 	 Memo

In [2]:
# Rows waiting to be handed to WikidataID.add_bulk_ids.
bulk_ids = []


def save_claim_ids_to_sqlite(item):
    """Queue every property ID and claim-target ID referenced by `item`.

    For each entry of ``item['claims']`` the key is queued as a property
    (``is_property`` True) followed by each listed entity (``is_property``
    False); all rows carry ``in_wikipedia`` False. After the whole item is
    queued, the global `bulk_ids` is passed to ``WikidataID.add_bulk_ids``
    and reset if it holds more than BATCH_SIZE rows. `None` items are ignored.
    """
    global bulk_ids

    if item is None:
        return

    for pid, targets in item['claims'].items():
        bulk_ids.append({'id': pid, 'in_wikipedia': False, 'is_property': True})
        bulk_ids.extend(
            {'id': target, 'in_wikipedia': False, 'is_property': False}
            for target in targets
        )

    if len(bulk_ids) > BATCH_SIZE:
        WikidataID.add_bulk_ids(bulk_ids)
        bulk_ids = []

async def run_processor() -> None:
    """Await ``wikidata.run`` with ``save_claim_ids_to_sqlite`` as the callback.

    ``max_iterations=None`` and ``verbose=True`` are passed through as-is;
    their meaning is defined by ``WikidataDumpReader.run`` (not shown here).
    """
    await wikidata.run(save_claim_ids_to_sqlite, max_iterations=None, verbose=True)

await run_processor()

if len(bulk_ids) > 0:
    WikidataID.add_bulk_ids(bulk_ids)

0it [00:00, ?it/s]
0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 4964.27 MB
0 Lines Processed 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 6054.49 MB
22741 Lines Processed 	 Line Process Avg: 2526 items/sec 	 Memory Usage Avg: 6129.38 MB
22741 Lines Processed 	 Line Process Avg: 1894 items/sec 	 Memory Usage Avg: 6129.38 MB
50206 Lines Processed 	 Line Process Avg: 3346 items/sec 	 Memory Usage Avg: 6149.91 MB
50206 Lines Processed 	 Line Process Avg: 2788 items/sec 	 Memory Usage Avg: 6149.91 MB
79930 Lines Processed 	 Line Process Avg: 3805 items/sec 	 Memory Usage Avg: 6156.09 MB
111690 Lines Processed 	 Line Process Avg: 4653 items/sec 	 Memory Usage Avg: 6157.03 MB
111690 Lines Processed 	 Line Process Avg: 4136 items/sec 	 Memory Usage Avg: 6157.03 MB
144565 Lines Processed 	 Line Process Avg: 4817 items/sec 	 Memory Usage Avg: 6176.53 MB
170861 Lines Processed 	 Line Process Avg: 5176 items/sec 	 Memory Usage Avg: 6176.53 MB
170861 Lines Processed 

In [1]:
import sys
sys.path.append('../src')

from wikidata_dumpreader import WikidataDumpReader
from wikidataDB import WikidataID, WikidataEntity
from multiprocessing import Lock
import json
from tqdm import tqdm 

FILEPATH = '../data/Wikidata/latest-all.json.bz2'
BATCH_SIZE = 1000
NUM_PROCESSES = 6

# Sum of three line counts (= 3446396) passed to the reader as `skiplines`.
# NOTE(review): presumably the lines handled by earlier partial runs -- confirm.
skiplines = 777000 + 1549988 + 1119408

wikidata = WikidataDumpReader(
    FILEPATH,
    num_processes=NUM_PROCESSES,
    batch_size=BATCH_SIZE,
    skiplines=skiplines,
)

def in_mul_and_not_en(item):
    """Tell whether `item` relies on 'mul' text instead of `language` text.

    True when the item has a '<language>wiki' sitelink and either its labels
    or its descriptions lack `language` while containing 'mul'.
    """
    if 'sitelinks' not in item:
        return False
    if f'{language}wiki' not in item['sitelinks']:
        return False

    labels = item['labels']
    if language not in labels and 'mul' in labels:
        return True

    descriptions = item['descriptions']
    return language not in descriptions and 'mul' in descriptions

def remove_keys(data, keys_to_remove=('hash', 'property', 'numeric-id', 'qualifiers-order')):
    """Return a copy of `data` with the given keys removed at every depth.

    Dicts and lists are rebuilt recursively; any other value is returned
    unchanged. The input is not modified.

    Args:
        data: Arbitrarily nested structure of dicts, lists and scalars.
        keys_to_remove: Container of dict keys to drop. The default is a
            tuple so that no mutable object is shared between calls.

    Returns:
        The filtered structure.
    """
    if isinstance(data, dict):
        return {
            key: remove_keys(value, keys_to_remove)
            for key, value in data.items()
            if key not in keys_to_remove
        }
    if isinstance(data, list):
        return [remove_keys(element, keys_to_remove) for element in data]
    return data
    
def get_claims(item):
    """Return the non-deprecated statements of `item`, grouped by property.

    Only entries whose 'type' is 'statement' and whose 'rank' is not
    'deprecated' are kept. Each kept entry is reduced to its 'mainsnak' and
    'qualifiers' (both passed through ``remove_keys``, or ``{}`` when absent)
    plus its 'rank'. Properties left without any entry are omitted.
    """
    kept = {}
    for pid, statements in item.get('claims', {}).items():
        usable = [
            {
                'mainsnak': remove_keys(entry['mainsnak']) if 'mainsnak' in entry else {},
                'qualifiers': remove_keys(entry['qualifiers']) if 'qualifiers' in entry else {},
                'rank': entry['rank'],
            }
            for entry in statements
            if entry['type'] == 'statement' and entry['rank'] != 'deprecated'
        ]
        if usable:
            kept[pid] = usable
    return kept

def get_aliases(item):
    """Return the distinct alias strings of `item` for `language` and 'mul'.

    Aliases in `language` come first, followed by 'mul' aliases not already
    seen. De-duplication keeps first-seen order, so the result (and the JSON
    later built from it) is the same on every run; going through a set made
    the order depend on string hashing. An item without an 'aliases' section
    yields an empty list.
    """
    aliases_by_language = item.get('aliases', {})
    values = []
    for code in (language, 'mul'):
        values.extend(alias['value'] for alias in aliases_by_language.get(code, []))
    # dict keys are unique and keep insertion order.
    return list(dict.fromkeys(values))

# Language code read by in_mul_and_not_en, get_aliases and save_entites_to_sqlite.
language = 'en'

# Rows waiting to be handed to WikidataEntity.add_bulk_entities.
data_batch = []
# Held by save_entites_to_sqlite while it checks and flushes data_batch.
sqlitDBlock = Lock()

# Progress bar, advanced up front by the number of skipped lines.
progressbar = tqdm(desc="Running...", total=12327824)
progressbar.update(skiplines)
def save_entites_to_sqlite(item):
    """Queue a trimmed copy of `item` and flush the queue in batches.

    The item is kept when ``WikidataID.get_id`` returns something truthy for
    its ID or when ``in_mul_and_not_en`` accepts it. Label and description
    are taken from `language`, falling back to 'mul', then to ''. Aliases
    and claims are stored as compact JSON strings.

    Once the global `data_batch` holds at least BATCH_SIZE rows it is passed
    to ``WikidataEntity.add_bulk_entities`` under `sqlitDBlock`; the queue is
    reset only when that call returns a truthy value. `None` items are ignored.
    """
    global data_batch

    if item is None:
        return
    if not (WikidataID.get_id(item['id']) or in_mul_and_not_en(item)):
        return

    labels = item['labels']
    if language in labels:
        label = labels[language]['value']
    elif 'mul' in labels:
        label = labels['mul']['value']
    else:
        label = ''

    descriptions = item['descriptions']
    if language in descriptions:
        description = descriptions[language]['value']
    elif 'mul' in descriptions:
        description = descriptions['mul']['value']
    else:
        description = ''

    aliases = get_aliases(item)
    claims = get_claims(item)
    row = {
        'id': item['id'],
        'label': label,
        'description': description,
        'aliases': json.dumps(aliases, separators=(',', ':')),
        'claims': json.dumps(claims, separators=(',', ':')),
    }
    data_batch.append(row)

    progressbar.update(1)
    progressbar.set_description(f"Batch Size: {len(data_batch)}")

    with sqlitDBlock:
        if len(data_batch) < BATCH_SIZE:
            return
        # Keep the rows for a later attempt if the bulk insert reports failure.
        if WikidataEntity.add_bulk_entities(data_batch):
            data_batch = []
            
async def run_processor() -> None:
    """Await ``wikidata.run`` with ``save_entites_to_sqlite`` as the callback.

    ``max_iterations=None`` and ``verbose=False`` are passed through as-is;
    their meaning is defined by ``WikidataDumpReader.run`` (not shown here).
    """
    await wikidata.run(save_entites_to_sqlite, max_iterations=None, verbose=False)

await run_processor()

if len(data_batch) > 0:
    WikidataEntity.add_bulk_entities(data_batch)

100%|██████████| 3446396/3446396 [07:41<00:00, 7474.81it/s]
Batch Size: 543:   2%|▏         | 144773/8881428 [44:44<73:41:27, 32.93it/s]  