In [1]:
import sys
sys.path.append('../src')

from wikidata_dumpreader import WikidataDumpReader
import json
import aiofiles

# Location of the compressed dump and the reader settings used by this cell.
FILEPATH = '../data/Wikidata/latest-all.json.bz2'
BATCH_SIZE = 10
NUM_PROCESSES = 4

# Reader instance shared by the handler/runner functions defined in this cell.
wikidata = WikidataDumpReader(
    FILEPATH,
    batch_size=BATCH_SIZE,
    num_processes=NUM_PROCESSES,
)

# Destination of the sample export written by `append_to_json`.
outputfile = '../data/Wikidata/test.json'


async def append_to_json(item):
    """Append ``item`` to ``outputfile`` as a single JSON-encoded line.

    Each line is terminated with a comma and a newline, so the file only
    becomes a valid JSON array once the surrounding code has written the
    opening "[" and replaced the final separator with "]".
    """
    async with aiofiles.open(outputfile, mode="a+") as sink:
        encoded = json.dumps(item)
        await sink.write(encoded + ",\n")

async def run_processor(max_iterations=3, verbose=True):
    """Run ``append_to_json`` over the dump via ``wikidata.run``.

    Args:
        max_iterations: Forwarded unchanged to ``wikidata.run``. Defaults to
            3, the value this cell previously hard-coded.
            NOTE(review): presumably caps the number of processed batches
            (the recorded run printed three "Processed batch" lines) --
            confirm against WikidataDumpReader.
        verbose: Forwarded unchanged to ``wikidata.run``.
    """
    await wikidata.run(append_to_json, max_iterations=max_iterations, verbose=verbose)

with open(outputfile, 'w+') as file:
    file.write("[")

await run_processor()

with open(outputfile, 'a+') as file:
    # Remove the last comma and new line
    file.seek(0, 2)
    file_size = file.tell()
    file.seek(file_size - 2)
    file.truncate()

    file.write("]")

Processed batch at 150 items/sec (avg 150) 	 Memory Usage: 119.72 MB
Processed batch at 1128 items/sec (avg 639) 	 Memory Usage: 116.70 MB
Processed batch at 474 items/sec (avg 584) 	 Memory Usage: 120.26 MB


In [32]:
import sys
sys.path.append('../src')

from wikidata_dumpreader import WikidataDumpReader
import json
import aiofiles

# Location of the compressed dump and the reader settings used for the full export.
FILEPATH = '../data/Wikidata/latest-all.json.bz2'
BATCH_SIZE = 10000
NUM_PROCESSES = 8

# Reader instance shared by the handler/runner functions defined in this cell.
wikidata = WikidataDumpReader(
    FILEPATH,
    batch_size=BATCH_SIZE,
    num_processes=NUM_PROCESSES,
)

# Module-level state. `Wikitypes` and `isEnglish` are not used by the code in
# this cell; they are kept because other cells may still reference them.
Wikitypes = {}
isEnglish = {}
total = 0  # number of non-None entities seen by `count_types`
outputfile = '../data/Wikidata/En_Wikidata_EnWiki_Connections.json'
language = 'en'


def _linked_item_ids(statements):
    """Return the target ids of the item-valued statements in ``statements``.

    A statement contributes only when its mainsnak has datatype
    'wikibase-item' and carries a datavalue whose value is a mapping with an
    'id' key. Statements without a mainsnak or datavalue are skipped.
    """
    ids = []
    for statement in statements:
        mainsnak = statement.get('mainsnak', {})
        # Check the datatype first so `'id' in value` is never evaluated as a
        # substring test against a plain string value.
        if mainsnak.get('datatype', '') != 'wikibase-item':
            continue
        value = mainsnak.get('datavalue', {}).get('value')
        if isinstance(value, dict) and 'id' in value:
            ids.append(value['id'])
    return ids


async def count_types(item):
    """Export the item-to-item links of one dump entity, then count it.

    Despite its name, this handler does not count types. An entity is exported
    when it has a label and a description in ``language`` and a sitelink to
    ``f'{language}wiki'``. For such an entity one line
    ``{"id": ..., "claims": {pid: [target ids]}}`` followed by ",\\n" is
    appended to ``outputfile``. Every property of the entity is kept as a key,
    with an empty list when none of its statements point at another item.

    ``total`` is incremented for every entity that is not ``None``, whether or
    not it was exported.

    Args:
        item: Entity dict from the dump, or ``None`` (ignored). Missing
            'labels', 'descriptions', 'sitelinks' or 'claims' keys are treated
            as empty rather than raising ``KeyError``.
    """
    global total

    if item is None:
        return

    wiki_key = f'{language}wiki'
    is_wanted = (
        language in item.get('descriptions', {})
        and language in item.get('labels', {})
        and wiki_key in item.get('sitelinks', {})
    )

    if is_wanted:
        record = {
            'id': item.get('id', ''),
            'claims': {
                pid: _linked_item_ids(statements)
                for pid, statements in item.get('claims', {}).items()
            },
        }

        # NOTE(review): the file is reopened for every exported entity; a
        # single long-lived handle would be cheaper if throughput matters.
        async with aiofiles.open(outputfile, mode="a+") as file:
            await file.write(json.dumps(record) + ",\n")

    total += 1

async def run_processor(max_iterations=None, verbose=True):
    """Run ``count_types`` over the dump via ``wikidata.run``.

    Args:
        max_iterations: Forwarded unchanged to ``wikidata.run``. Defaults to
            ``None``, the value this cell previously hard-coded.
            NOTE(review): presumably ``None`` means "no limit" and an integer
            caps the run for a quick trial -- confirm against
            WikidataDumpReader.
        verbose: Forwarded unchanged to ``wikidata.run``.
    """
    await wikidata.run(count_types, max_iterations=max_iterations, verbose=verbose)

with open(outputfile, 'w+') as file:
    file.write("[")

await run_processor()

with open(outputfile, 'a+') as file:
    # Remove the last comma and new line
    file.seek(0, 2)
    file_size = file.tell()
    file.seek(file_size - 2)
    file.truncate()

    file.write("]")

Processed batch at 1089 items/sec (avg 1089) 	 Memory Usage: 12050.89 MB
Processed batch at 1200 items/sec (avg 1144) 	 Memory Usage: 11823.94 MB
Processed batch at 1555 items/sec (avg 1281) 	 Memory Usage: 11368.87 MB
Processed batch at 4951 items/sec (avg 2199) 	 Memory Usage: 10569.07 MB
Processed batch at 5262 items/sec (avg 2811) 	 Memory Usage: 9330.11 MB


In [2]:
import json

# Load the exported array. A context manager closes the handle (the original
# `json.load(open(...))` leaked it), and read-only mode is enough for loading.
# NOTE(review): `outputfile` is whatever the most recently executed export
# cell assigned, so the content of `data` depends on execution order.
with open(outputfile, 'r') as file:
    data = json.load(file)

# Only the last expression of a cell is displayed. The first two lookups
# produce no output, but they raise KeyError if the entry has no 'en'
# label/description.
data[1]['labels']['en']
data[1]['descriptions']['en']
data[1]['aliases']['en']

[{'language': 'en', 'value': 'joy'}, {'language': 'en', 'value': 'happy'}]

In [27]:
# Print the target id of the first property whose first statement links to
# another item, then stop.
for pid, claim in data[1]['claims'].items():
    mainsnak = claim[0]['mainsnak']
    if mainsnak.get('datatype') != 'wikibase-item':
        continue
    # Snaks of type 'somevalue'/'novalue' carry no 'datavalue'; move on to the
    # next property instead of failing with KeyError.
    datavalue = mainsnak.get('datavalue')
    if datavalue is None:
        continue
    print(datavalue['value']['id'])
    break

Q331769


In [16]:
# Display the full 'claims' mapping of the second entry of `data`.
data[1]['claims']

{'P1245': [{'mainsnak': {'snaktype': 'value',
    'property': 'P1245',
    'datavalue': {'value': '885155', 'type': 'string'},
    'datatype': 'external-id'},
   'type': 'statement',
   'id': 'Q8$475e48c8-41fa-5739-e46e-27ff20507977',
   'rank': 'normal'}],
 'P373': [{'mainsnak': {'snaktype': 'value',
    'property': 'P373',
    'datavalue': {'value': 'Happiness', 'type': 'string'},
    'datatype': 'string'},
   'type': 'statement',
   'id': 'q8$0FC7FB9A-B5CA-4762-98AE-1B0BDC1EEF39',
   'rank': 'normal'}],
 'P31': [{'mainsnak': {'snaktype': 'value',
    'property': 'P31',
    'datavalue': {'value': {'entity-type': 'item',
      'numeric-id': 331769,
      'id': 'Q331769'},
     'type': 'wikibase-entityid'},
    'datatype': 'wikibase-item'},
   'type': 'statement',
   'id': 'q8$E2EFA381-BA5D-4F52-AF74-660B9A044C1E',
   'rank': 'normal'},
  {'mainsnak': {'snaktype': 'value',
    'property': 'P31',
    'datavalue': {'value': {'entity-type': 'item',
      'numeric-id': 60539479,
      'id'

In [18]:
# Display the 'enwiki' sitelink of the second entry of `data`.
data[1]['sitelinks']['enwiki']

{'site': 'enwiki', 'title': 'Happiness', 'badges': []}

In [12]:
# Display the full 'claims' mapping of the second entry of `data`.
# NOTE(review): same expression as an earlier cell; one of the two can be deleted.
data[1]['claims']

{'P1245': [{'mainsnak': {'snaktype': 'value',
    'property': 'P1245',
    'datavalue': {'value': '885155', 'type': 'string'},
    'datatype': 'external-id'},
   'type': 'statement',
   'id': 'Q8$475e48c8-41fa-5739-e46e-27ff20507977',
   'rank': 'normal'}],
 'P373': [{'mainsnak': {'snaktype': 'value',
    'property': 'P373',
    'datavalue': {'value': 'Happiness', 'type': 'string'},
    'datatype': 'string'},
   'type': 'statement',
   'id': 'q8$0FC7FB9A-B5CA-4762-98AE-1B0BDC1EEF39',
   'rank': 'normal'}],
 'P31': [{'mainsnak': {'snaktype': 'value',
    'property': 'P31',
    'datavalue': {'value': {'entity-type': 'item',
      'numeric-id': 331769,
      'id': 'Q331769'},
     'type': 'wikibase-entityid'},
    'datatype': 'wikibase-item'},
   'type': 'statement',
   'id': 'q8$E2EFA381-BA5D-4F52-AF74-660B9A044C1E',
   'rank': 'normal'},
  {'mainsnak': {'snaktype': 'value',
    'property': 'P31',
    'datavalue': {'value': {'entity-type': 'item',
      'numeric-id': 60539479,
      'id'

In [None]:
# NOTE(review): unexplained scratch arithmetic (evaluates to 50,400,000); the
# meaning of the three factors is not stated anywhere in the notebook -- name
# them as constants or delete this cell.
2000*420*60

In [34]:
# Display the 'enwiki' sitelink of the second entry of `data`.
# NOTE(review): the recorded output of this cell (title 'Belgium') differs from
# the identical cell earlier in the notebook (title 'Happiness'), so `data` was
# rebound between the two runs; re-run from a fresh kernel to confirm.
data[1]['sitelinks']['enwiki']

{'site': 'enwiki', 'title': 'Belgium', 'badges': []}

In [18]:
# Display the first P361 statement of the second entry of `data`.
data[1]['claims']['P361'][0]

{'mainsnak': {'snaktype': 'value',
  'property': 'P361',
  'datavalue': {'value': {'entity-type': 'item',
    'numeric-id': 215669,
    'id': 'Q215669'},
   'type': 'wikibase-entityid'},
  'datatype': 'wikibase-item'},
 'type': 'statement',
 'id': 'Q31$2BBFC346-FD6A-4EB1-89CC-5ABB0D5987F8',
 'rank': 'normal'}

In [29]:
# Display the mainsnak of the first P571 statement of the second entry of `data`.
data[1]['claims']['P571'][0]['mainsnak']

{'snaktype': 'value',
 'property': 'P571',
 'datavalue': {'value': {'time': '+1830-10-04T00:00:00Z',
   'timezone': 0,
   'before': 0,
   'after': 0,
   'precision': 11,
   'calendarmodel': 'http://www.wikidata.org/entity/Q1985727'},
  'type': 'time'},
 'datatype': 'time'}