In [1]:
import sys
sys.path.append('../src')

import os
os.environ["LANGUAGE"] = 'de'

from wikidata_dumpreader import WikidataDumpReader
from wikidataDB import WikidataID, WikidataEntity, Session
from multiprocessing import Manager
import asyncio
import gc
import os

# Settings for the ID-extraction pass. Each value is read from the environment
# variable of the same name; the literal is only the fallback.

# Path of the Wikidata dump handed to WikidataDumpReader.
FILEPATH = os.environ.get("FILEPATH", '../data/Wikidata/latest-all.json.bz2')

# Reader tuning knobs, forwarded to WikidataDumpReader as batch_size,
# queue_size, num_processes and skiplines.
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 5000))
QUEUE_SIZE = int(os.environ.get("QUEUE_SIZE", 7000))
NUM_PROCESSES = int(os.environ.get("NUM_PROCESSES", 8))
SKIPLINES = int(os.environ.get("SKIPLINES", 0))

# Buffer length that must be exceeded before buffered rows are written out.
PUSH_SIZE = int(os.environ.get("PUSH_SIZE", 10000))

# Language code passed to the WikidataID helpers.
LANGUAGE = os.environ.get("LANGUAGE", 'de')

In [2]:
from wikidataEmbed import WikidataTextifier

# Sanity check: render one stored entity (Q2) as text.
# Use the notebook-wide LANGUAGE setting instead of a second hard-coded 'de',
# so the demo follows the language the rest of the notebook is configured for.
textifier = WikidataTextifier(language=LANGUAGE)
entity = WikidataEntity.get_entity('Q2')
# print() is intentional: the text contains newlines that a bare repr would escape.
print(textifier.entity_to_text(entity))

Erde, dritter Planet von der Sonne aus im Sonnensystem, auch bekannt als 🜨, Terra, 🌎, Blauer Planet, Sol 3, 🌏, Erdkörper, 🗺, ♁, 🌍, Sol III, Welt. Attribute umfassen: 
- tiefster Punkt: „Challengertief (Höhe über dem Meeresspiegel: -10994 Meter)“
- Form: „Geoid (beschreibendes Datenobjekt: Erdfigur)“
- besteht aus: „Erdkruste“, 
 „Erdkern“, 
 „Erdmantel“, 
 „Erdatmosphäre“, 
 „Erdoberfläche“, 
 „Biosphäre“, 
 „Hydrosphäre“, 
 „Lithosphäre“, 
 „Erdmagnetfeld“, 
 „Kontinent“, 
 „westliche und östliche Hemisphäre“, 
 „Europa“, 
 „Asien“, 
 „Afrika“, 
 „Eurasien“, 
 „Amerika“, 
 „Antarktika“, 
 „Australien“
- Umlaufbahn: „heliozentrischer Orbit (beschreibendes Datenobjekt: Erdbahn)“
- Abplattung: „+0.0033528“
- Exzentrizität: „+0.016710219“
- untergeordnete astronomischer Körper: „Mond (Startzeitpunkt: 4527 Millionen Jahre v. Chr. ; Nachweisumstände: circa)“, 
 „Transiting Exoplanet Survey Satellite“, 
 „Internationale Raumstation (Startzeitpunkt: 20 Nov 1998)“, 
 „Mir (Startzeitpunkt: 20 F

#### Reading the Wikidata dump (bz2-compressed JSON) and saving the IDs of entities and properties to the SQLite database (only the ones connected to the Wikipedia of the configured `LANGUAGE`)

In [3]:
def save_ids_to_sqlite(item, bulk_ids, sqlitDBlock):
    """Buffer the IDs extracted from one dump item and flush them in bulk.

    An item is processed when its ID is listed in the optional notebook-level
    ``sample_ids`` collection, or when ``WikidataID.is_in_wikipedia`` accepts
    it for ``LANGUAGE``. Its extracted IDs are appended to ``bulk_ids``; once
    the buffer holds more than ``PUSH_SIZE`` entries, the first ``PUSH_SIZE``
    are written with ``WikidataID.add_bulk_ids`` and, if that call reports
    success, removed from the buffer.

    Args:
        item: One parsed dump entry (mapping with an ``'id'`` key), or None.
        bulk_ids: Shared list used as the write buffer.
        sqlitDBlock: Lock (context manager) held around the bulk write.

    Returns:
        None.
    """
    if item is None:
        return

    # `sample_ids` is not defined in any visible cell of this notebook, so on
    # a fresh kernel the bare name raised NameError for the first real item.
    # Treat it as optional: when it is absent no ID is force-included.
    forced_ids = globals().get('sample_ids', ())
    wanted = (item['id'] in forced_ids) or WikidataID.is_in_wikipedia(item, language=LANGUAGE)
    if not wanted:
        return

    bulk_ids.extend(WikidataID.extract_entity_ids(item, language=LANGUAGE))

    with sqlitDBlock:
        if len(bulk_ids) <= PUSH_SIZE:
            return

        chunk = list(bulk_ids[:PUSH_SIZE])
        # Only drop the chunk once the write is confirmed; otherwise it stays
        # buffered and is retried on a later call.
        if WikidataID.add_bulk_ids(chunk):
            del bulk_ids[:PUSH_SIZE]
            gc.collect()

async def run_processor(wikidata, bulk_ids, sqlitDBlock):
    """Drive the dump reader, routing every item to ``save_ids_to_sqlite``.

    Args:
        wikidata: Reader object exposing an awaitable ``run(handler, ...)``.
        bulk_ids: Shared ID buffer forwarded to the handler.
        sqlitDBlock: Lock forwarded to the handler.
    """
    def handle_item(item):
        # Bind the shared buffer and lock so the reader only supplies the item.
        return save_ids_to_sqlite(item, bulk_ids, sqlitDBlock)

    await wikidata.run(handle_item, max_iterations=None, verbose=True)

multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
bulk_ids = multiprocess_manager.list()

wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, batch_size=BATCH_SIZE, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)

await run_processor(wikidata, bulk_ids, sqlitDBlock)

while len(bulk_ids) > 0:
    worked = WikidataID.add_bulk_ids(list(bulk_ids))
    if worked:
        bulk_ids[:] = []
    else:
        asyncio.sleep(1)

Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 688.04 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 2114.58 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 2128.03 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 2135.30 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 2142.05 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 2150.12 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 2156.68 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 2164.55 MB
Items Processes: 4999 	 Line Process Avg: 172 items/sec 	 Memory Usage Avg: 1810.75 MB
Items Processes: 4999 	 Line Process Avg: 151 items/sec 	 Memory Usage Avg: 2980.86 MB
Items Processes: 4999 	 Line Process Avg: 137 items/sec 	 Memory Usage Avg: 2995.87 MB
Items Processes: 4999 	 Line Process Avg: 125 items/sec 	 Memory Usage Avg: 3004.74 

### Adding entities (label, description, claims, and aliases) of IDs found in WikidataID to WikidataEntity

In [3]:
# Settings for the entity-import pass. Each value is read from the environment
# variable of the same name; the literal is only the fallback.

# Path of the Wikidata dump handed to WikidataDumpReader.
FILEPATH = os.environ.get("FILEPATH", '../data/Wikidata/latest-all.json.bz2')

# Reader tuning knobs, forwarded to WikidataDumpReader as batch_size,
# queue_size, num_processes and skiplines.
BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 1000))
QUEUE_SIZE = int(os.environ.get("QUEUE_SIZE", 1500))
NUM_PROCESSES = int(os.environ.get("NUM_PROCESSES", 8))
SKIPLINES = int(os.environ.get("SKIPLINES", 0))

# Buffer length that must be exceeded before buffered rows are written out.
PUSH_SIZE = int(os.environ.get("PUSH_SIZE", 2000))

# Language code passed to the WikidataEntity normaliser.
LANGUAGE = os.environ.get("LANGUAGE", 'de')

In [None]:
# Hard-coded value that replaces whatever SKIPLINES was read from the
# environment; it is passed to WikidataDumpReader as `skiplines`.
# NOTE(review): presumably a resume offset from an interrupted run - confirm
# before a full re-run, since it silently overrides the configured value.
SKIPLINES = 7_050_000

def save_entities_to_sqlite(item, data_batch, sqlitDBlock):
    """Normalise one dump item and buffer it for a bulk entity write.

    Items that are None, or whose ID ``WikidataID.get_id`` does not find, are
    ignored. Accepted items are normalised for ``LANGUAGE`` and appended to
    ``data_batch``. When the buffer holds more than ``PUSH_SIZE`` rows, the
    first ``PUSH_SIZE`` are written with ``WikidataEntity.add_bulk_entities``
    and removed from the buffer if that call reports success.

    Args:
        item: One parsed dump entry (mapping with an ``'id'`` key), or None.
        data_batch: Shared list used as the write buffer.
        sqlitDBlock: Lock (context manager) held around the bulk write.
    """
    if item is None or not WikidataID.get_id(item['id']):
        return

    normalised = WikidataEntity.normalise_item(item, language=LANGUAGE)
    data_batch.append(normalised)

    with sqlitDBlock:
        if len(data_batch) <= PUSH_SIZE:
            return

        head = list(data_batch[:PUSH_SIZE])
        stored = WikidataEntity.add_bulk_entities(head)
        if stored:
            # Written rows leave the buffer; a failed write keeps them for a
            # later attempt.
            del data_batch[:PUSH_SIZE]
            gc.collect()

async def run_processor(wikidata, bulk_ids, sqlitDBlock):
    """Drive the dump reader, routing every item to ``save_entities_to_sqlite``.

    Args:
        wikidata: Reader object exposing an awaitable ``run(handler, ...)``.
        bulk_ids: Shared buffer forwarded to the handler (here it receives
            normalised entities, despite the parameter name).
        sqlitDBlock: Lock forwarded to the handler.
    """
    def handle_item(item):
        # Bind the shared buffer and lock so the reader only supplies the item.
        return save_entities_to_sqlite(item, bulk_ids, sqlitDBlock)

    await wikidata.run(handle_item, max_iterations=None, verbose=True)

multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
data_batch = multiprocess_manager.list()

wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, batch_size=BATCH_SIZE, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)

await run_processor(wikidata, data_batch, sqlitDBlock)

while len(data_batch) > 0:
    worked = WikidataEntity.add_bulk_entities(list(data_batch))
    if worked:
        data_batch[:] = []
    else:
        asyncio.sleep(1)

Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Process Avg: 0 items/sec 	 Memory Usage Avg: 238.11 MB
Items Processes: 0 	 Line Pr

### Find IDs that are in WikidataID but not in WikidataEntity

In [None]:
# Anti-join: IDs flagged as in_wikipedia that have no matching WikidataEntity row.
# Written with session.query(), the API the other cells of this notebook
# already use, because `select` is not imported in any visible cell and the
# original call raised NameError on a fresh kernel.
with Session() as session:
    result = (
        session.query(WikidataID.id)
        .outerjoin(WikidataEntity, WikidataID.id == WikidataEntity.id)
        # After the outer join, a NULL entity id means "no entity row exists".
        .filter(WikidataEntity.id.is_(None))
        .filter(WikidataID.in_wikipedia == True)  # noqa: E712 - SQL expression, not a Python truth test
        .all()
    )
    missing_ids = {row.id for row in result}

print(len(missing_ids))

In [19]:
import requests
import json

# Ask the Wikidata API (action=wbformatvalue) to render a data value in a
# given interface language.
# NOTE: despite its name, `time_data` holds a *quantity* value (see "type").
time_data = {
  "value": {
    "amount": "-10994",
    "unit": "http://www.wikidata.org/entity/Q11573",
    "upperBound": "-10954",
    "lowerBound": "-11034"
  },
  "type": "quantity"
}

data = {
    'action': 'wbformatvalue',
    'format': 'json',
    'datavalue': json.dumps(time_data),
    'datatype': 'quantity',
    'uselang': 'ar',
    'formatversion': 2
}
# requests has no default timeout, so without one a stalled connection would
# block the cell indefinitely.
r = requests.get('https://www.wikidata.org/w/api.php', params=data, timeout=30)
# Surface HTTP-level failures directly instead of as a confusing JSON/KeyError.
r.raise_for_status()
r.json()['result']

'−١٠٬٩٩٤±٤٠ متر'

### Find IDs that are not in WikidataEntity but are in the claims, qualifiers, and quantity units of entities connected to Wikipedia

In [None]:
def get_missing_entities(session, ids):
    """Return the subset of ``ids`` that has no row in WikidataEntity.

    Args:
        session: Open database session used for the lookup.
        ids: Collection of entity/property IDs to check.

    Returns:
        set: Every ID from ``ids`` not returned by the lookup.
    """
    rows = (
        session.query(WikidataEntity.id)
        .filter(WikidataEntity.id.in_(ids))
        .all()
    )

    present = set()
    for row in rows:
        present.add(row.id)

    return set(ids).difference(present)

def _referenced_entity_id(snak):
    """Return the entity/property ID a snak refers to, or None.

    Item- and property-valued snaks yield the ID of their value. Quantity
    snaks yield the last path segment of their unit URI unless the unit is
    the literal '1'. Snaks without a 'datavalue', or of any other datatype,
    yield None.
    """
    if 'datavalue' not in snak:
        return None

    datatype = snak['datatype']
    if datatype in ('wikibase-item', 'wikibase-property'):
        return snak['datavalue']['value']['id']
    if datatype == 'quantity':
        unit = snak['datavalue']['value']['unit']
        if unit != '1':
            return unit.rsplit('/', 1)[1]
    return None


# Walk every entity flagged as in_wikipedia, collect all IDs mentioned in its
# claims (property IDs, item/property values, quantity units - for main snaks
# and qualifiers alike) and record those that have no WikidataEntity row.
with Session() as session:
    entities = (
        session.query(WikidataEntity)
        .join(WikidataID, WikidataEntity.id == WikidataID.id)
        .filter(WikidataID.in_wikipedia == True)  # noqa: E712 - SQL expression, not a Python truth test
        .yield_per(100000)
    )

    # NOTE(review): `tqdm` is not imported in any visible cell of this
    # notebook; it has to be imported before this cell runs on a fresh kernel.
    # NOTE(review): the total is hard-coded - presumably the row count at the
    # time of writing; it only affects the progress display.
    progressbar = tqdm(total=9203531)
    missing_ids = set()

    # Look IDs up in chunks: once at least this many are pending, query and reset.
    batch_size = 10000
    ids_to_check = set()

    for entity in entities:
        progressbar.update(1)
        for pid, claims in entity.claims.items():
            ids_to_check.add(pid)
            for claim in claims:
                linked_id = _referenced_entity_id(claim['mainsnak'])
                if linked_id is not None:
                    ids_to_check.add(linked_id)

                if 'qualifiers' in claim:
                    for qualifier_pid, qualifier_snaks in claim['qualifiers'].items():
                        ids_to_check.add(qualifier_pid)
                        for qualifier_snak in qualifier_snaks:
                            linked_id = _referenced_entity_id(qualifier_snak)
                            if linked_id is not None:
                                ids_to_check.add(linked_id)

        if len(ids_to_check) >= batch_size:
            missing_ids.update(get_missing_entities(session, ids_to_check))
            ids_to_check.clear()

        if progressbar.n % 1000 == 0:
            progressbar.set_description(f"Missing IDs: {len(missing_ids)}")

    # Check the final, partially filled chunk.
    if ids_to_check:
        missing_ids.update(get_missing_entities(session, ids_to_check))

    progressbar.close()

In [None]:
import json
from langchain_astradb import AstraDBVectorStore
from astrapy.info import CollectionVectorServiceOptions

# Load the Astra DB credentials from a local JSON file. The context manager
# closes the file handle, which the bare `json.load(open(...))` left open.
with open("../API tokens/datastax_wikidata_nvidia.json") as token_file:
    datastax_token = json.load(token_file)

ASTRA_DB_DATABASE_ID = datastax_token['ASTRA_DB_DATABASE_ID']
ASTRA_DB_APPLICATION_TOKEN = datastax_token['ASTRA_DB_APPLICATION_TOKEN']
ASTRA_DB_API_ENDPOINT = datastax_token["ASTRA_DB_API_ENDPOINT"]
ASTRA_DB_KEYSPACE = datastax_token["ASTRA_DB_KEYSPACE"]

# Vector-service options handed to the store below.
collection_vector_service_options = CollectionVectorServiceOptions(
    provider="nvidia",
    model_name="NV-Embed-QA"
)

graph_store = AstraDBVectorStore(
    collection_name="wikidata",
    collection_vector_service_options=collection_vector_service_options,
    token=ASTRA_DB_APPLICATION_TOKEN,
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    namespace=ASTRA_DB_KEYSPACE,
)

# Smoke test: run one similarity search and print the QID stored in each
# hit's metadata.
results = graph_store.similarity_search("This is a question?", k=10)

for result in results:
    print(result.metadata['QID'])