In [None]:
import os
os.environ["LANGUAGE"] = 'ar'

import sys
sys.path.append('../src')

from wikidataDB import Session, WikidataID, WikidataEntity
from wikidataEmbed import WikidataTextifier
from JinaAI import JinaAIEmbedder
from wikidataRetriever import WikidataCirrusSeach, AstraDBConnect
from tqdm import tqdm
import json
import os
import torch
from langchain_core.documents import Document
from ragstack_langchain.graph_store import CassandraGraphStore
import cassio
from langchain_astradb import AstraDBVectorStore, AstraDBGraphVectorStore
from astrapy.info import CollectionVectorServiceOptions
from astrapy import DataAPIClient
from transformers import AutoTokenizer
import requests
import time
import pickle

In [None]:
# Load the evaluation sample; later cells read its 'QID' column.
# NOTE: pickle can execute arbitrary code while loading - only open trusted files.
with open("../data/Evaluation Data/Sample IDs (AR EN DE).pkl", "rb") as sample_file:
    sample_ids = pickle.load(sample_file)

In [None]:
# Read the Astra DB credentials; the context manager closes the file right away
# instead of leaving the handle to the garbage collector.
with open("../API_tokens/datastax_wikidata_nvidia.json") as token_file:
    datastax_token = json.load(token_file)

# Collection that documents are read from.
COLLECTION_NAME = 'wikidata_test_v1'
client = DataAPIClient(datastax_token['ASTRA_DB_APPLICATION_TOKEN'])
database0 = client.get_database(datastax_token['ASTRA_DB_API_ENDPOINT'])
wikiDataCollection = database0.get_collection(COLLECTION_NAME)

# Collection that documents are written to (COLLECTION_NAME is deliberately reassigned).
COLLECTION_NAME = 'wikidata_langtest'
graph_store = AstraDBConnect(datastax_token, COLLECTION_NAME, model='jina', batch_size=4, cache_embeddings=False)

# Build the QID lookup once. Testing membership against the raw values array
# scans the whole array for every document; a set answers in constant time.
sample_qids = set(sample_ids['QID'].tolist())

# Walk every document of the source collection and queue those whose QID is
# part of the sample for insertion into the destination store.
with tqdm(total=1347786) as progressbar:
    for item in wikiDataCollection.find():
        progressbar.update(1)
        if item['metadata']['QID'] in sample_qids:
            graph_store.add_document(id=item['_id'], text=item['content'], metadata=item['metadata'])

    # Called once after the scan - presumably flushes a partially filled batch.
    graph_store.push_batch()

In [None]:
import copy

BATCH_SIZE = 100
OFFSET = 410
LANGUAGE = 'de'
textifier = WikidataTextifier(language=LANGUAGE)


def _attach_text_value(snak):
    """Add a 'text_value' entry to a quantity/time snak that carries a datavalue.

    Snaks without a 'datavalue', or with any other datatype, are left untouched.
    The snak dict is modified in place.
    """
    if 'datavalue' in snak and snak['datatype'] in ('quantity', 'time'):
        snak['text_value'] = textifier.data_to_text(snak['datavalue'], snak['datatype'])


# Stream the entities flagged as present in Wikipedia and store a textual
# rendering next to every quantity/time value of their claims and qualifiers.
with tqdm(total=3226638) as progressbar:
    with Session() as session:
        entities = (
            session.query(WikidataEntity)
            .join(WikidataID, WikidataEntity.id == WikidataID.id)
            .filter(WikidataID.in_wikipedia == True)
            .offset(OFFSET)
            .yield_per(BATCH_SIZE)
        )
        # The first OFFSET rows are skipped by the query, so count them as done.
        progressbar.update(OFFSET)

        for entity in entities:
            progressbar.update(1)
            # Deep copy: the snaks below are edited in place. With a shallow
            # copy those edits would also land inside the value the ORM loaded,
            # so the reassignment could compare equal to the "old" value and the
            # UPDATE be skipped (assuming `claims` is a plain JSON column without
            # mutation tracking - TODO confirm against the model definition).
            claims = copy.deepcopy(entity.claims)
            for statements in claims.values():
                for statement in statements:
                    _attach_text_value(statement['mainsnak'])

                    if 'qualifiers' in statement:
                        for qualifier_snaks in statement['qualifiers'].values():
                            for qualifier in qualifier_snaks:
                                _attach_text_value(qualifier)

            entity.claims = claims
            session.add(entity)
            session.commit()

In [None]:
import os
import cassio
from cassio.config import check_resolve_keyspace, check_resolve_session

# Connect cassio using the credentials loaded in an earlier cell.
cassio.init(
    token=datastax_token["ASTRA_DB_APPLICATION_TOKEN"],
    database_id=datastax_token["ASTRA_DB_DATABASE_ID"],
    keyspace=datastax_token["ASTRA_DB_KEYSPACE"]
)

session = check_resolve_session()
keyspace = check_resolve_keyspace()
table = "default_keyspace.wikidata_test_v2"
page_size = 1000
# Must exist before the loop reads it, otherwise a fresh kernel raises NameError.
# None means "start with the first page"; set it to a saved token to resume.
last_token = None

# Rows that are never matched get an explicit False instead of NaN.
# An existing column (e.g. from a previous partial run) is kept as is.
if 'in_wikidata_test_v2' not in sample_ids.columns:
    sample_ids['in_wikidata_test_v2'] = False

# Page through the table in token order and flag every sampled QID found there.
with tqdm(total=530994) as progressbar:
    while True:
        # Construct query
        if last_token is None:
            query = f"SELECT key, token(key) FROM {table} LIMIT {page_size};"
        else:
            query = f"SELECT key, token(key) FROM {table} WHERE token(key) > {last_token} LIMIT {page_size};"

        # Execute query
        rows = session.execute(query)
        rows_fetched = 0

        # Process rows
        for row in rows:
            # The second component of the key looks like "<QID>_<suffix>"; keep the QID part.
            qid = row.key[1].split("_")[0]
            sample_ids.loc[sample_ids['QID'] == qid, 'in_wikidata_test_v2'] = True
            last_token = row.system_token_key  # Update the last token
            rows_fetched += 1
            progressbar.update(1)

        # A short page means the table is exhausted
        if rows_fetched < page_size:
            break

In [None]:
# Read the Astra DB credentials; the context manager closes the file right away.
with open("../API_tokens/datastax_wikidata_nvidia.json") as token_file:
    datastax_token = json.load(token_file)
COLLECTION_NAME = 'wikidata_test_v1'

# Resolve the collection handle used by the following cell.
client = DataAPIClient(datastax_token['ASTRA_DB_APPLICATION_TOKEN'])
database0 = client.get_database(datastax_token['ASTRA_DB_API_ENDPOINT'])
wikiDataCollection = database0.get_collection(COLLECTION_NAME)

In [None]:
# Selects the documents whose metadata.Language is not 'en'.
pending_filter = {'$not': {'metadata.Language': 'en'}}

done = False
while not done:
    try:
        for doc in tqdm(wikiDataCollection.find(pending_filter)):
            # Stamp language and dump date, then write the whole metadata back.
            tagged_metadata = doc['metadata']
            tagged_metadata.update({'Language': 'en', 'DumpDate': '09/18/2024'})
            wikiDataCollection.update_one(
                {'_id': doc['_id']},
                {'$set': {'metadata': tagged_metadata}},
            )
        done = True
    except Exception as error:
        # Any failure is printed and the whole scan restarts after two seconds.
        print(error)
        time.sleep(2)

In [None]:
# Read the Astra DB credentials; the context manager closes the file right away.
with open("../API_tokens/datastax_wikidata_nvidia.json") as token_file:
    datastax_token = json.load(token_file)
COLLECTION_NAME = 'wikidata_test_v1'

client = DataAPIClient(datastax_token['ASTRA_DB_APPLICATION_TOKEN'])
database0 = client.get_database(datastax_token['ASTRA_DB_API_ENDPOINT'])
wikiDataCollection = database0.get_collection(COLLECTION_NAME)

# NOTE: pickle can execute arbitrary code while loading - only open trusted files.
with open("../data/Evaluation Data/Sample IDs (EN).pkl", "rb") as sample_file:
    sample_ids = pickle.load(sample_file)

# A QID counts as present when its first chunk ("<QID>_1") exists in the collection.
# One lookup per row, assigned in a single step, avoids re-scanning the whole
# frame with a boolean mask for every QID.
sample_ids['in_wikidata_test_v1'] = [
    wikiDataCollection.find_one({"_id": f"{qid}_1"}) is not None
    for qid in tqdm(sample_ids['QID'].values)
]

In [None]:
from datetime import datetime
import hashlib  # Import the hashlib library
import json

# OMITTED

# Today's local date as a YYYY-MM-DD string; stored in each document's "Date" metadata.
current_date = datetime.now().strftime("%Y-%m-%d")

#OMITTED

def clean_claims(claims):
    """Return a new dict that maps each property ID to its cleaned statements.

    Every statement of every property is passed through ``clean_item``; the
    input mapping and its lists are not modified.
    """
    return {
        pid: [clean_item(statement) for statement in statements]
        for pid, statements in claims.items()
    }
def clean_item(item):
    """Reduce a statement to a compact dict derived from its main snak.

    * No 'datavalue' in the main snak -> ``{'type': <snaktype>}``.
    * Dict-valued datavalue -> the value's keys merged after a 'type' entry
      holding the datavalue type, with any 'entity-type' key removed.
    * Any other value -> ``{'type': <datavalue type>, 'value': <value>}``.

    The input statement is not modified.
    """
    mainsnak = item['mainsnak']
    if 'datavalue' not in mainsnak:
        return {'type': mainsnak['snaktype']}

    datavalue = mainsnak['datavalue']
    raw_value = datavalue['value']
    if type(raw_value) is not dict:
        return {'type': datavalue['type'], 'value': raw_value}

    # Start with the type, then overlay the value's own keys (same order as unpacking).
    flattened = {'type': datavalue['type']}
    flattened.update(raw_value)
    flattened.pop('entity-type', None)
    return flattened

# Build one Document per text chunk of every entity flagged as present in Wikipedia.
# NOTE(review): relies on `textifier`, `tokenizer`, `OFFSET` and `BATCH_SIZE`
# already existing in the kernel - confirm they are defined in the omitted part.
with tqdm(total=9203786) as progressbar:
    with Session() as session:
        entities = (
            session.query(WikidataEntity)
            .join(WikidataID, WikidataEntity.id == WikidataID.id)
            .filter(WikidataID.in_wikipedia == True)
            .offset(OFFSET)
            .yield_per(BATCH_SIZE)
        )
        # The first OFFSET rows are skipped by the query, so count them as done.
        progressbar.update(OFFSET)
        doc_batch = []
        ids_batch = []

        for entity in entities:
            progressbar.update(1)
            ##if SAMPLE and (entity.id in sample_ids['QID'].values):
            chunks = textifier.chunk_text(entity, tokenizer, max_length=512)
            if len(chunks) == 0:
                continue

            # The cleaned claims are identical for every chunk of an entity, so
            # compute them once; all chunk documents share the same dict object.
            entity_claims = clean_claims(entity.claims)
            for chunk_number, chunk in enumerate(chunks, start=1):
                # The hash of the chunk text is stored alongside the document.
                md5_hash = hashlib.md5(chunk.encode('utf-8')).hexdigest()
                doc = Document(page_content=chunk,
                               metadata={"MD5": md5_hash,
                                         "Claims": entity_claims,
                                         "Label": entity.label,
                                         "Description": entity.description,
                                         "Aliases": entity.aliases,
                                         "Date": current_date,
                                         "QID": entity.id,
                                         "ChunkID": chunk_number,
                                         "Language": "en",
                                         "Dump Date": "09/18/2024"})
                doc_batch.append(doc)
                ids_batch.append(f"{entity.id}_{chunk_number}")

In [None]:
import sys
sys.path.append('../src')

from wikidataDB import Session, WikidataID, WikidataEntity
from wikidataEmbed import WikidataTextifier
from wikidataRetriever import AstraDBConnect

import json
from tqdm import tqdm
import os
import pickle
from datetime import datetime
import hashlib

# Run configuration for the embedding cell below.
SAMPLE = True
BATCH_SIZE = 100
OFFSET = 0
# NOTE(review): LANGUAGE and DUMPDATE are defined but not used in this cell -
# confirm whether they should be passed to the textifier / document metadata.
LANGUAGE = 'en'
DUMPDATE = "09/18/2024"
# Read the Astra DB credentials; the context manager closes the file right away.
with open("../API_tokens/datastax_wikidata_nvidia.json") as token_file:
    datastax_token = json.load(token_file)
COLLECTION_NAME = 'wikidata_test_v2'

textifier = WikidataTextifier(with_claim_aliases=False, with_property_aliases=False)
graph_store = AstraDBConnect(datastax_token, COLLECTION_NAME, model='nvidia', batch_size=BATCH_SIZE, cache_embeddings=False)

# Load the Sample IDs and keep only rows flagged 'In Wikipedia' and 'Sample 2'.
# sample_ids stays None when SAMPLE is disabled.
sample_ids = None
if SAMPLE:
    # NOTE: pickle can execute arbitrary code while loading - only open trusted files.
    with open("../data/Evaluation Data/Sample IDs (EN).pkl", "rb") as sample_file:
        sample_ids = pickle.load(sample_file)
    sample_ids = sample_ids[sample_ids['In Wikipedia']]
    sample_ids = sample_ids[sample_ids['Sample 2']]

# This cell only knows how to process the sample; fail with a clear message
# instead of the opaque TypeError that len(None) would raise.
if sample_ids is None:
    raise ValueError("sample_ids is not loaded; this cell requires SAMPLE = True")

# Number of QIDs fetched from the database per query.
QID_QUERY_SIZE = 1000

# Fetch the sampled entities slice by slice, split each into text chunks and
# queue every chunk for insertion under the id "<QID>_<chunk number>".
with tqdm(total=len(sample_ids)) as progressbar:
    for start in range(0, len(sample_ids), QID_QUERY_SIZE):
        qid_list = sample_ids.iloc[start:start + QID_QUERY_SIZE]['QID'].tolist()
        with Session() as session:
            entities = session.query(WikidataEntity).filter(WikidataEntity.id.in_(qid_list)).all()

        for entity in entities:
            progressbar.update(1)
            chunks = textifier.chunk_text(entity, graph_store.tokenizer, max_length=graph_store.max_token_size)
            for chunk_number, chunk in enumerate(chunks, start=1):
                metadata = {
                    "QID": entity.id,
                    "ChunkID": chunk_number
                }
                graph_store.add_document(id=f"{entity.id}_{chunk_number}", text=chunk, metadata=metadata)
    # Called once after all slices - presumably flushes a partially filled batch.
    graph_store.push_batch()