# Checking available documents in Vespa

Once documents have been indexed in Vespa, use this notebook to get high-level stats on the docs in the Vespa index.

In [1]:
import os
from pathlib import Path
import shutil

from vespa.application import Vespa
from vespa.io import VespaQueryResponse
from vespa.exceptions import VespaError
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

from dotenv import load_dotenv, find_dotenv

# Locate a .env file, then load it into the environment. override=True lets values
# from the file replace variables that are already set.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path, override=True)

True

In [2]:
# Connection settings come from the environment: the endpoint URL plus the
# certificate/key pair used to authenticate against it.
vespa_connection_settings = {
    "url": os.environ["VESPA_URL"],
    "cert": os.environ["VESPA_CERT_LOCATION"],
    "key": os.environ["VESPA_KEY_LOCATION"],
}
app = Vespa(**vespa_connection_settings)

# Last expression of the cell: displays the status response as a quick connectivity check.
app.get_application_status()

Using mTLS (key,cert) Authentication against endpoint https://a6a329f3.acc2de3b.z.vespa-app.cloud/ApplicationStatus


<Response [200]>

In [3]:
# Group all passages in the `document_passage` source by `document_import_id` and
# output the number of passages in each group.
# NOTE(review): `limit 0` is presumably there to suppress regular hits, yet `hits=10`
# is also passed below - confirm which of the two takes effect.
yql = (
    "select * from sources document_passage where true limit 0 "
    "| all(group(document_import_id) each(output(count())));"
)

with app.syncio() as session:
    response: VespaQueryResponse = session.query(yql=yql, hits=10)

# Raw JSON payload of the query response; the grouping results are read from it.
res = response.json


In [4]:
# Walk down the nested response to the list of groups (one group per document id).
grouping_root = res["root"]["children"][0]
group_list = grouping_root["children"][0]
document_groups = group_list["children"]

# (document id, number of passages) for every group returned by the query.
document_ids_and_passage_counts_in_vespa = [
    (group["value"], group["fields"]["count()"]) for group in document_groups
]

print("Smallest docs in the index. You might want to check these")
# Ascending by passage count; the first ten entries are the smallest documents.
sorted(document_ids_and_passage_counts_in_vespa, key=lambda id_and_count: id_and_count[1])[:10]

Smallest docs in the index. You might want to check these


[('CCLW.executive.9967.4478', 6),
 ('CCLW.executive.9724.4203', 7),
 ('CCLW.document.i00000160.n0000', 8),
 ('UNFCCC.party.1221.0', 8),
 ('CCLW.executive.10211.4813', 9),
 ('CCLW.executive.11049.6349', 10),
 ('UNFCCC.party.1572.0', 10),
 ('UNFCCC.party.1038.0', 13),
 ('CCLW.document.i00000437.n0000', 13),
 ('CCLW.executive.10524.5502', 13)]

In [5]:
# Ids of the documents that have at least one passage group in the Vespa response.
unique_doc_ids_in_vespa = {
    doc_id for doc_id, _passage_count in document_ids_and_passage_counts_in_vespa
}

# Ids of the documents available locally, taken from the JSON file names (without extension).
embeddings_dir = Path("../data/documents_unece/cpr_embeddings_output")
doc_ids_for_loading = {json_path.stem for json_path in embeddings_dir.glob("*.json")}

print(f"Num docs in vespa: {len(unique_doc_ids_in_vespa)}")
print(f"Num docs in local dir: {len(doc_ids_for_loading)}")

Num docs in vespa: 552
Num docs in local dir: 715


In [21]:
# Documents that exist in the local directory but have no passages in Vespa.
missing_docs = doc_ids_for_loading.difference(unique_doc_ids_in_vespa)

missing_count = len(missing_docs)
print(f"{missing_count} docs are missing")

163 docs are missing


## Checking docs missing in `document_passage` schema

All of the docs missing from the `document_passage` schema should be empty or have `html_data.has_valid_text == False`.

In [19]:
# Check that each missing doc's `document_id` matches its file name, and collect any missing docs that are not empty
import json 

def _is_empty(doc: dict) -> bool:
    if doc["pdf_data"] is None and doc["html_data"] is None:
        return True
    
    if doc["html_data"] is None and doc["pdf_data"]["text_blocks"] == []:
        return True
    
    if doc["pdf_data"] is None and doc["html_data"]["text_blocks"] == []:
        return True
    
    if doc["pdf_data"] is None and doc["html_data"]["has_valid_text"] == False:
        return True
    
    return False

# Missing docs that are NOT empty: they have content locally but no passages in
# Vespa, so they need a manual look. The expected length is 0.
docs_to_check = []

# Iterate in sorted order so `docs_to_check` has the same order on every run
# (iteration order of a set of strings is not stable between kernel sessions).
for doc in sorted(missing_docs):
    doc_path = Path(f"../data/documents_unece/cpr_embeddings_output/{doc}.json")
    # Decode explicitly as UTF-8 instead of relying on the platform default encoding.
    doc_json = json.loads(doc_path.read_text(encoding="utf-8"))
    # Sanity check: the id stored inside the file must match the file name.
    assert doc_json["document_id"] == doc, (
        f"document_id {doc_json['document_id']!r} does not match file name {doc_path.name!r}"
    )
    if not _is_empty(doc_json):
        docs_to_check.append(doc_json)

print(len(docs_to_check))

0
