# Spot-checking single document search

A notebook for quickly spot-checking search over a single document, using dense retrieval only.

In [57]:
from pathlib import Path
import json
import sys

sys.path.append("..")

import numpy as np
import pandas as pd
from sentence_transformers.util import semantic_search

from src.utils import filter_on_block_type
from src.config import BLOCKS_TO_FILTER
from sentence_transformers import SentenceTransformer
from cpr_data_access.parser_models import ParserOutput

## Load embeddings and documents

In [47]:
# Directory that is read below for both the per-document JSON files and the
# per-model `.npy` embedding arrays.
# NOTE(review): absolute, machine-specific path — point this at your own copy of the data.
DOC_EMBEDDINGS_PATH = Path(
    "/Users/kalyan/documents/cpr/rag-labs/data/documents_unece/cpr_embeddings_output_windows/"
)

# A document ID is the stem of each `*.json` file found in that directory.
available_document_ids = [
    json_path.stem for json_path in DOC_EMBEDDINGS_PATH.glob("*.json")
]

n_available = len(available_document_ids)
print(f"{n_available} available document IDs with embeddings")

32 available document IDs with embeddings


In [48]:
# The single document to spot-check.
DOCUMENT_ID = "CCLW.executive.1714.2200"
print(DOCUMENT_ID)

# Fail early if there is no `<DOCUMENT_ID>.json` in the embeddings directory.
assert DOCUMENT_ID in available_document_ids

# List every embeddings array saved for this document; the file names carry a
# model-name suffix after the document ID.
embeddings_pattern = f"{DOCUMENT_ID}_*.npy"
embeddings_files = [
    npy_path for npy_path in DOC_EMBEDDINGS_PATH.glob(embeddings_pattern)
]
embeddings_files

CCLW.executive.1714.2200


[PosixPath('/Users/kalyan/documents/cpr/rag-labs/data/documents_unece/cpr_embeddings_output_windows/CCLW.executive.1714.2200__msmarco-distilbert-base-tas-b.npy'),
 PosixPath('/Users/kalyan/documents/cpr/rag-labs/data/documents_unece/cpr_embeddings_output_windows/CCLW.executive.1714.2200__baai-bge-small-en-v1-5.npy'),
 PosixPath('/Users/kalyan/documents/cpr/rag-labs/data/documents_unece/cpr_embeddings_output_windows/CCLW.executive.1714.2200__msmarco-distilbert-dot-v5.npy'),
 PosixPath('/Users/kalyan/documents/cpr/rag-labs/data/documents_unece/cpr_embeddings_output_windows/CCLW.executive.1714.2200__baai-bge-base-en-v1-5.npy')]

In [41]:
# Suffix used to pick this document's `<DOCUMENT_ID>__<MODEL_NAME>.npy` embeddings file.
MODEL_NAME = "baai-bge-base-en-v1-5"

# Encoder used for the queries.
# NOTE(review): nothing ties this identifier to MODEL_NAME above — the two must
# be kept in sync by hand so queries and stored embeddings come from the same model.
MODEL = SentenceTransformer(model_name_or_path="BAAI/bge-base-en-v1.5")

In [42]:
# --- Document text -----------------------------------------------------------
# Read the document JSON stored alongside the embeddings and validate it.
document_json_path = DOC_EMBEDDINGS_PATH / f"{DOCUMENT_ID}.json"
document_content = json.loads(document_json_path.read_text())

parsed_document = ParserOutput.model_validate(document_content)
# filter_on_block_type takes and returns a list; we pass a single document.
parser_output_filtered = filter_on_block_type([parsed_document], BLOCKS_TO_FILTER)[0]

# Use the HTML data when present, otherwise fall back to the PDF data.
document_data = parser_output_filtered.html_data or parser_output_filtered.pdf_data
if document_data:
    text_blocks = document_data.text_blocks
else:
    text_blocks = []
    print("No text blocks found")

assert len(text_blocks) > 0

# --- Embeddings --------------------------------------------------------------
embeddings_filename = DOC_EMBEDDINGS_PATH / f"{DOCUMENT_ID}__{MODEL_NAME}.npy"
# The first row of the saved array is dropped.
# NOTE(review): presumably row 0 is a document-level embedding that precedes
# the per-text-block rows — confirm against the code that writes these files.
embeddings = np.load(embeddings_filename)[1:]

# After dropping row 0 there must be exactly one embedding per text block.
assert len(text_blocks) == embeddings.shape[0]

In [55]:
# Print the text of every block, one block per line, for a quick visual check.
print("Text in document: ")
document_text = "\n".join(text_block.to_string() for text_block in text_blocks)
print(document_text)

Text in document: 
Government of The Republic of Trinidad & Tobago
culture
growth investment productivity competitiveness
ideas good jobs
Innovation for
Lasting Prosperity
creativity inclusion equity
talent people knowledge
MEDIUM-TERM POLICY FRAMEWORK 2011-2014
Ministry of Planning and the Economy October 2011
GOVERNMENT OF THE REPUBLIC OF TRINIDAD AND TOBAGO TOGETHER WE ASPIRE TOGETHER WE ACHIEVE
Innovation for Lasting Prosperity
MEDIUM-TERM POLICY FRAMEWORK 2011-2014
Ministry of Planning and the Economy October 2011
TABLE OF CONTENTS
PREFACE BY THE MINISTER OF PLANNING AND THE ECONOMY LIST OF ACRONYMS LIST OF FIGURES AND BOXES APPENDICES
NATIONAL VISION MISSION SEVEN INTERCONNECTED PILLARS FOR SUSTAINABLE DEVELOPMENT
OVERVIEW
1
PART I - THE POLICY CONTEXT
CHAPTER I
THE FRAMEWORK FOR ECONOMIC AND SOCIAL TRANSFORMATION
5
CHAPTER II
MAKING THE POLICY SHIFTS
23
PART II- STRATEGIC PRIORITIES FOR 2011-2014
CHAPTER III
CRIME AND LAW AND ORDER
29
CHAPTER IV
AGRICULTURE AND FOOD SECURITY
39


## Load generated queries

If you don't have this file, you can skip this section and instead set `queries` (a list of query strings) yourself before running the search section.

In [58]:
# Generated queries, read as line-delimited JSON.
# NOTE(review): absolute, machine-specific path — point this at your own copy of the data.
GENERATED_QUERIES_PATH = Path(
    "/Users/kalyan/Documents/CPR/rag-labs/data/dataset_generation/unece_sprint/final_2_all.jsonl"
)

generated_queries_df = pd.read_json(path_or_buf=GENERATED_QUERIES_PATH, lines=True)

# Preview the first rows only.
generated_queries_df.head(n=5)

Unnamed: 0,query,query_type,query_timestamp,document_id,query_prompt_template,query_user,query_model,query_uuid,generation
0,What is the purpose of this document?,synthetic,1717088261967,CCLW.legislative.8544.rtl_85,query_from_product_queries.txt,,gpt-4-32k,1c6ba46d140ad7b98821c8976d31b444,"{'config': {'generation_engine': 'openai', 'mo..."
1,What is the purpose of this document?,synthetic,1717088261967,CCLW.legislative.8544.rtl_85,query_from_product_queries.txt,,gpt-4-32k,1c6ba46d140ad7b98821c8976d31b444,"{'config': {'generation_engine': 'openai', 'mo..."
2,What is the total budget for 2018?,synthetic,1717088261967,CCLW.legislative.8544.rtl_85,query_from_product_queries.txt,,gpt-4-32k,617f7e809d86979067c42c61534965ce,{'config': {'generation_engine': 'huggingface'...
3,What is the total budget for 2018?,synthetic,1717088261967,CCLW.legislative.8544.rtl_85,query_from_product_queries.txt,,gpt-4-32k,617f7e809d86979067c42c61534965ce,{'config': {'generation_engine': 'huggingface'...
4,What are the main sectors this budget is alloc...,synthetic,1717088261967,CCLW.legislative.8544.rtl_85,query_from_product_queries.txt,,gpt-4-32k,75991e66a9106545e8ce6268fe005aab,"{'config': {'generation_engine': 'openai', 'mo..."


In [61]:
# Distinct generated queries for the document being spot-checked.
# A boolean mask is used instead of interpolating DOCUMENT_ID into a
# `DataFrame.query` expression string, so an ID containing quotes or other
# expression syntax cannot break (or silently change) the filter.
is_current_document = generated_queries_df["document_id"] == DOCUMENT_ID
queries = generated_queries_df.loc[is_current_document, "query"].unique()

queries

array(['Is there a mention of carbon dioxide removal in this document?',
       'Does this plan involve any restructuring of the energy sector?',
       'What is the role of the private sector in this plan?',
       'Is there a specific poverty reduction target in this plan?',
       'Is there any mention of the role of the Diaspora in this policy framework?'],
      dtype=object)

## Run search

If the `corpus_id`s returned for a query are mostly adjacent to one another, the windowing probably isn't working so well. That doesn't seem to be the case here.

In [70]:
# Run semantic search for every query, then print the hit indices and the text
# of the best-matching blocks.
TOP_K = 20  # number of hits retrieved per query
N_RESULTS_TO_PRINT = 5  # number of hits whose text is printed

query_embeddings = MODEL.encode(queries)
# One list of {"corpus_id", "score"} hits per query; `corpus_id` is a row index
# into `embeddings`, and is used below to index `text_blocks`.
semantic_search_results = semantic_search(query_embeddings, embeddings, top_k=TOP_K)

# Distinct names for the query index and the corpus index: the original reused
# `idx` for both, so the inner loop silently overwrote the outer loop variable.
for query_idx, query in enumerate(queries):
    print(f"Query: {query}")
    print("\n")

    idxs = [item["corpus_id"] for item in semantic_search_results[query_idx]]

    print(f"Result indices: {idxs}")
    print(f"Top {N_RESULTS_TO_PRINT} results: ")
    for corpus_id in idxs[:N_RESULTS_TO_PRINT]:
        print("  - " + text_blocks[corpus_id].to_string())
    print("---- \n")

Query: Is there a mention of carbon dioxide removal in this document?


Result indices: [1833, 1832, 1231, 1225, 527, 1229, 1834, 124, 1224, 250, 116, 252, 123, 125, 1072, 526, 525, 1230, 1831, 1073]
Top 5 results: 
  - 6. Phase out of the ozone depleting substances (ODS) Hydro chlorofluorocarbon (HCFC)
  - 5. Reduce greenhouse gas emissions through the promotion of the 'use cleaner technologies', renewable energy and energy efficiency
  - Critical to the success of this sub-sector is the creation of a supporting environment that features a strong legislative framework, investment incentives for both producers and consumers and a skilled workforce. Government is committed to creating the facilitating environment necessary to propel this Sub-sector's revenue earning capabilities and has identified the transportation and electricity generation sectors as areas for future development of renewable energy technology.
  - · Small Scale LNG
  - · The 'Greening' of the Priority Bus Route throu

In [64]:
# Inspect the raw hits (corpus_id + score) for every query.
# The search cell assigns `semantic_search_results`; the previous name used here
# is never defined, so this cell failed on a fresh kernel.
semantic_search_results

[[{'corpus_id': 1833, 'score': 0.6536381840705872},
  {'corpus_id': 1832, 'score': 0.646479070186615},
  {'corpus_id': 1231, 'score': 0.6127904653549194},
  {'corpus_id': 1225, 'score': 0.6017967462539673},
  {'corpus_id': 527, 'score': 0.5974105000495911},
  {'corpus_id': 1229, 'score': 0.5940102338790894},
  {'corpus_id': 1834, 'score': 0.592460036277771},
  {'corpus_id': 124, 'score': 0.5915940999984741},
  {'corpus_id': 1224, 'score': 0.5865622758865356},
  {'corpus_id': 250, 'score': 0.5865479111671448},
  {'corpus_id': 116, 'score': 0.5840904116630554},
  {'corpus_id': 252, 'score': 0.58382248878479},
  {'corpus_id': 123, 'score': 0.5824770927429199},
  {'corpus_id': 125, 'score': 0.5820780992507935},
  {'corpus_id': 1072, 'score': 0.5809653401374817},
  {'corpus_id': 526, 'score': 0.5799942016601562},
  {'corpus_id': 525, 'score': 0.5788995027542114},
  {'corpus_id': 1230, 'score': 0.5786868929862976},
  {'corpus_id': 1831, 'score': 0.5769801139831543},
  {'corpus_id': 1073, 'sc