In [1]:
import ollama
from ollama import chat

from huggingface_hub import snapshot_download

from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.chunking import HybridChunker

from pydantic.v1 import BaseModel

from transformers import AutoTokenizer
from sentence_transformers import CrossEncoder

from BM25 import load_bm25, create_bm25, bm25_search
from util import docling_util
from util.text_splitter import RecursiveTextSplitter, DataFrameFormatter

import os
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# document local path or URL

doc10 = "./test_doc/ÂúãÊ≥∞ÈÜ´Èô¢113Âπ¥Â∑•‰ΩúË®àÁï´.pdf" # cross page table and HAS index, scanned document OCR needed
doc11 = "./test_doc/ÂúãÊ≥∞ÈÜ´Èô¢113Âπ¥Á∂ìË≤ªÈ†êÁÆó.pdf" # large number table, scanned document OCR needed
doc12 = "./test_doc/ÂúãÊ≥∞ÈÜ´Èô¢112Âπ¥Â∑•‰ΩúÂ†±Âëä.pdf" # table with check board inside, scanned document OCR needed
doc13 = "./test_doc/112Âπ¥ÂúãÊ≥∞ÈÜ´ÁôÇË≤°ÂúòÊ≥ï‰∫∫Ë≤°ÂãôÂ†±Âëä.pdf" # loose structured table, scanned document OCR needed
doc15 = "./test_doc/Áï¢Ê•≠Â≠∏ÂàÜÂØ©Ê†∏Ë°®.pdf" # vertical table

# Ollama model tag used as the default model of get_embeddings().
embed_model = "bge-m3:latest"

# Cross-encoder used by reranker() to score (query, chunk) pairs.
# The model is loaded as soon as this cell runs; the alternative is kept for reference.
#rerank_model = CrossEncoder('BAAI/bge-reranker-large', max_length=512)
rerank_model = CrossEncoder('BAAI/bge-reranker-v2-m3', max_length=1024)

# Ollama chat model tag used as the default model of get_completion().
# NOTE(review): the "*-context" tags look like locally built variants — confirm
# they exist in the local Ollama model store before running.
#llm = "deepseek-r1:7b"
#llm = "deepseek-r1:14b"
#llm = "deepseek-r1:32b"
#llm = "deepseek-r1:14b-max-context"
#llm = "deepseek-r1:14b-quarter-context"
llm = "deepseek-r1:14b-eighth-context"

def get_embeddings(texts, model=embed_model):
    """Embed a prompt with Ollama and return its embedding.

    Args:
        texts: Prompt forwarded unchanged to ``ollama.embeddings``; every call
            in this notebook passes a single string.
        model: Ollama embedding model tag (defaults to ``embed_model``).

    Returns:
        The value stored under the ``"embedding"`` key of the response.
    """
    return ollama.embeddings(model=model, prompt=texts)["embedding"]

# PyPdfium without EasyOCR
# --------------------
"""pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = False
#pipeline_options.table_structure_options.do_cell_matching = True

#pipeline_options.table_structure_options.mode = 'accurate'

pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

pypdfium_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
        pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
        )
    }
)"""

# PyPdfium with RapidOCR
# ----------------------
# Download the RapidOCR models from HuggingFace and point docling's OCR options
# at the downloaded detection / recognition / classification ONNX files.
print("Downloading RapidOCR models")
download_path = snapshot_download(repo_id="SWHL/RapidOCR")

# Text detection model.
det_model_path = os.path.join(
    download_path, "PP-OCRv4", "ch_PP-OCRv4_det_infer.onnx"
)
# Text recognition model.
rec_model_path = os.path.join(
    download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_infer.onnx"
)
# Classification model (taken from the PP-OCRv3 folder of the same snapshot).
cls_model_path = os.path.join(
    download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"
)
ocr_options = RapidOcrOptions(
    det_model_path=det_model_path,
    rec_model_path=rec_model_path,
    cls_model_path=cls_model_path,
    #force_full_page_ocr=True
)

# PDF pipeline: OCR and table-structure recognition are both enabled.
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True
#pipeline_options.table_structure_options.do_cell_matching = False
pipeline_options.table_structure_options.do_cell_matching = True
pipeline_options.table_structure_options.mode = 'accurate'

# Also render page and picture images at 2x scale.
pipeline_options.images_scale = 2
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True

pipeline_options.ocr_options = ocr_options

# Converter used by the later cells; PDFs go through the pypdfium2 backend
# with the pipeline options configured above.
pypdfium_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
        pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
        )
   }
)

# print json
def show_json(data):
    """Pretty-print ``data`` as 4-space indented JSON, keeping non-ASCII text.

    Args:
        data: A JSON string (parsed before printing), a dict or list, or a
            pydantic v1 ``BaseModel`` instance (dumped through ``.dict()``).
            Any other value prints nothing.
    """
    if isinstance(data, str):
        payload = json.loads(data)
    elif isinstance(data, (dict, list)):
        payload = data
    elif isinstance(data, BaseModel):
        payload = data.dict()
    else:
        return
    print(json.dumps(payload, indent=4, ensure_ascii=False))

Downloading RapidOCR models


Fetching 21 files: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 21/21 [00:00<00:00, 117440.51it/s]


# Docling reader, docling format parser output

In [25]:
#doc_source = [doc, doc1, doc2, doc3, doc4, doc5, doc6]
#doc_source = [doc4, doc5, doc7, doc8, doc9, doc10, doc11, doc12, doc13, doc14]
#doc_source = [doc10, doc11, doc12, doc13]                                      # scanned docs and loose table
# Documents converted in this run (earlier selections are kept above for reference).
doc_source = [doc15]

conv_results = pypdfium_converter.convert_all(
    doc_source,
    raises_on_error=True,  # abort on a failed conversion; the loop below reads .document without checking status
)

# Materialize the results so they can be indexed and iterated more than once.
conv_results_list = list(conv_results)

# Hybrid chunking: tokenizer-aware chunks of at most max_tokens, with peer merging enabled.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")

hybrid_chunker = HybridChunker(
    tokenizer=tokenizer,
    max_tokens=8192,
    merge_peers=True  # optional, defaults to True
)

# Collect the chunks of every converted document into one flat list.
all_chunks = []
for conv_res in conv_results_list:
    docling_docs = conv_res.document
    chunk_iter = hybrid_chunker.chunk(dl_doc=docling_docs)
    chunks = list(chunk_iter)
    all_chunks += chunks




In [27]:
# Inspection only: for every chunk, print the raw text and the serialized text
# together with their token counts under the chunking tokenizer.
for i, chunk in enumerate(all_chunks[:]):
    print(f"=== {i} ===")
    txt_tokens = len(tokenizer.tokenize(chunk.text))
    print(f"chunk.text ({txt_tokens} tokens):\n{repr(chunk.text)}")

    # The serialized form is the one that is tokenized, embedded and indexed later.
    ser_txt = hybrid_chunker.serialize(chunk=chunk)
    ser_tokens = len(tokenizer.tokenize(ser_txt))
    print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{repr(ser_txt)}")
    #print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{ser_txt}")

    print()

=== 0 ===
chunk.text (6117 tokens):
'Â≠∏Á®ãÔºöË≥áË®äÁßëÂ≠∏Á≥ªÁè≠Á∫ßÔºöË≥áÂõõÁî≤Â≠∏ËôüÔºö111016035ÂßìÂêçÔºöËêßÂ§ßÁ•êÁîüÊó•Ôºö911118\nÂàóÂç∞Êó•ÊúüÔºö114/4/15\nÂàóÂç∞ÊôÇÈñìÔºö09Ôºö46Ôºö53\nÊ•≠Â≠∏ÂàÜÔºö133\nÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶,  = . ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, Â≠∏ÂàÜÊàê = . ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, ÁßëÁõÆÂêç = Âü∫ÂøÖ‰øÆËØæÁ®ãÔºàÂ∑≤‰øÆ58Â≠∏ÂàÜÔºâ. ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, Â≠∏ÂàÜ = . ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, Êàê = . ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, ÁßëÁõÆÂêç = ÈñÄÈÅ∏Ê©üÂô®Â≠∏Áøí. ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, Â≠∏ÂàÜ = 3. ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, Êàê = 90. ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶, ÁßëÁõÆÂêç = . ÈÄö

In [26]:
print(conv_results_list[0].document.export_to_markdown())

## ÂúãÁ´ãËá∫ÂåóÊïôËÇ≤Â§ßÂ≠∏Â≠∏Áîü‰∏öÂ≠∏ÂàÜÊ†∏Ë°®Á¢∫ÂñÆ

Â≠∏Á®ãÔºöË≥áË®äÁßëÂ≠∏Á≥ªÁè≠Á∫ßÔºöË≥áÂõõÁî≤Â≠∏ËôüÔºö111016035ÂßìÂêçÔºöËêßÂ§ßÁ•êÁîüÊó•Ôºö911118

ÂàóÂç∞Êó•ÊúüÔºö114/4/15

ÂàóÂç∞ÊôÇÈñìÔºö09Ôºö46Ôºö53

Ê•≠Â≠∏ÂàÜÔºö133

| ÁßëÁõÆÂêç                                              |    | Â≠∏ÂàÜÊàê   | ÁßëÁõÆÂêç                                              | Â≠∏ÂàÜ   | Êàê   | ÁßëÁõÆÂêç                        | Â≠∏ÂàÜ   | Êàê   | ÁßëÁõÆÂêç   | Â≠∏ÂàÜÊàê   |
|-----------------------------------------------------|----|----------|-----------------------------------------------------|--------|------|-------------------------------|--------|------|----------|----------|
| ÈÄöË≠òË™≤Á®ãÔºàÂ∑≤‰øÆ18Â≠¶ÂàÜÔºâ ÔºàÂ∑≤‰øÆÂøÖ‰øÆ0Â≠¶ÂàÜÔºåÈÅ•‰øÆ18Â≠¶    |    |          | Âü∫ÂøÖ‰øÆËØæÁ®ãÔºàÂ∑≤‰øÆ58Â≠∏ÂàÜÔºâ                            |        |      | ÈñÄÈÅ∏Ê©üÂô®Â≠∏Áøí                  | 3      | 90   |          |          |
| ÂàÜ)                                                 |    |          | ÔºàÂ∑≤‰

# Rechunk large chunks with RecursiveTextSplitter

In [None]:
# Collect (and print) every chunk whose serialized text exceeds 1024 tokens;
# these are the chunks that get re-split in the next cell.
big_chunk = []

for i, chunk in enumerate(all_chunks):
    ser_txt = hybrid_chunker.serialize(chunk=chunk)
    ser_tokens = len(tokenizer.tokenize(ser_txt))
    if ser_tokens > 1024:
        print(f"=== {i} ===")
        print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{ser_txt}")
        big_chunk.append(chunk)
        print()

In [None]:
# Splitter used to break oversized serialized chunks into pieces of at most
# max_tokens tokens.
# NOTE(review): the exact meaning of overlap / min_length_ratio is defined in
# util.text_splitter — confirm there.
splitter = RecursiveTextSplitter(tokenizer=tokenizer, max_tokens=1024, overlap=150, min_length_ratio=1)

# Plain-text pieces produced from the oversized chunks (strings, not docling chunks).
rechunked_large_chunks = []

for i, chunk in enumerate(big_chunk):
    ser_txt = hybrid_chunker.serialize(chunk=chunk)
    # Re-check with the splitter's own token counter before splitting.
    if splitter.tokenize_len(ser_txt) > splitter.max_tokens:
        sub_chunks = splitter.split_text(ser_txt)
        rechunked_large_chunks.extend(sub_chunks)
    else:
        rechunked_large_chunks.append(ser_txt)


# Disabled scratch code kept as a bare string. As the last expression of the
# cell it is echoed as the cell's output when the cell runs.
"""ser_txt = hybrid_chunker.serialize(chunk=rechunked_large_chunks[1])
if splitter.tokenize_len(ser_txt) > splitter.max_tokens:
    sub_chunks = splitter.split_text(ser_txt)
    rechunked_large_chunks.extend(sub_chunks)
else:
    rechunked_large_chunks.append(ser_txt)"""

In [None]:
# Inspection only: print every re-split piece with its token count.
# The pieces are already plain strings, so no serialization happens here even
# though the printed label still reads "chunker.serialize(chunk)".
for i, chunk in enumerate(rechunked_large_chunks[:]):
    
    ser_txt = chunk
    ser_tokens = len(tokenizer.tokenize(ser_txt))

    print(f"=== {i} ===")
    #print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{repr(ser_txt)}")
    print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{ser_txt}")
    print()

# Or table extraction result

In [None]:
# table chunks
all_tables = docling_util.extract_tables(conv_results_list)

In [None]:
# Formatter that turns an extracted table into row-based text chunks
# (index hidden, max_tokens budget of 1024).
table_formatter = DataFrameFormatter(tokenizer=tokenizer, show_index=False, max_tokens=1024)

# Flatten the per-table chunk lists into a single list of table chunks.
table_chunks = [
    row_chunk
    for table in all_tables
    for row_chunk in table_formatter.chunk_rows(table)
]

In [None]:
# Inspection only: print every table chunk with its token count.
# Table chunks are tokenized directly as text; the printed label still reads
# "chunker.serialize(chunk)" although nothing is serialized here.
for i, chunk in enumerate(table_chunks[:]):
    
    ser_txt = chunk
    ser_tokens = len(tokenizer.tokenize(ser_txt))

    print(f"=== {i} ===")
    #print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{repr(ser_txt)}")
    print(f"chunker.serialize(chunk) ({ser_tokens} tokens):\n{ser_txt}")
    print()

# Merge all chunks

In [None]:
# Merge the docling chunks with the re-split oversized chunks and the table chunks.
# Docling chunks are objects while the added pieces are plain strings (the next
# cell relies on the same distinction), so dropping strings first makes this
# cell safe to re-run: a second run no longer appends the same pieces again.
docling_chunks = [chunk for chunk in all_chunks if not isinstance(chunk, str)]
all_chunks = docling_chunks + list(rechunked_large_chunks) + list(table_chunks)
# NOTE(review): the oversized chunks collected in `big_chunk` stay in the list
# next to their re-split pieces, so both end up indexed — confirm that is intended.
all_chunks

In [None]:
# Split the merged chunks into parallel lists of text and metadata.
# Plain-string chunks (re-split pieces, table chunks) carry no metadata;
# docling chunks are serialized and their metadata exported as a JSON dict.
node_text, node_metadatas = [], []
for chunk in all_chunks:
    # isinstance is the idiomatic type test and also accepts str subclasses.
    if isinstance(chunk, str):
        node_text.append(chunk)
        node_metadatas.append(None)
    else:
        node_text.append(hybrid_chunker.serialize(chunk=chunk))
        node_metadatas.append(chunk.meta.export_json_dict())

In [None]:
# Embed every merged chunk text (one Ollama request per text).
embedded_text = [get_embeddings(text) for text in node_text]

# Embed the table chunks separately for the dedicated table collection.
# The source list is `table_chunks` (built from `all_tables`); the previously
# used name `all_tables_chunks` is not defined in the visible notebook and
# raised NameError on a fresh kernel.
embedded_tables = [get_embeddings(text) for text in table_chunks]

# Test on BM25 retrieval

In [None]:
#query = "ÂéªÊñ∞Âä†Âù°Âá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëË≤ªÁî®Ôºü"
#query = "Áßë‰∏ªÁÆ°ÂúãÂÖßÂá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëÈ§êÈõúÂÆøË≤ªÔºü"
query = "Á≥ªÁµ±ÈñãÁôº‰πãÂÆâÂÖ®ÁÆ°ÁêÜÔºåÊáâÂåÖÂê´Âì™‰∫õÈ†ÖÁõÆÔºü"

# Build a BM25 index over the merged chunk texts, persist it as JSON, reload it
# and run the query against the reloaded index.
output_dir = 'test_BM25index'
# exist_ok replaces the separate existence check and the gap between check and create.
os.makedirs(output_dir, exist_ok=True)
bm25_index_path = os.path.join(output_dir, 'bm25_mixed.json')

# as json
bm25_mixed_json = create_bm25(node_text, 'mixed')
bm25_mixed_json.save(bm25_index_path)

loaded_bm25_mixed_json = load_bm25(bm25_index_path, node_text)

print("load from json and mixure search:", query)
print()
results_json = loaded_bm25_mixed_json.search(query, top_k=5)
# Each result is unpacked as (doc_id, score); doc_id indexes into node_text.
for doc_id, score in results_json:
    print(f"Doc ID: {doc_id}, Score: {score:.4f}, Text: {node_text[doc_id]}\n")

# Qdrant


In [3]:
from qdrant_client import QdrantClient
from qdrant_client.http import models
from qdrant_client.http.models import PointStruct

class qdrant_DBConnector:
    """Thin wrapper around one Qdrant collection on the local server.

    Creating an instance connects to ``http://localhost:6333`` and calls
    ``recreate_collection`` for ``collection_name`` (documented by qdrant-client
    to delete an existing collection of that name first), using cosine distance
    and a vector size probed from ``get_embeddings``.
    """

    # Number of points sent per upsert request.
    UPSERT_BATCH_SIZE = 64
    # Page size used when scrolling through the whole collection.
    SCROLL_PAGE_SIZE = 256

    def __init__(self, collection_name):#, embedding_fn):
        """Connect to the local Qdrant server and (re)create the collection.

        Args:
            collection_name: Name of the collection this connector works on.
        """
        self.qdrant_client = QdrantClient("http://localhost:6333")

        self.collection_name = collection_name

        # create collection; the vector size is taken from a probe embedding so
        # it always matches the configured embedding model.
        self.collection = self.qdrant_client.recreate_collection(
            collection_name = collection_name,
            vectors_config = models.VectorParams(
                distance = models.Distance.COSINE,
                size=len(get_embeddings("‰Ω†Â•Ω"))),
            optimizers_config = models.OptimizersConfigDiff(memmap_threshold=20000),
            hnsw_config = models.HnswConfigDiff(on_disk=True, m=16, ef_construct=100)
        )
        # Disabled alternative: client-side embedding through
        # qdrant_client.set_model(embedding_fn).

    def upsert_vector(self, vectors, data):
        """Insert one point per non-empty vector, using its index as point id.

        Args:
            vectors: Sequence of embedding vectors; empty vectors are skipped
                (their index is then simply missing from the collection).
            data: Object exposing parallel ``text`` and ``metadata`` sequences;
                entry ``i`` becomes the payload of vector ``i``.

        Raises:
            ValueError: If ``data`` has fewer texts or metadata entries than
                there are vectors. Raised before anything is written, so a
                mismatch can no longer leave a half-filled collection.
        """
        texts, metadatas = data.text, data.metadata
        if len(texts) < len(vectors) or len(metadatas) < len(vectors):
            raise ValueError(
                f"got {len(vectors)} vectors but only {len(texts)} texts and "
                f"{len(metadatas)} metadata entries"
            )

        # NOTE(review): the vector dimension is still not validated against the
        # collection's configured size; Qdrant is left to reject mismatches.
        points = [
            PointStruct(id=i,
                        vector=vector,
                        payload={"text": texts[i],
                                 "metadata": metadatas[i]})
            for i, vector in enumerate(vectors)
            if len(vector) != 0
        ]

        # Send the points in batches instead of one request per point.
        batch_size = self.UPSERT_BATCH_SIZE
        for start in range(0, len(points), batch_size):
            self.qdrant_client.upsert(
                collection_name=self.collection_name,
                points=points[start:start + batch_size],
            )

        print("upsert finish")

    def retrieved_all(self):
        """Return every point of the collection (payload included, no vectors).

        The collection is paged through with ``scroll`` rather than retrieving
        ids ``0..count-1``: ``upsert_vector`` skips empty vectors, so ids are
        not guaranteed to be contiguous and an id range would miss points.
        """
        records, offset = [], None
        while True:
            page, offset = self.qdrant_client.scroll(
                collection_name=self.collection_name,
                limit=self.SCROLL_PAGE_SIZE,
                offset=offset,
                with_payload=True,
                with_vectors=False,
            )
            records.extend(page)
            # A None offset signals that the last page has been returned.
            if offset is None:
                return records

    def vector_search(self, vector, top_k):
        """Return the ``top_k`` nearest points to ``vector``, payloads included."""
        # vector search qdrant DB
        result = self.qdrant_client.search(
            collection_name=self.collection_name,
            query_vector=vector,
            limit=top_k,
            append_payload=True,
        )
        return result
    
    def vector_search_json(self, vector, top_k):
        """Run ``vector_search`` and reshape the hits into a plain dict.

        Returns:
            ``{"chunk_<point id>": {"text", "rank", "score"}}`` where ``rank``
            is the 0-based position of the hit in the search result.
        """
        hits = self.vector_search(vector, top_k)
        vector_result_json = {
            f"chunk_{item.id}": {
                "text": item.payload['text'],
                "rank": index,
                "score": item.score
            }
            for index, item in enumerate(hits)
        }

        return vector_result_json

In [None]:
class DataObject:
    """Pairs chunk texts with their per-chunk metadata for upserting.

    Attributes are set in ``__init__``: ``text`` holds the texts as given and
    ``metadata`` holds the metadata, or one fresh empty dict per text when the
    given metadata is falsy (``None`` or empty).
    """

    def __init__(self, text, metadata):
        self.text = text
        if metadata:
            self.metadata = metadata
        else:
            # No metadata supplied: give every text its own empty dict.
            self.metadata = [dict() for _ in range(len(text))]

data = DataObject(node_text, node_metadatas)
data.text[0]

In [None]:
# Collection holding every merged chunk (text + metadata payload).
vector_db = qdrant_DBConnector("qdrant_test")
# add data to db
vector_db.upsert_vector(embedded_text, data)

# Separate collection holding only the table chunks (no metadata).
# The texts come from `table_chunks`; the previously used name
# `all_tables_chunks` is not defined in the visible notebook and raised NameError.
table_datas = DataObject(table_chunks, [])
vector_db_table = qdrant_DBConnector("qdrant_table")
# add data to db
vector_db_table.upsert_vector(embedded_tables, table_datas)

In [None]:
#query = "ÂúãÂÖßÂá∫Â∑ÆË≤ªÁî®Ê®ôÊ∫ñ"
#query = "ÂúãÂ§ñÂá∫Â∑ÆÔºåÈôÑË°®‰∫î "
query = "Á≥ªÁµ±ÈñãÁôº‰πãÂÆâÂÖ®ÁÆ°ÁêÜÔºåÊáâÂåÖÂê´Âì™‰∫õÈ†ÖÁõÆÔºü"

embedded_query = get_embeddings(query)
results = vector_db.vector_search(embedded_query, 5)
#results = vector_db_table.vector_search(embedded_query, 5)
print(f"Â∞ãÊâæ „Äå{query}„Äç:")
#print(results[0])

for i, point in enumerate(results):
    print(f"\nüîπ Score: {point.score:.2f} - point ID: {point.id}:")
    print(point.payload['text'])
    print()


# Try chatting!

In [None]:
"""stream = chat(
    model='deepseek-r1:7b',
    messages=[{'role': 'user', 'content': 'Â¶≥Â•Ω'}],
    stream=True,
)

for chunk in stream:
  print(chunk['message']['content'], end='', flush=True)"""

In [None]:
def bm25_retrieval(query, vector_db_name=vector_db, top_k=3):
    """Run a BM25 search over every text stored in a Qdrant collection.

    Args:
        query: Query string handed to ``bm25_search``.
        vector_db_name: Connector whose stored points form the corpus
            (defaults to the ``vector_db`` object that exists when this
            function is defined).
        top_k: Number of hits requested from ``bm25_search``.

    Returns:
        ``{"chunk_<id>": {"text", "rank", "score"}}`` with ``rank`` being the
        0-based position of the hit in the BM25 result.
    """
    corpus = [point.payload['text'] for point in vector_db_name.retrieved_all()]

    # presumably each hit is (corpus index, score, text) — verify against BM25.bm25_search
    hits = bm25_search(corpus=corpus, query=query, top_k=top_k)

    ranked = {}
    for position, hit in enumerate(hits):
        ranked[f"chunk_{hit[0]}"] = {
            "text": hit[2],
            "rank": position,
            "score": hit[1]
        }
    return ranked

def rrf(ranks, k=1):
    """Fuse several ranked result sets with reciprocal rank fusion.

    Args:
        ranks: Iterable of dicts mapping a chunk id to a dict that has at
            least ``"text"`` and ``"rank"`` keys.
        k: Constant added to every rank before taking the reciprocal.

    Returns:
        Dict of chunk id -> ``{"score", "text"}`` ordered by descending fused
        score; ``text`` comes from the first result set containing the id.
    """
    fused = {}
    for ranking in ranks:
        for chunk_id, entry in ranking.items():
            if chunk_id in fused:
                slot = fused[chunk_id]
            else:
                slot = fused[chunk_id] = {"score": 0, "text": entry["text"]}
            # Each appearance contributes 1 / (k + rank) to the fused score.
            slot["score"] += 1.0 / (k + entry["rank"])

    by_score = sorted(fused.items(), key=lambda pair: pair[1]["score"], reverse=True)
    return dict(by_score)

def get_completion(prompt, model=llm):
    """Send ``prompt`` as a single user message to Ollama and return the reply text.

    Args:
        prompt: Full prompt text used as the content of the user message.
        model: Ollama chat model tag (defaults to ``llm``).

    Returns:
        ``response.message.content`` of the non-streaming chat response.
    """
    # Streaming, JSON format and temperature options are intentionally left at
    # their defaults.
    response = chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.message.content


In [None]:
query = "Âéª‰∏≠ÂúãÂ§ßÈô∏Âá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëË≤ªÁî®Ôºü"

print("search for:", query)
bm25_retrieve_result = bm25_retrieval(query, top_k=3)
show_json(bm25_retrieve_result)

In [None]:
embedded_query = get_embeddings(query)
vector_result_json = vector_db.vector_search_json(embedded_query, 3)
print("search for:", query)

show_json(vector_result_json)

In [None]:
hybrid_result = rrf([vector_result_json, bm25_retrieve_result])

print(json.dumps(hybrid_result, indent=4, ensure_ascii=False))

In [None]:
def hybrid_retriever(query, top_k=3):
    """Fuse dense (vector) and sparse (BM25) retrieval for ``query``.

    Both retrievers are asked for ``top_k`` hits from the main collection and
    their rankings are combined with ``rrf``.
    """
    query_vector = get_embeddings(query)
    dense_hits = vector_db.vector_search_json(query_vector, top_k)
    sparse_hits = bm25_retrieval(query, top_k=top_k)
    return rrf([dense_hits, sparse_hits])

def reranker(query, retrieved_result, threshold=0, min_results=3, max_results=5):
    """Rerank retrieved chunks with the cross-encoder and keep the best ones.

    Args:
        query: Query string paired with every chunk text for scoring.
        retrieved_result: Dict of chunk id -> dict with a ``"text"`` key.
        threshold: Chunks must score strictly above this value to be kept.
        min_results: If fewer chunks pass the threshold, the top
            ``min_results`` chunks are returned regardless of score.
        max_results: Upper bound on the number of chunks returned when enough
            chunks pass the threshold.

    Returns:
        List of ``(score, text)`` tuples sorted by descending score; empty
        when ``retrieved_result`` is empty.
    """
    text_chunks = [val['text'] for val in retrieved_result.values()]
    # Nothing retrieved: skip the model call, which is not safe on an empty batch.
    if not text_chunks:
        return []

    scores = rerank_model.predict([(query, doc) for doc in text_chunks])
    sorted_list = sorted(zip(scores, text_chunks), key=lambda x: x[0], reverse=True)

    reranked_result = [pair for pair in sorted_list if pair[0] > threshold]
    if len(reranked_result) < min_results:
        # Too few confident hits: fall back to the best-scoring chunks overall.
        return sorted_list[:min_results]
    return reranked_result[:max_results]

def format_rag_output(reranked_list):
    """Render reranked ``(score, text)`` pairs as numbered document sections.

    Returns:
        ``"Document <n>:\\n<text>"`` sections (numbered from 1) joined by blank
        lines; an empty string for an empty list.
    """
    sections = []
    for number, item in enumerate(reranked_list, start=1):
        sections.append(f"Document {number}:\n{item[1]}")
    return "\n\n".join(sections)

def hybrid_retriever_with_BM25table(query, top_k=3):
    """Fuse dense retrieval, BM25 over all chunks and BM25 over table chunks.

    The table collection numbers its points independently of the main
    collection, so its ``chunk_<id>`` keys are re-labelled ``table_chunk_<id>``
    before fusion. Without that, ``rrf`` would merge a table chunk with an
    unrelated text chunk that happens to share the same id.

    Returns:
        The ``rrf`` fusion of the three rankings (chunk key -> score/text).
    """
    embedded_query = get_embeddings(query)
    dense_hits = vector_db.vector_search_json(embedded_query, top_k)
    text_bm25_hits = bm25_retrieval(query, top_k=top_k)
    table_bm25_hits = {
        f"table_{key}": val
        for key, val in bm25_retrieval(query, vector_db_table, top_k).items()
    }
    # NOTE(review): table chunks are also merged into the main collection, so
    # the same text can appear under both a chunk_ and a table_chunk_ key.
    return rrf([dense_hits, text_bm25_hits, table_bm25_hits])

In [None]:
#query = "ÂéªÂçóÈùûÂÖ±ÂíåÂúãÂá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëË≤ªÁî®Ôºü"
query = "ÈáçË¶ÅÊúÉË®àÈ†ÖÁõÆ‰∏≠ÔºåÁèæÈáëÂèäÁ¥ÑÁï∂ÁèæÈáëÁöÑÈäÄË°åÂ≠òÊ¨æÊòØÂ§öÂ∞ëÔºü"

#print(format_rag_output(reranker(query, hybrid_retriever(query, 20), 0.45)))
print(format_rag_output(reranker(query, hybrid_retriever(query, top_k=35), 0.45)))

print()
#for chunk in reranker(query, hybrid_retriever(query, 20)):
for chunk in reranker(query, hybrid_retriever(query, top_k=35)):
    print(chunk)
    print()

In [None]:
# Job instruction
instruction = """
‰Ω†ÊòØÂè∞ÁÅ£ÂúãÊ≥∞ÈáëÊéßÁöÑËÅäÂ§©Ê©üÂô®‰∫∫ÁßòÊõ∏ÔºåÂ∞àÈñÄÁÇ∫Áî®Êà∂Ëß£Á≠îÂÖ¨Âè∏ÂÖßÈÉ®ÁöÑÂ∑ÆÊóÖË≤ªÁî®„ÄÅÂ†±Èä∑Ê®ôÊ∫ñÁ≠âÁõ∏ÈóúÂïèÈ°å„ÄÇ
‰Ω†ÁöÑ‰ªªÂãôÊòØÊ†πÊìö‰Ω†Áç≤ÂæóÁöÑ„ÄåÂèÉËÄÉÊñá‰ª∂„ÄçÔºåÂ∞ç„ÄåÁî®Êà∂ÂïèÈ°å„ÄçÊÆµËêΩÁöÑÂïèÈ°åÈÄ≤Ë°åÂõûÁ≠î„ÄÇ

Ë´ãÂãôÂøÖÊ†πÊìö„ÄåÂèÉËÄÉÊñá‰ª∂„Äç‰∏≠ÁöÑÂÖ∑È´îË≥áË®ä‰ΩúÁ≠îÔºå‰∏¶Ê≥®ÊÑè‰ª•‰∏ãË¶ÅÊ±ÇÔºö
1. Ëã•Êüê‰∫õÊñá‰ª∂ÂÖßÂÆπÂ∞çÂõûÁ≠îÁÑ°Âπ´Âä©ÔºåÂèØ‰ª•ÂøΩÁï•Ôºå‰∏çÊé°Áî®„ÄÇ
2. ÂõûÁ≠îÊáâÁ∞°ÊΩî„ÄÅÊòéÁ¢∫ÔºåÈÅøÂÖçÂÜóÈï∑ÔºåÂÉÖÊèêÂèñÈóúÈçµË≥áË®ä„ÄÇ
3. Ëã•ÂèÉËÄÉÊñá‰ª∂ÁÑ°Ê≥ïÊèê‰æõÁ≠îÊ°àÔºåË´ãÁõ¥Êé•ÂõûÁ≠î„ÄåÊàëÁÑ°Ê≥ïÊ†πÊìöÁèæÊúâË≥áÊñôÂõûÁ≠îÈÄôÂÄãÂïèÈ°å„ÄçÔºå‰∏¶‰∏çË¶ÅËá™Ë°åË£úÂÖÖ„ÄÇ

Âö¥Ê†º‰ΩøÁî®ÁπÅÈ´î‰∏≠ÊñáÔºåÈÅøÂÖçËã±ÊñáÊàñÁ∞°È´î‰∏≠Êñá„ÄÇ
"""

# User input
#input_text = "Âéª‰∏≠ÂúãÂ§ßÈô∏Âá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëË≤ªÁî®Ôºü"
#input_text = "ÂéªÂçóÈùûÂÖ±ÂíåÂúãÂá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëË≤ªÁî®Ôºü"
#input_text = "ÂéªÊô∫Âà©Âá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëË≤ªÁî®Ôºü"
#input_text = "Áßë‰∏ªÁÆ°ÂúãÂÖßÂá∫Â∑ÆÂèØ‰ª•Áî≥Ë´ãÂ§öÂ∞ëÈ§êÈõúÂÆøË≤ªÔºü"
#input_text = "ÂïÜÂìÅË≤®Âπ£ÊòØ‰ªÄÈ∫ºÔºü"
#input_text = "Á≥ªÁµ±ÈñãÁôº‰πãÂÆâÂÖ®ÁÆ°ÁêÜÔºåÊáâÂåÖÂê´Âì™‰∫õÈ†ÖÁõÆÔºü"
#input_text = "ÂúãÊ≥∞ÂÑ™ÊÉ†APP ÂæåÂè∞Á∂≠Ë≠∑Á≥ªÁµ±ÔºåÊúâ‰ªÄÈ∫ºÂäüËÉΩÔºü"
#input_text = "ÂúãÊ≥∞ÂÑ™ÊÉ†APP ÂæåÂè∞Á∂≠Ë≠∑Á≥ªÁµ±ÔºåÂäüËÉΩÊ¨äÈôêÁî≥Ë´ãÊµÅÁ®ãÁÇ∫‰ΩïÔºü" # Fail, didn't have the ability to recognize flow chart
#input_text = "ÂúãÊ≥∞ÂÑ™ÊÉ† APPÂæåÂè∞Á∂≠Ë≠∑Á≥ªÁµ±‰ΩøÁî®ËÄÖÊ¨äÈôêË°®ÔºåÊúâ‰ªÄÈ∫ºÊ¨Ñ‰ΩçÔºü"
#input_text = "ÈáëËûçÁßëÊäÄÂâµÊñ∞Ê•≠Âãô‰πãÁ´ãÊ°àÂèäÊ¶ÇÂøµÊ∏¨Ë©¶ÊµÅÁ®ãÁÇ∫‰ΩïÔºü"  # Fail? didn't recognize flow chart but there's explaination on previous page
#input_text = "Ë®≠Ë®àÈòøÁôºÊôÇÊúâ‰ªÄÈ∫ºËÉåÊôØËâ≤ÂΩ©ÈôêÂà∂Ôºü" # Fail? on picture detail explaination, but did catch the words on pics
#input_text = "Ê†πÊìö111Âπ¥Â∫¶4-6Êúà‰ªΩÊçêÊ¨æÂèäÊçêË¥àÁâ©Ë≥áÂæµ‰ø°ÂêçÂÜäÔºåË™∞Êçê‰∫ÜÈµùËÇâÊπØÔºü" # doc14 fail on parsing compact table index, but seems like it does not interfere with llm answer
#input_text = "Ê†πÊìö111Âπ¥Â∫¶4-6Êúà‰ªΩÊçêÊ¨æÂèäÊçêË¥àÁâ©Ë≥áÂæµ‰ø°ÂêçÂÜäÔºåÊ¢ÖÂäõÂåñÂ≠∏Â∑•Ê•≠ÊúâÈôêÂÖ¨Âè∏ÂÅö‰∫Ü‰ªÄÈ∫ºÔºü" # same as above
#input_text = "ÈáçË¶ÅÊúÉË®àÈ†ÖÁõÆ‰∏≠ÔºåÁèæÈáëÂèäÁ¥ÑÁï∂ÁèæÈáëÁöÑÈäÄË°åÂ≠òÊ¨æÔºå111Âπ¥Ëàá112Âπ¥ÂàÜÂà•ÊòØÂ§öÂ∞ëÔºü" # loose structured table, success
#input_text = "Âπ´ÊàëÁµ±Êï¥‰∏Ä‰∏ã‰∏çÂãïÁî¢„ÄÅÂª†ÊàøÂèäË®≠ÂÇô‰∏≠ÔºåÊàêÊú¨ÔºåÊäòËàä‰ª•ÂèäÊ∑®Â∏≥Èù¢ÈáëÈ°ç" # loose structured table, poor recognize result, but seemingly success
#input_text = "Ë´ãÁµ¶ÊàëÊáâÊî∂Â∏≥Ê¨æ(Ê∑®È°ç)‰πãÂ∏≥ÈΩ°ÂàÜÊûêÁöÑÂÖßÂÆπ"
input_text = "Ë´ãÁµ¶Êàë112Âπ¥ÔºåËó•ÂìÅÈÄ≤Ë≤®‰∫§ÊòìÂ∞çË±°ÁöÑÂêçÁ®±ÔºåÈáëÈ°çÂèäÊØîÁéá"

# RAG retrieved documents
reranked_list = reranker(input_text, hybrid_retriever(input_text, 20), 0.45)
rag_docs = format_rag_output(reranked_list)

# Prompt template
prompt = f"""
# ‰ªªÂãô
{instruction}

# ÂèÉËÄÉÊñá‰ª∂
{rag_docs}

# Áî®Êà∂ÂïèÈ°å
{input_text}
"""

print("==== Prompt ====")
print(prompt)
print("================")

# llm calling
if len(rag_docs) != 0:
    response = get_completion(prompt)
else:
    response = "YAh, you retrieved NOTHING!"
print(response)

In [None]:
text = """

==== Prompt ====

# ‰ªªÂãô

‰Ω†ÊòØÂè∞ÁÅ£ÂúãÊ≥∞ÈáëÊéßÁöÑËÅäÂ§©Ê©üÂô®‰∫∫ÁßòÊõ∏ÔºåÂ∞àÈñÄÁÇ∫Áî®Êà∂Ëß£Á≠îÂÖ¨Âè∏ÂÖßÈÉ®ÁöÑÂ∑ÆÊóÖË≤ªÁî®„ÄÅÂ†±Èä∑Ê®ôÊ∫ñÁ≠âÁõ∏ÈóúÂïèÈ°å„ÄÇ
‰Ω†ÁöÑ‰ªªÂãôÊòØÊ†πÊìö‰Ω†Áç≤ÂæóÁöÑ„ÄåÂèÉËÄÉÊñá‰ª∂„ÄçÔºåÂ∞ç„ÄåÁî®Êà∂ÂïèÈ°å„ÄçÊÆµËêΩÁöÑÂïèÈ°åÈÄ≤Ë°åÂõûÁ≠î„ÄÇ

Ë´ãÂãôÂøÖÊ†πÊìö„ÄåÂèÉËÄÉÊñá‰ª∂„Äç‰∏≠ÁöÑÂÖ∑È´îË≥áË®ä‰ΩúÁ≠îÔºå‰∏¶Ê≥®ÊÑè‰ª•‰∏ãË¶ÅÊ±ÇÔºö
1. Ëã•Êüê‰∫õÊñá‰ª∂ÂÖßÂÆπÂ∞çÂõûÁ≠îÁÑ°Âπ´Âä©ÔºåÂèØ‰ª•ÂøΩÁï•Ôºå‰∏çÊé°Áî®„ÄÇ
2. ÂõûÁ≠îÊáâÁ∞°ÊΩî„ÄÅÊòéÁ¢∫ÔºåÈÅøÂÖçÂÜóÈï∑ÔºåÂÉÖÊèêÂèñÈóúÈçµË≥áË®ä„ÄÇ
3. Ëã•ÂèÉËÄÉÊñá‰ª∂ÁÑ°Ê≥ïÊèê‰æõÁ≠îÊ°àÔºåË´ãÁõ¥Êé•ÂõûÁ≠î„ÄåÊàëÁÑ°Ê≥ïÊ†πÊìöÁèæÊúâË≥áÊñôÂõûÁ≠îÈÄôÂÄãÂïèÈ°å„ÄçÔºå‰∏¶‰∏çË¶ÅËá™Ë°åË£úÂÖÖ„ÄÇ

Âö¥Ê†º‰ΩøÁî®ÁπÅÈ´î‰∏≠ÊñáÔºåÈÅøÂÖçËã±ÊñáÊàñÁ∞°È´î‰∏≠Êñá„ÄÇ


# ÂèÉËÄÉÊñá‰ª∂
Document 1:
Ê∞ëÂúã 112 Âπ¥Â∫¶
ÂñÆ‰ΩçÔºöÊñ∞Âè∞Âπ£
1, ‰∫§Êòì‰∫∫ÂêçÁ®± = Ë£ïÂà©ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 1, ÈáëÈ°ç = $1,230,544,978. 1, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = $538,625,956. 1, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 45%. 2, ‰∫§Êòì‰∫∫ÂêçÁ®± = Âè∞ÁÅ£Â§ßÊòåËèØÂòâËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 2, ÈáëÈ°ç = 425,136,734. 2, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 177,372,803. 2, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 15%. 3, ‰∫§Êòì‰∫∫ÂêçÁ®± = ‰πÖË£ï‰ºÅÊ•≠ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 3, ÈáëÈ°ç = 163,077,219. 3, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 67,343,897. 3, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 6%. 4, ‰∫§Êòì‰∫∫ÂêçÁ®± = ËèØÂÆâËó•ÂìÅËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 4, ÈáëÈ°ç = 66,678,740. 4, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 29,857,859. 4, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 2%. 5, ‰∫§Êòì‰∫∫ÂêçÁ®± = Âè∞ÁÅ£Êù±Ê¥ãËó•ÂìÅÂ∑•Ê•≠ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 5, ÈáëÈ°ç = 45,251,402. 5, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 16,630,969. 5, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 2%. 6, ‰∫§Êòì‰∫∫ÂêçÁ®± = Âè∞Áî∞Ëó•ÂìÅËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 6, ÈáëÈ°ç = 44,433,615. 6, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 17,112,370. 6, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 2%. 7, ‰∫§Êòì‰∫∫ÂêçÁ®± = Âè∞ÁÅ£‰∏≠Â§ñË£ΩËó•ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 7, ÈáëÈ°ç = 41,218,669. 7, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 15,854,114. 7, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 1%. 8, ‰∫§Êòì‰∫∫ÂêçÁ®± = ÁëûÂÆâÂúãÈöõËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 8, ÈáëÈ°ç = 31,866,988. 8, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 9,608,150. 8, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 1%. 9, ‰∫§Êòì‰∫∫ÂêçÁ®± = Â§ßÈöÜËààËó•ÂìÅËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 9, ÈáëÈ°ç = 29,255,417. 9, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 11,576,886. 9, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 1%. 10, ‰∫§Êòì‰∫∫ÂêçÁ®± = ÂÖÉËã±‰ºÅÊ•≠ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏. 10, ÈáëÈ°ç = 28,909,315. 10, Êáâ‰ªòÁ•®Êìö„ÄÅ  Â∏≥Ê¨æÈ§òÈ°ç = 9,644,946. 10, Âç†Ë©≤È†ÖÊîØ  Âá∫‰πãÊØîÁéá = 1%

Document 2:
Ê∞ëÂúã 111 Âπ¥Â∫¶
ÂñÆ‰ΩçÔºöÊñ∞Âè∞Âπ£
ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∫∫Â£Ω. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = $9,318,908. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = $-. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞ÈáëÊéß. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = 781,800. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÈúñÂúíÂÖ¨ÂØì. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = 208,000. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∏ñËèØ. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = 1,400,700. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∫∫Â£Ω. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 55,580,357. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞Áî¢Èö™. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 4,229,871. 
ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∫∫Â£Ω. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 3,144,672. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = ÈúñÂúíÂÖ¨ÂØì. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 5,427,619. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = 5,699,000. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = 1.13%. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ. ÈÜ´ÂãôÊàêÊú¨-  ÁßüÈáëÊîØÂá∫, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∫∫Â£Ω. ÈÜ´ÂãôÊàêÊú¨-  ÁßüÈáëÊîØÂá∫, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ÁßüÈáëÊîØÂá∫, ‰∫§ÊòìÈáëÈ°ç = 189,425,794. ÈÜ´ÂãôÊàêÊú¨-  ÁßüÈáëÊîØÂá∫, Êéà‰ø°ÊúüÈñì = ‰∏çÈÅ©Áî®. ÈÜ´ÂãôÊàêÊú¨-  ÁßüÈáëÊîØÂá∫, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊàêÊú¨-  ÁßüÈáëÊîØÂá∫, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊàêÊú¨-  ÁßüÈáëÊîØÂá∫, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ. ÂÖ∂‰ªñÁáüÈÅãË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = Á•ûÂùäË≥áË®ä. ÂÖ∂‰ªñÁáüÈÅãË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÂÖ∂‰ªñÁáüÈÅãË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 6,942,134. ÂÖ∂‰ªñÁáüÈÅãË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÂÖ∂‰ªñÁáüÈÅãË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = 380,462. ÂÖ∂‰ªñÁáüÈÅãË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = 0.08%. 
ÂÖ∂‰ªñÁáüÈÅãË≤ªÁî®, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ

Document 3:
Ê∞ëÂúã 111 Âπ¥Â∫¶
ÂñÆ‰ΩçÔºöÊñ∞Âè∞Âπ£
ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∫∫Â£Ω. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = $9,318,908. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = $-. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞ÈáëÊéß. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = 781,800. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÈúñÂúíÂÖ¨ÂØì. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = 208,000. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∏ñËèØ. ÈÜ´ÂãôÊî∂ÂÖ•, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊî∂ÂÖ•, ‰∫§ÊòìÈáëÈ°ç = 1,400,700. ÈÜ´ÂãôÊî∂ÂÖ•, Êéà‰ø°ÊúüÈñì = 90 Â§©. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊî∂ÂÖ•, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊî∂ÂÖ•, ÂÇôË®ª = ÊáâÊî∂Â∏≥Ê¨æ. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∫∫Â£Ω. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 55,580,357. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊàêÊú¨-  ‰∫∫‰∫ãË≤ªÁî®, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞Áî¢Èö™. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 4,229,871. 
ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.Âç†Á∏ΩÊáâÊî∂(‰ªò)Á•®  ÊìöÂ∏≥Ê¨æÊØîÁéá = -%. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÂÇôË®ª = ÂÖ∂‰ªñÊáâ‰ªòÊ¨æ. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÂ∞çË±° = ÂúãÊ≥∞‰∫∫Â£Ω. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Èóú‰øÇ = ÂÖ∂‰ªñÈóú‰øÇ‰∫∫. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ‰∫§ÊòìÈáëÈ°ç = 3,144,672. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, Êéà‰ø°ÊúüÈñì = 60 Â§©. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅÂ∏≥Ê¨æ.È§òÈ°ç = -. ÈÜ´ÂãôÊàêÊú¨-  ‰∫ãÂãôË≤ªÁî®, ÊáâÊî∂(‰ªò)Á•®Êìö„ÄÅ

# Áî®Êà∂ÂïèÈ°å
Ë´ãÁµ¶Êàë112Âπ¥ÔºåËó•ÂìÅÈÄ≤Ë≤®‰∫§ÊòìÂ∞çË±°ÁöÑË≥áË®ä

================
<think>
Â•ΩÁöÑÔºåÊàëÁé∞Âú®ÈúÄË¶ÅÂõûÁ≠îÁî®Êà∑ÁöÑÈóÆÈ¢òÔºö‚ÄúË´ãÁµ¶Êàë112Âπ¥ÔºåËó•ÂìÅÈÄ≤Ë≤®‰∫§ÊòìÂ∞çË±°ÁöÑË≥áË®ä‚Äù„ÄÇÊ†πÊçÆ‰ªªÂä°Ë¶ÅÊ±ÇÔºåÊàëË¶ÅÂèÇËÄÉÊèê‰æõÁöÑ‰∏â‰∏™Êñá‰ª∂Êù•ÂØªÊâæÁõ∏ÂÖ≥‰ø°ÊÅØ„ÄÇ

È¶ñÂÖàÔºåÊàëÊü•Áúã‰∫ÜDocument 1„ÄÇËøô‰∏™ÊñáÊ°£Ê†áÈ¢òÊòØ‚ÄúÊ∞ëÂúã 112 Âπ¥Â∫¶‚ÄùÔºåÈáåÈù¢ÂàóÂá∫‰∫ÜÂ§ö‰∏™‰∫§Êòì‰∫∫ÁöÑÂêçÁß∞„ÄÅÈáëÈ°ç„ÄÅÊáâ‰ªòÁ•®Êìö„ÄÅÂ∏≥Ê¨æÈ§òÈ°ç‰ª•ÂèäÂç†Ë©≤È†ÖÊîØÂá∫ÁöÑÊØîÁéá„ÄÇÁúãËµ∑Êù•Ëøô‰∫õ‰∫§Êòì‰∫∫‰∏ªË¶ÅÊòØËçØÂìÅÂÖ¨Âè∏ÔºåÊØîÂ¶ÇË£ïÂà©ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏„ÄÅÂè∞ÊπæÂ§ßÊòåÂçéÂòâËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏Á≠â„ÄÇÂõ†Ê≠§ÔºåDocument 1Êèê‰æõ‰∫ÜÂÖ≥‰∫é112Âπ¥Ëó•ÂìÅÈÄ≤Ë≤®‰∫§ÊòìÂØπË±°ÁöÑ‰ø°ÊÅØ„ÄÇ

Êé•‰∏ãÊù•ÔºåÊàëÊ£ÄÊü•‰∫ÜDocument 2ÂíåDocument 3„ÄÇËøô‰∏§‰∏™ÊñáÊ°£ÈÉΩÊòØ‚ÄúÊ∞ëÂúã 111 Âπ¥Â∫¶‚ÄùÁöÑÔºåÂÜÖÂÆπÊ∂âÂèäÂåªÂä°Êî∂ÂÖ•ÂíåÊàêÊú¨ÔºåÂåÖÊã¨‰∫§ÊòìÂØπË±°Â¶ÇÂõΩÊ≥∞‰∫∫Â£Ω„ÄÅÂõΩÊ≥∞ÈáëÊéßÁ≠âÔºå‰ΩÜËøô‰∫õ‰∏éËçØÂìÅËøõË¥ßÊó†ÂÖ≥ÔºåËÄåÊòØÂÖ≥‰∫é‰øùÈô©ÊàñÂÖ∂‰ªñÊúçÂä°ÁöÑ‰∫§Êòì„ÄÇÂõ†Ê≠§ÔºåËøô‰∏§‰∏™Êñá‰ª∂ÂØπÁî®Êà∑ÁöÑÈóÆÈ¢òÊ≤°ÊúâÂ∏ÆÂä©ÔºåÂèØ‰ª•ÂøΩÁï•„ÄÇ

Áªº‰∏äÊâÄËø∞ÔºåÂè™ÊúâDocument 1Êèê‰æõ‰∫Ü112Âπ¥Ëó•ÂìÅÈÄ≤Ë≤®‰∫§ÊòìÂØπË±°ÁöÑÂÖ∑‰Ωì‰ø°ÊÅØ„ÄÇ
</think>

Document 1 Êèê‰æõ‰∫ÜÊ∞ëÂúã 112 Âπ¥Â∫¶Ëó•ÂìÅÈÄ≤Ë≤®‰∫§ÊòìÂ∞çË±°ÁöÑË≥áË®äÔºö

1. Ë£ïÂà©ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
2. Âè∞ÁÅ£Â§ßÊòåËèØÂòâËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
3. ‰πÖË£ï‰ºÅÊ•≠ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
4. ËèØÂÆâËó•ÂìÅËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
5. Âè∞ÊπæÊù±Ê¥ãËó•ÂìÅÂ∑•Ê•≠ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
6. Âè∞Áî∞Ëó•ÂìÅËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
7. Âè∞ÁÅ£‰∏≠Â§ñË£ΩËçØËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
8. ÁëûÂÆâÂúãÈöõËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
9. Â§ßÈöÜËààËó•ÂìÅËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏  
10. ÂÖÉËã±‰ºÅÊ•≠ËÇ°‰ªΩÊúâÈôêÂÖ¨Âè∏
"""
len(tokenizer.tokenize(text))