In [1]:
from CakePDFEmbedder import VectorDBLoader
from CakePDFEmbedder import VectorDBRetriever
from CakePDFEmbedder import ProcessPDF
from llama_index.vector_stores.milvus import MilvusVectorStore
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from langchain_openai import AzureChatOpenAI

In [3]:
import os
import subprocess
import base64
# NOTE: three `!export VAR=...` shell lines used to live here. Each `!` command
# runs in its own short-lived subshell, so the exported variables were gone as
# soon as the command returned and never reached this kernel; they only cost
# three extra kubectl calls. The credentials are loaded into os.environ by the
# Python code further down in this cell.
def kubectl_get_secret(secret_name, key, namespace="kubeflow-brad"):
    """Fetch one value from a Kubernetes secret and return it decoded.

    Runs ``kubectl get secret`` for ``secret_name`` in ``namespace``, selects
    ``.data.<key>`` with a jsonpath expression, and base64-decodes the output.

    Args:
        secret_name: Name of the Kubernetes secret to read.
        key: Key inside the secret's ``data`` map.
        namespace: Namespace that holds the secret. Defaults to the namespace
            that was previously hard-coded in the command.

    Returns:
        The decoded secret value as a UTF-8 string.

    Raises:
        RuntimeError: If kubectl cannot be launched or exits with a non-zero
            status. Previously this case returned None, which only surfaced
            later as a TypeError when the value was written to os.environ.
    """
    # An argument list (no shell) keeps secret_name/key/namespace from ever
    # being interpreted as shell syntax.
    command = [
        "kubectl", "get", "secret", secret_name,
        "-n", namespace,
        "-o", f"jsonpath={{.data.{key}}}",
    ]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except FileNotFoundError as exc:
        raise RuntimeError("kubectl executable not found on PATH") from exc

    if result.returncode != 0:
        # stderr carries kubectl's own diagnostic; it does not contain the secret.
        raise RuntimeError(
            f"kubectl failed reading key {key!r} of secret {secret_name!r} "
            f"in namespace {namespace!r} (exit code {result.returncode}): "
            f"{(result.stderr or '').strip()}"
        )

    # Secret data is stored base64-encoded; strip the trailing newline first.
    return base64.b64decode(result.stdout.strip()).decode("utf-8")

# Read the three Azure OpenAI settings from the "azure-creds" Kubernetes secret.
AZURE_API_VERSION = kubectl_get_secret("azure-creds", "AZURE_API_VERSION")
AZURE_OPENAI_API_KEY = kubectl_get_secret("azure-creds", "AZURE_OPENAI_API_KEY")
AZURE_OPENAI_ENDPOINT = kubectl_get_secret("azure-creds", "AZURE_OPENAI_ENDPOINT")

# Publish them through the process environment (written in the same order as
# before); the LLM cell reads AZURE_API_VERSION back with os.getenv.
os.environ.update(
    AZURE_API_VERSION=AZURE_API_VERSION,
    AZURE_OPENAI_API_KEY=AZURE_OPENAI_API_KEY,
    AZURE_OPENAI_ENDPOINT=AZURE_OPENAI_ENDPOINT,
)

In [4]:
# Chat model handed to ProcessPDF below: the "gpt-4o" Azure deployment at
# temperature 0, with the API version taken from the environment.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version=os.environ.get("AZURE_API_VERSION"),
    temperature=0,
)

In [36]:
# Milvus collection used for this run; swap in the commented name to target
# the other collection instead.
demo_vdb = "Grant_demo_pre_augmented_vdb"
# demo_vdb = "Grant_demo_post_augmented_vdb"

# overwrite=False is passed so the call does not ask for the collection to be
# overwritten; dim=1024 is the vector dimension given to the store.
vector_store = MilvusVectorStore(
    collection_name=demo_vdb,
    uri="http://stepstone-milvus.milvus.svc.cluster.local:19530",
    dim=1024,
    overwrite=False,
)

In [6]:
# Embedding model shared by the loader and the retriever further down.
embedding_model = HuggingFaceEmbedding(
    max_length=4096,
    model_name="BAAI/bge-m3",
)



In [7]:
# Build the PDF processor, handing it the chat model created in an earlier cell.
myProcessor = ProcessPDF(llm)

In [8]:
# Partition the sample PDF. The result is iterated by a later cell as a
# sequence of dicts that carry "type" and "metadata" keys.
myBlock_dicts = myProcessor.partition_file_via_open_source("./FloridaSample.pdf")

processing ./FloridaSample.pdf using local library...

Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# myBlock_dicts
# myBlock_dicts[1]["text"]
# Select the "Table" element found on page 2. The loop never breaks, so when
# several elements match, the last one wins; stays None when nothing matches.
table_element = None
for element in myBlock_dicts:
    if element["metadata"]["page_number"] != 2 or element["type"] != "Table":
        continue
    table_element = element
# table_element=None
# for element in myBlock_dicts:
#         if element["metadata"]["page_number"]==2 and element["type"]=="Table":
#             print(element)

In [40]:
# Bare expression: the notebook renders the selected page-2 table element
# (or None when no element matched).
table_element

{'type': 'Table',
 'element_id': '7fd930491d9235cca3bd43e5c63775de',
 'text': 'SR 739, FROM N OF THE CALOOSAHATCHEE RIVER TO SR 78 Contract Descr: The improvements under this Contract consist of milling and resurfacing, widening, median improvements, lighting, signalization and signing and pavement markings along SR 739 from Caloosahatchee River Bridge to SR 78 in Lee County, FL. Line No / Item ID Alternate Item Description ( ) indicates item is bid as Lump Sum Quantity Units (1) AJAX PAVING INDUSTRIES OF FLORIDA LLC Unit Price Ext Amount (2) OHLA USA INC Unit Price Ext Amount (3) PREFERRED MATERIALS INC. Unit Price Ext Amount SECTION: 0001 Roadway 0005 0101 1 (1.000) 633,888.43000 633,888.43 1,200,000.00000 1,200,000.00 1,775,000.00000 1,775,000.00 MOBILIZATION 44629315201 LS 0010 0102 1 (290.000) 682,038.75000 682,038.75 854,271.00000 854,271.00 947,500.00000 947,500.00 MAINTENANCE OF TRAFFIC 44629315201 DA (LS) 0015 0102 4 1 (51.000) 12,115.54000 12,115.54 6,000.00000 6,000.00 27,39

In [13]:
# Build a dataframe from the single selected table element.
# NOTE(review): the name suggests the processor augments the table (presumably
# via the LLM it was given) -- confirm against ProcessPDF.create_dataframe.
myAugmentedDF = myProcessor.create_dataframe([table_element])

In [16]:
# Convert the dataframe into text blocks; a later cell indexes the result
# positionally, so it is used as a sequence.
blocks_of_text = myProcessor.create_block_text(myAugmentedDF)

In [27]:
# Loader wired to the Milvus vector store and the embedding model defined above.
myLoader = VectorDBLoader(vector_store, embedding_model)

In [32]:
# Insert text into the vector DB. The commented lines are alternative inputs
# (raw block text / raw table text) kept for comparison.
# NOTE(review): only blocks_of_text[1] is inserted -- confirm that skipping the
# other blocks is intentional.
# myLoader.addToVectorDB([myBlock_dicts[1]["text"]])
# myLoader.addToVectorDB([table_element["text"]])
myLoader.addToVectorDB([blocks_of_text[1]])

In [37]:
# Retriever over the same vector store and embedding model.
# NOTE(review): the meaning of "default" and 2 is not visible here -- presumably
# a query mode and the number of results to return; confirm against
# VectorDBRetriever's signature.
myRetriever = VectorDBRetriever(vector_store, embedding_model, "default", 2)

In [38]:
# Wrap the retriever in a query engine; no other arguments are passed to from_args.
query_engine = RetrieverQueryEngine.from_args(myRetriever)

In [39]:
# Alternative questions, kept commented out:
# query_str = "For the mobility disability category, what was the accuracy?"
# query_str = 'What was the total cost of "WORK ZONE SIGN", taking in to account the number of units, for AJAX PAVING INDUSTRIES, OHLA USA INC, and PREFERRED MATERIALS respectfully?'

# Active question: run it through the query engine and print the answer text.
query_str = "What is AdSwerve's most recent Net Revenue (LTM)?"
response = query_engine.query(query_str)
print(response)

AdSwerve's most recent Net Revenue (LTM) is $78,605.


In [None]:
# PreAugmented -- The accuracy for the mobility disability category was 98.3%.