In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before each
# cell executes, so edits to the local helper modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Make the helper modules that live in ./bin importable from this notebook.
import sys
import os

# The kernel's current working directory is used as the base directory.
# NOTE(review): this equals the notebook's own directory only when the kernel
# was started there — confirm if the notebook is launched from elsewhere.
notebook_dir = os.getcwd()

# Register the 'bin' directory on sys.path exactly once, so that re-running
# this cell does not keep appending duplicate entries.
bin_dir = os.path.abspath(os.path.join(notebook_dir, "bin"))
if bin_dir not in sys.path:
    sys.path.append(bin_dir)


In [3]:
import pandas as pd
import streamlit as st

from chunking import load_data, chunk_docs_recursive
from vectordb import get_vector_db, add_documents_to_vector_db, create_document_list, get_vector_db_from_persist_directory
from retrievar import vector_retrievar_with_source, get_context

In [4]:
# Relative path to the Excel workbook that is passed to load_data below.
file_path = "./data/sample_data.xlsx"

In [5]:
# Set OpenAI API key
# The key is read from Streamlit's secrets store and exported as an environment variable.
# NOTE(review): the only OpenAIEmbeddings usage in this notebook is commented out in
# favour of the Azure embeddings — confirm this key is still required.
openai_api_key = st.secrets["OPENAI_API_KEY"]
os.environ["OPENAI_API_KEY"] = openai_api_key

In [6]:
# Helper function for printing docs
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    The sections are printed in a single ``print`` call, numbered from 1 and
    separated by a line of 100 dashes.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

### Chunking

In [7]:
# Load the data from the Excel file
# The saved output of the next cell shows the result as a list of Document objects
# carrying page_content text and file metadata.
docs = load_data(file_path)

In [8]:
# Inspect the loaded documents.
# NOTE(review): this echoes the whole list into the saved output; a slice such as
# docs[:1] may be enough for a sanity check.
docs

[Document(page_content='\n\n\nREPORT_DATE\nSTART_TIME\nFINISH_TIME\nELAPSED_HOURS\n\n\n2023-05-18\n07:07\n12:50\n11.49\n\n\n2025-01-26\n02:31\n16:07\n10.84\n\n\n2023-08-13\n21:46\n13:45\n6.66\n\n\n2022-03-08\n13:58\n12:40\n4.54\n\n\n2024-02-08\n06:44\n21:30\n9.64\n\n\n2025-04-01\n02:22\n06:46\n6.68\n\n\n2024-07-22\n09:08\n19:09\n6.75\n\n\n2024-09-14\n16:38\n04:10\n7.4\n\n\n2023-02-07\n20:17\n03:42\n1.88\n\n\n2024-01-08\n09:10\n00:21\n4.27\n\n\n2021-09-27\n09:30\n12:47\n5.59\n\n\n2023-12-12\n20:37\n02:11\n6.5\n\n\n2023-12-19\n19:47\n22:41\n11.02\n\n\n2022-01-15\n20:48\n02:32\n9.07\n\n\n2024-02-10\n14:28\n10:52\n7.96\n\n\n2021-11-08\n11:28\n03:58\n7.68\n\n\n2024-11-15\n12:44\n15:35\n2.1\n\n\n2025-01-28\n09:38\n16:59\n4.83\n\n\n2023-07-14\n09:24\n16:55\n7.69\n\n\n2024-12-24\n23:47\n22:21\n6.72\n\n\n', metadata={'source': './data/sample_data.xlsx', 'filename': 'sample_data.xlsx', 'file_directory': './data', 'last_modified': '2025-05-09T09:56:10', 'filetype': 'application/vnd.openxmlformats

In [9]:
# Chunk the documents (currently disabled: the un-chunked `docs` are passed to create_document_list below)
# chunked_docs = chunk_docs_recursive(docs, max_chunk_size=1000)
# chunked_docs

### Vector DB

In [10]:
from langchain_openai import OpenAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings

In [11]:
# Vector-store settings passed to get_vector_db below.
persist_directory = "./vector_db"
collection_name = "algo_aces"
# embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
# The Azure embedding model name is read from Streamlit secrets.
# NOTE(review): AZURE_OPENAI_API_KEY and AZURE_OPENAI_ENDPOINT are only exported in the
# Generation section further down — presumably they are already present in the process
# environment when this line runs; confirm for a fresh Restart & Run All.
embedding_model = AzureOpenAIEmbeddings(model=st.secrets["AZURE_OPENAI_EMBEDDING_MODEL"])

In [12]:
# Build the vector store handle from the settings defined in the previous cell.
db = get_vector_db(
    persist_directory=persist_directory,
    embedding_function=embedding_model,
    collection_name=collection_name,
)

In [13]:
# Convert the loaded docs into the document list that is added to the vector store below.
documents = create_document_list(docs)

In [14]:
# Inspect the converted documents; in the saved output their metadata carries a 'page_name' key.
documents

[Document(page_content='\n\n\nREPORT_DATE\nSTART_TIME\nFINISH_TIME\nELAPSED_HOURS\n\n\n2023-05-18\n07:07\n12:50\n11.49\n\n\n2025-01-26\n02:31\n16:07\n10.84\n\n\n2023-08-13\n21:46\n13:45\n6.66\n\n\n2022-03-08\n13:58\n12:40\n4.54\n\n\n2024-02-08\n06:44\n21:30\n9.64\n\n\n2025-04-01\n02:22\n06:46\n6.68\n\n\n2024-07-22\n09:08\n19:09\n6.75\n\n\n2024-09-14\n16:38\n04:10\n7.4\n\n\n2023-02-07\n20:17\n03:42\n1.88\n\n\n2024-01-08\n09:10\n00:21\n4.27\n\n\n2021-09-27\n09:30\n12:47\n5.59\n\n\n2023-12-12\n20:37\n02:11\n6.5\n\n\n2023-12-19\n19:47\n22:41\n11.02\n\n\n2022-01-15\n20:48\n02:32\n9.07\n\n\n2024-02-10\n14:28\n10:52\n7.96\n\n\n2021-11-08\n11:28\n03:58\n7.68\n\n\n2024-11-15\n12:44\n15:35\n2.1\n\n\n2025-01-28\n09:38\n16:59\n4.83\n\n\n2023-07-14\n09:24\n16:55\n7.69\n\n\n2024-12-24\n23:47\n22:21\n6.72\n\n\n', metadata={'page_name': 'Daily DFR'}),
 Document(page_content='\n\n\nDate\nAuth\nTxn\n\n\n2023-06-08\n1284\n458\n\n\n2023-05-21\n3044\n834\n\n\n2024-01-04\n3929\n787\n\n\n2021-09-21\n4647\n37

In [15]:
# Add the documents to the vector store; the helper's return value (a Chroma object in
# the saved output) is echoed as the cell result.
# NOTE(review): re-running this cell presumably inserts the same documents again —
# confirm whether add_documents_to_vector_db de-duplicates.
add_documents_to_vector_db(db, documents)

<langchain_chroma.vectorstores.Chroma at 0x312d8bcd0>

In [16]:
# Rebind db to a handle obtained from the persist directory, using the same
# embedding model and collection name as before.
db = get_vector_db_from_persist_directory(
    persist_directory=persist_directory,
    embedding_function=embedding_model,
    collection_name=collection_name,
)

### Retriever

In [37]:
# Question used for both retrieval and generation below.
# The report name is wrapped in typographic (curly) quotes, which are part of the string.
query = "What was the average elapsed time for the “Monthly Sales” report in 2021?"

In [38]:
# Query the vector store (top_k=3) and unpack the retrieved documents and their sources.
retrieved_docs, sources = vector_retrievar_with_source(
    vector_db=db,
    query=query,
    top_k=3,
)

In [39]:
# Turn the retrieved documents into the context passed to the generation call below.
context = get_context(retrieved_docs)

In [40]:
# Inspect the context; the saved output shows a list that starts with a text string.
context

['\n\n\nCOMPANY_ID\nENTITY\nENTITY_ID\nDB_KEY\nCOMPANY_NAME\nREPORT_NAME\nREPORT_DATE\nFREQUENCY\nSTART_TIME\nFINISH_TIME\nELAPSED_HOURS\n\n\n7065\nMarketing\n750\n773017\nCompany B\nMonthly Sales\n2023-06\nDaily\n02:33\n04:06\n17.39\n\n\n7438\nMarketing\n705\n365540\nCompany C\nMarketing Analysis\n2023-06\nMonthly\n05:30\n12:42\n21.26\n\n\n8868\nMarketing\n229\n313340\nCompany A\nQuarterly Financial\n2021-10\nMonthly\n05:28\n19:52\n22.44\n\n\n5438\nSales\n619\n484982\nCompany C\nQuarterly Financial\n2023-07\nWeekly\n04:42\n00:11\n17.28\n\n\n7433\nSales\n151\n207671\nCompany A\nQuarterly Financial\n2021-12\nDaily\n17:35\n06:15\n22.53\n\n\n7523\nFinance\n746\n777356\nCompany C\nMarketing Analysis\n2021-12\nMonthly\n13:38\n20:15\n10.46\n\n\n1820\nSales\n290\n371899\nCompany C\nMonthly Sales\n2025-04\nDaily\n15:29\n15:46\n15.78\n\n\n5004\nSales\n402\n647056\nCompany A\nMarketing Analysis\n2022-09\nDaily\n02:43\n07:52\n23.56\n\n\n8254\nMarketing\n336\n977477\nCompany B\nMonthly Sales\n2022

In [41]:
# Inspect the sources returned by the retriever (a set of names in the saved output).
sources

{'Top monthly Grpt DFR reports (>10 hrs)'}

### Generation

In [42]:
# Read the Azure OpenAI credentials from Streamlit secrets, then export both
# as environment variables in a single update.
azure_openai_api_key = st.secrets["AZURE_OPENAI_API_KEY"]
azure_openai_endpoint = st.secrets["AZURE_OPENAI_ENDPOINT"]
os.environ.update(
    AZURE_OPENAI_API_KEY=azure_openai_api_key,
    AZURE_OPENAI_ENDPOINT=azure_openai_endpoint,
)

In [43]:
# NOTE(review): this echoes a value read from st.secrets into the notebook's saved
# output; clear the output before sharing if the endpoint should stay private.
azure_openai_endpoint

'https://algoacev2.openai.azure.com/'

In [44]:

# API version and generation model name, both read from Streamlit secrets and passed to
# get_response_streaming below.
api_version = st.secrets["API_VERSION"]
gen_model = st.secrets["AZURE_OPENAI_GENERATION_MODEL"]


In [46]:
from llm_generate import get_response, get_response_streaming

In [47]:
# Non-streaming alternative, kept for reference:
# res, token_metadata = get_response(primary_model=gen_model,
#                                    api_version=api_version,
#                                    question=query,
#                                    context=context)

# Streaming call: unpack the response generator and the token metadata.
res_gen, token_metadata = get_response_streaming(
    primary_model=gen_model,
    api_version=api_version,
    question=query,
    context=context,
)

In [48]:
# Display res_gen; the saved output shows a generator object, so this cell does not
# show any generated text.
res_gen

<generator object get_response_streaming.<locals>.token_text_generator at 0x139cfbbc0>

In [70]:
# Inspect token usage and cost.
# NOTE(review): the execution count jumps from 48 to 70 and no remaining cell iterates
# res_gen — presumably the values below were recorded after the generator was consumed
# in a cell that has since been removed; confirm they are populated on a fresh Run All.
token_metadata

{'prompt_tokens': 2077,
 'completion_tokens': 242,
 'total_tokens': 2319,
 'total_cost (USD)': '0.000000'}