In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
# Make the project's 'bin' directory importable from this notebook
import sys
import os

# Get the current notebook's directory (the kernel's working directory)
notebook_dir = os.getcwd()

# Add the 'bin' directory to the system path. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate entries.
bin_dir = os.path.abspath(os.path.join(notebook_dir, "./bin"))
if bin_dir not in sys.path:
    sys.path.append(bin_dir)


In [49]:
import pandas as pd
import streamlit as st

from chunking import load_data, chunk_docs_recursive
from vectordb import get_vector_db, add_documents_to_vector_db, create_document_list, get_vector_db_from_persist_directory
from retrievar import vector_retrievar_with_source, ensemble_retriever, get_context

In [33]:
# Excel workbook that is loaded and chunked below; the path is relative to the
# kernel's working directory.
file_path = "./data/sample_data.xlsx"

In [34]:
# Set OpenAI API key
# The key is read from Streamlit's secrets store rather than hardcoded here.
openai_api_key = st.secrets["OPENAI_API_KEY"]
# Export it as an environment variable for the rest of the kernel session.
# NOTE(review): presumably read from the environment by the OpenAI client libraries — confirm.
os.environ["OPENAI_API_KEY"] = openai_api_key

In [35]:
# Helper for displaying a list of documents in readable form
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Consecutive documents are separated by a rule of 100 dashes; numbering
    starts at 1. Nothing is returned.
    """
    divider = f"\n{'-' * 100}\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))

### Chunking

In [36]:
# Load the data from the Excel file
# NOTE(review): load_data comes from the local `chunking` module; the exact
# type of the returned documents is not visible here — confirm there.
docs = load_data(file_path)

In [37]:
# docs

In [39]:
# Chunk the documents
# NOTE(review): the unit of chunk_size (characters vs. tokens) is decided inside
# chunk_docs_recursive — confirm before tuning; chunk_overlap=0 requests no overlap.
chunked_docs = chunk_docs_recursive(docs, chunk_size=5000, chunk_overlap=0)
# chunked_docs


In [57]:
import os
import pickle
def save_docs_pickle(docs, path="./data/chunked_rows.pkl"):
    """Serialize ``docs`` to ``path`` with pickle.

    Args:
        docs: Any picklable object (here, the list of chunked documents).
        path: Destination file. Missing parent directories are created so a
            fresh checkout without the target folder does not fail.

    Raises:
        pickle.PicklingError: If ``docs`` contains an unpicklable object.
        OSError: If the destination cannot be created or written.
    """
    # dirname is "" for a bare filename; makedirs("") would raise, so skip it.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(docs, f)
# Persist the chunks to the default path so they can be reloaded from disk later.
save_docs_pickle(chunked_docs)

In [41]:
# Spot-check the metadata attached to one chunk (index 2).
chunked_docs[2].metadata


{'source': './data/sample_data.xlsx',
 'filename': 'sample_data.xlsx',
 'file_directory': './data',
 'last_modified': '2025-05-10T15:29:59',
 'filetype': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
 'languages': ['eng'],
 'page_number': 1,
 'page_name': 'Daily DFR',
 'text_as_html': '<table border="1" class="dataframe">\n  <tbody>\n    <tr>\n      <td>REPORT_DATE</td>\n      <td>START_TIME</td>\n      <td>FINISH_TIME</td>\n      <td>ELAPSED_HOURS</td>\n    </tr>\n    <tr>\n      <td>2022-05-11</td>\n      <td>13:27:19</td>\n      <td>20:08:41</td>\n      <td>7.05</td>\n    </tr>\n    <tr>\n      <td>2022-05-12</td>\n      <td>05:44:47</td>\n      <td>23:56:24</td>\n      <td>1.42</td>\n    </tr>\n    <tr>\n      <td>2022-05-14</td>\n      <td>08:36:18</td>\n      <td>13:32:16</td>\n      <td>9.47</td>\n    </tr>\n    <tr>\n      <td>2022-05-15</td>\n      <td>00:50:14</td>\n      <td>16:46:59</td>\n      <td>8.83</td>\n    </tr>\n    <tr>\n      <td>2022-05-16<

In [42]:
# Number of chunks currently held in chunked_docs.
len(chunked_docs)

49

### Vector DB

In [43]:
from langchain_openai import OpenAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings

In [44]:
# Vector store settings: on-disk location and collection name.
persist_directory = "./vector_db"
collection_name = "algo_aces"
# embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")
# Active embedding backend: Azure OpenAI, with the model name taken from
# Streamlit secrets instead of being hardcoded.
embedding_model = AzureOpenAIEmbeddings(model=st.secrets["AZURE_OPENAI_EMBEDDING_MODEL"])

In [45]:
# Obtain the vector store handle for the configured directory, embedding
# function and collection.
db = get_vector_db(
    persist_directory=persist_directory,
    embedding_function=embedding_model,
    collection_name=collection_name,
)

In [46]:
# Convert the chunks into the document list that is added to the vector store.
# NOTE(review): the exact transformation lives in vectordb.create_document_list — confirm there.
documents = create_document_list(chunked_docs)

In [47]:
# documents

In [48]:
# Add the documents to the vector store. The call is the cell's last expression,
# so its return value (a Chroma store object in the recorded output) is displayed.
add_documents_to_vector_db(db, documents)

<langchain_chroma.vectorstores.Chroma at 0x31ce8e050>

### Retriever

In [50]:
# Sample natural-language question passed to the retriever below.
query = "What was the average elapsed time for the “Monthly Sales” report in 2021?"

In [58]:
def load_docs_pickle(path="./data/chunked_rows.pkl"):
    """Read back an object previously written with pickle.

    Args:
        path: Pickle file to read.

    Returns:
        The unpickled object stored in ``path``.

    Note:
        Unpickling can execute arbitrary code; only load files you created
        yourself or otherwise trust.
    """
    with open(path, mode="rb") as source:
        restored = pickle.load(source)
    return restored
# Rebind chunked_docs to the copy stored at the default pickle path.
chunked_docs = load_docs_pickle()

In [None]:
# Rebind `db` using the persisted-directory variant of the vector store helper.
# NOTE(review): presumably reopens the collection already stored under
# persist_directory — confirm in the vectordb module.
db = get_vector_db_from_persist_directory(
    persist_directory=persist_directory,
    embedding_function=embedding_model,
    collection_name=collection_name,
)

In [51]:
# Query the ensemble retriever; it yields the retrieved documents together
# with their sources. top_k=3 is forwarded to the retriever as-is.
retrieved_docs, sources = ensemble_retriever(
    chunked_docs,
    vector_db=db,
    query=query,
    top_k=3,
)

In [52]:
# Build the context from the retrieved documents.
# NOTE(review): the recorded output shows a list of text blobs — confirm the format in retrievar.get_context.
context = get_context(retrieved_docs)

In [53]:
# Display the assembled context for inspection.
context

['1501\nFinance\n311\n245052\nCompany B\nMonthly Sales\n2022-09\nWeekly\n13:54:28\n18:11:11\n8.12\n\n\n2999\nFinance\n686\n343463\nCompany C\nMonthly Sales\n2023-02\nWeekly\n17:49:11\n22:09:57\n9.72\n\n\n8024\nMarketing\n848\n104184\nCompany B\nQuarterly Financial\n2023-08\nWeekly\n07:19:51\n08:14:16\n19.23\n\n\n4239\nSales\n437\n219205\nCompany B\nMarketing Analysis\n2023-09\nMonthly\n06:30:07\n10:29:19\n17.26\n\n\n8757\nMarketing\n598\n169084\nCompany B\nMonthly Sales\n2024-03\nDaily\n21:31:28\n18:37:52\n9.37\n\n\n7416\nSales\n742\n967569\nCompany B\nMarketing Analysis\n2025-01\nMonthly\n07:54:12\n23:11:35\n10.01\n\n\n5994\nMarketing\n712\n924788\nCompany C\nMarketing Analysis\n2023-06\nWeekly\n19:59:19\n03:02:58\n15.17\n\n\n1245\nSales\n764\n913107\nCompany A\nMonthly Sales\n2025-02\nDaily\n00:57:09\n00:24:27\n19.44\n\n\n4338\nSales\n655\n978680\nCompany C\nQuarterly Financial\n2023-08\nDaily\n01:29:23\n06:30:04\n10.89\n\n\n3951\nMarketing\n482\n415919\nCompany A\nQuarterly Financia

In [54]:
# Display the sources returned alongside the retrieved documents.
sources

{'Daily Top GRPT DFR Reports', 'Daily Web'}