In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Make the local 'bin' directory importable by adding it to the system path
import sys
import os
import nest_asyncio
nest_asyncio.apply()
# Get the current notebook's directory. This is the process working directory,
# so it assumes the kernel was started from the notebook's folder.
notebook_dir = os.getcwd()

# Add the 'bin' directory to the system path. The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
bin_dir = os.path.abspath(os.path.join(notebook_dir, "./bin"))
if bin_dir not in sys.path:
    sys.path.append(bin_dir)


In [3]:
import pandas as pd
import streamlit as st
import pickle

from langchain_core.documents import Document as LCDocument
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader


from chunking import load_data, chunk_docs_recursive
from vectordb import get_vector_db, add_documents_to_vector_db, create_document_list, get_vector_db_from_persist_directory
from retrievar import vector_retrievar_with_source, ensemble_retriever, get_context

In [4]:
file_path = "./data/sample_data.xlsx"

In [5]:
# Set OpenAI API key: read it from Streamlit secrets (so it is never hardcoded
# in the notebook) and export it as an environment variable for this session.
openai_api_key = st.secrets["OPENAI_API_KEY"]
os.environ["OPENAI_API_KEY"] = openai_api_key

In [6]:
# Open the workbook. Despite the name, `df` is a pd.ExcelFile handle (not a
# DataFrame); it is used below only to read the sheet names.
df = pd.ExcelFile(file_path)

In [7]:
# Names of the workbook's sheets; used later as the fallback "page_name"
# metadata for the parsed documents.
sheet_names = df.sheet_names

In [8]:
# Helper function for printing docs
# def pretty_print_docs(docs):
#     print(
#         f"\n{'-' * 100}\n".join(
#             [f"Document {i + 1}:\n\n" + d.page_content for i, d in enumerate(docs)]
#         )
#     )

### Chunking

In [9]:
# Load data using LlamaParse: parse the workbook at `file_path` with markdown
# as the requested result type. The API key is read from Streamlit secrets.

parser = LlamaParse(api_key=st.secrets['LLAMA_PARSE_API_KEY'], result_type="markdown")
# NOTE(review): this appears to submit a remote parsing job (the cell output
# reports a job_id), so it likely needs network access — confirm.
docs = parser.load_data(file_path)

Started parsing the file under job_id 1cd2fe60-0103-47ed-9e82-c06ad0f82315


In [10]:
docs

[Document(id_='c994b4b5-cdae-4010-a6a5-834d5ae9c73a', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Daily DFR\n\n|REPORT_DATE|START_TIME|FINISH_TIME|ELAPSED_HOURS|\n|---|---|---|---|\n|2022-05-11|13:27:19|20:08:41|7.05|\n|2022-05-12|05:44:47|23:56:24|1.42|\n|2022-05-14|08:36:18|13:32:16|9.47|\n|2022-05-15|00:50:14|16:46:59|8.83|\n|2022-05-16|08:17:38|16:29:38|11.24|\n|2022-05-17|20:22:18|16:48:58|11.12|\n|2022-05-18|11:35:32|14:19:16|2.56|\n|2022-05-22|09:34:08|22:30:48|3.11|\n|2022-05-23|05:11:13|17:26:05|6.66|\n|2022-05-24|13:42:57|03:04:29|3.5|\n|2022-05-25|05:13:08|01:56:44|4.7|\n|2022-05-27|11:21:35|17:18:43|6.06|\n|2022-05-28|17:36:42|07:57:40|11.31|\n|2022-05-29|10:18:30|06:33:58|8.94|\n|2022-05-31|00:24:05|03:57:04|3.99|\n|2022-06-02|13:23:00|03:12:18|4.82|\n|2022-06-08|22:01:54|07:02:31|1.23|\n|2022-06-14|14:10:08|04:28:12|10.41|\n|2022-06-17|17:53:43|09:54:46|5.26|\n|2022-06-19|05:22:03|06:04:56|2.49|\n|

In [11]:
# Load the data from the Excel file
# docs = load_data(file_path)

In [12]:
docs[0]

Document(id_='c994b4b5-cdae-4010-a6a5-834d5ae9c73a', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='# Daily DFR\n\n|REPORT_DATE|START_TIME|FINISH_TIME|ELAPSED_HOURS|\n|---|---|---|---|\n|2022-05-11|13:27:19|20:08:41|7.05|\n|2022-05-12|05:44:47|23:56:24|1.42|\n|2022-05-14|08:36:18|13:32:16|9.47|\n|2022-05-15|00:50:14|16:46:59|8.83|\n|2022-05-16|08:17:38|16:29:38|11.24|\n|2022-05-17|20:22:18|16:48:58|11.12|\n|2022-05-18|11:35:32|14:19:16|2.56|\n|2022-05-22|09:34:08|22:30:48|3.11|\n|2022-05-23|05:11:13|17:26:05|6.66|\n|2022-05-24|13:42:57|03:04:29|3.5|\n|2022-05-25|05:13:08|01:56:44|4.7|\n|2022-05-27|11:21:35|17:18:43|6.06|\n|2022-05-28|17:36:42|07:57:40|11.31|\n|2022-05-29|10:18:30|06:33:58|8.94|\n|2022-05-31|00:24:05|03:57:04|3.99|\n|2022-06-02|13:23:00|03:12:18|4.82|\n|2022-06-08|22:01:54|07:02:31|1.23|\n|2022-06-14|14:10:08|04:28:12|10.41|\n|2022-06-17|17:53:43|09:54:46|5.26|\n|2022-06-19|05:22:03|06:04:56|2.49|\n|2

In [13]:
# chunked_docs = docs.copy()

In [14]:
# Chunk the documents
# chunked_docs = chunk_docs_recursive(docs, chunk_size=512, chunk_overlap=200)

In [15]:
# Wrap each parsed document in a LangChain Document tagged with a "page_name".
# A document's own "page_name" metadata wins. Otherwise fall back to the sheet
# name at the same position, and to "unknown" when there are more documents
# than sheets. The fallback is evaluated lazily: an eager `sheet_names[i]`
# default would raise IndexError in that case even for documents that already
# carry a "page_name".
# NOTE(review): the positional fallback assumes the parser returns documents in
# workbook sheet order — confirm.
chunked_docs = [
    LCDocument(
        page_content=d.text,
        metadata={
            "page_name": (
                d.metadata["page_name"]
                if "page_name" in d.metadata
                else (sheet_names[i] if i < len(sheet_names) else "unknown")
            )
        },
    )
    for i, d in enumerate(docs)
]

In [16]:
chunked_docs[0].metadata

{'page_name': 'Daily DFR'}

In [17]:
import pickle
def save_docs_pickle(docs, path="./data/chunked_rows.pkl"):
    """Serialize ``docs`` to ``path`` using pickle.

    Args:
        docs: Any picklable object (in this notebook, the list of chunked
            documents).
        path: Destination file. It is overwritten if it already exists.

    Returns:
        None.

    Missing parent directories are created first, so saving into a folder
    that does not exist yet no longer fails with ``FileNotFoundError``.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:  # empty string when ``path`` is a bare filename
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(docs, f)
save_docs_pickle(chunked_docs)

In [18]:
len(chunked_docs)

9

### Vector DB

In [19]:
from langchain_openai import OpenAIEmbeddings
from langchain_openai import AzureOpenAIEmbeddings

In [20]:
# Vector store settings: persistence directory, collection name, and the
# embedding model passed to the vector DB helpers below.
persist_directory = "./vector_db"
collection_name = "algo_aces"
# The embedding model name is read from Streamlit secrets.
# NOTE(review): the Azure endpoint and credentials are presumably supplied via
# environment variables that this notebook does not set — confirm.
embedding_model = AzureOpenAIEmbeddings(model=st.secrets["AZURE_OPENAI_EMBEDDING_MODEL"])

In [21]:
# Build the vector DB handle through the project helper, passing the
# persistence directory, embedding model, and collection name defined above.
vector_db_settings = {
    "persist_directory": persist_directory,
    "embedding_function": embedding_model,
    "collection_name": collection_name,
}
db = get_vector_db(**vector_db_settings)

In [22]:
documents = create_document_list(chunked_docs)

In [23]:
add_documents_to_vector_db(db, documents)

<langchain_chroma.vectorstores.Chroma at 0x30335bb10>

### Retriever

In [24]:
query = "What was the average elapsed time for the “Monthly Sales” report in 2021?"

In [25]:
def load_docs_pickle(path="./data/chunked_rows.pkl"):
    """Return the object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only point this at files
    from a trusted source (e.g. ones written by this notebook).
    """
    with open(path, "rb") as pickle_file:
        raw_bytes = pickle_file.read()
    return pickle.loads(raw_bytes)
chunked_docs = load_docs_pickle()

In [26]:
chunked_docs[0].metadata

{'page_name': 'Daily DFR'}

In [27]:
db = get_vector_db_from_persist_directory(persist_directory = persist_directory, embedding_function = embedding_model, collection_name = collection_name)

In [28]:
retrieved_docs, sources = ensemble_retriever(chunked_docs, vector_db = db, query = query, top_k=3)

In [29]:
context = get_context(retrieved_docs)

In [30]:
context

['# Top monthly Grpt DFR reports (>\n\n|COMPANY_ID|ENTITY|ENTITY_ID|DB_KEY|COMPANY_NAME|REPORT_NAME|REPORT_DATE|FREQUENCY|START_TIME|FINISH_TIME|ELAPSED_HOURS|\n|---|---|---|---|---|---|---|---|---|---|---|\n|6453|Finance|666|298970|Company B|Monthly Sales|2025-02|Monthly|10:46:27|08:17:57|15.43|\n|6321|Sales|354|738062|Company A|Marketing Analysis|2023-02|Monthly|12:20:45|05:27:43|10.03|\n|5354|Sales|488|704267|Company A|Quarterly Financial|2024-01|Daily|15:21:36|17:32:39|12.36|\n|7000|Finance|764|896744|Company B|Quarterly Financial|2023-01|Daily|08:08:11|00:42:41|17.17|\n|4954|Finance|660|963069|Company C|Monthly Sales|2024-07|Weekly|16:25:15|20:25:44|23.34|\n|4476|Marketing|155|123714|Company A|Marketing Analysis|2023-02|Daily|06:30:27|14:20:24|11.14|\n|7964|Finance|138|161798|Company C|Marketing Analysis|2022-10|Daily|12:25:27|13:21:15|15.01|\n|4266|Finance|274|955839|Company A|Monthly Sales|2022-12|Monthly|21:43:32|06:05:46|20.93|\n|2669|Finance|294|189213|Company C|Marketing Ana

In [31]:
sources

{'Daily Web',
 'Monthend Duration Web GRPT',
 'Top monthly Grpt DFR (>10 hrs)',
 'unknown'}