In [1]:
import json
import sys
import warnings
from pathlib import Path
from typing import Any, Literal

import numpy as np
import pandas as pd
import polars as pl
from rich.console import Console
from rich.theme import Theme

custom_theme = Theme(
    {
        "white": "#FFFFFF",  # Bright white
        "info": "#00FF00",  # Bright green
        "warning": "#FFD700",  # Bright gold
        "error": "#FF1493",  # Deep pink
        "success": "#00FFFF",  # Cyan
        "highlight": "#FF4500",  # Orange-red
    }
)
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy settings
np.set_printoptions(precision=4)

# Pandas settings
pd.options.display.max_rows = 1_000
pd.options.display.max_columns = 1_000
pd.options.display.max_colwidth = 600

# Polars settings
pl.Config.set_fmt_str_lengths(1_000)
pl.Config.set_tbl_cols(n=1_000)
pl.Config.set_tbl_rows(n=200)

warnings.filterwarnings("ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """This is used to up a number of directories.

    Params:
    -------
    go_up: int, default=1
        This indicates the number of times to go back up from the current directory.

    Returns:
    --------
    None
    """
    import os

    CONST: str = "../"
    NUM: str = CONST * go_up

    # Goto the previous directory
    prev_directory = os.path.join(os.path.dirname(__name__), NUM)
    # Get the 'absolute path' of the previous directory
    abs_path_prev_directory = os.path.abspath(prev_directory)

    # Add the path to the System paths
    sys.path.insert(0, abs_path_prev_directory)
    print(abs_path_prev_directory)


# Demo (Prevents ruff from removing the unused module import)
my_path: Path = Path(".")
name: Any
category: Literal["A", "B", "C"]
json.loads('{"name": "Smart-RAG", "version": "1.0"}')

{'name': 'Smart-RAG', 'version': '1.0'}

In [3]:
go_up_from_current_directory(go_up=1)

from src.config import app_settings  # noqa: E402
from src.utilities.model_config import RemoteModel  # noqa: E402

settings = app_settings

/Users/mac/Desktop/Projects/smart-rag


In [4]:
from langchain_openai import ChatOpenAI

remote_llm = ChatOpenAI(
    api_key=settings.OPENROUTER_API_KEY.get_secret_value(),  # type: ignore
    base_url=settings.OPENROUTER_URL,
    temperature=0.0,
    model=RemoteModel.GEMINI_2_5_FLASH_LITE,
)


# Test the LLMs
response = remote_llm.invoke("Tell me a very short joke.")
response.pretty_print()


Why did the scarecrow win an award?

Because he was outstanding in his field!


In [5]:
# Use aiohttp for async requests
# Create pipeline for downloading the data
# - add tqdm for progress bar

In [6]:
import httpx


class HTTPXClient:
    def __init__(
        self,
        base_url: str = "",
        timeout: int = 30,
        http2: bool = True,
        max_connections: int = 20,
        max_keepalive_connections: int = 5,
    ) -> None:
        self.base_url = base_url
        self.timeout = timeout
        self.http2 = http2
        self.max_connections = max_connections
        self.max_keepalive_connections = max_keepalive_connections
        self.client = httpx.AsyncClient(
            base_url=self.base_url,
            timeout=self.timeout,
            http2=self.http2,
            limits=httpx.Limits(
                max_connections=self.max_connections,
                max_keepalive_connections=self.max_keepalive_connections,
            ),
        )

    async def __aenter__(self) -> "HTTPXClient":
        return self

    async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
        await self.client.aclose()

    async def get(
        self,
        url: str,
        params: dict[str, Any] | None = None,
        headers: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Perform an asynchronous GET request."""
        try:
            response = await self.client.get(url, params=params, headers=headers)
            return self._parse_response(response)
        except Exception as e:
            return self._handle_exception(e)

    async def post(
        self,
        url: str,
        data: dict[str, Any] | None = None,
        params: dict[str, Any] | None = None,
        headers: dict[str, Any] | None = None,
    ) -> dict[str, Any]:
        """Perform an asynchronous POST request."""
        try:
            response = await self.client.post(
                url, data=data, params=params, headers=headers
            )
            return self._parse_response(response)
        except Exception as e:
            return self._handle_exception(e)

    def _parse_response(self, response: httpx.Response) -> dict[str, Any]:
        """Parse the HTTPX response and return a standardized dictionary."""
        try:
            data = response.json()
        except json.JSONDecodeError:
            data = response.text

        return {
            "success": response.status_code < 400,
            "status_code": response.status_code,
            "data": data,
            "headers": dict(response.headers),
            "error": (
                None
                if response.status_code < 400
                else f"HTTP {response.status_code} Error"
            ),
        }

    def _handle_exception(self, e: Exception) -> dict[str, Any]:
        """Handle exceptions and return a standardized error response."""
        if isinstance(e, httpx.ConnectError):
            error_msg = f"Connection Error: {str(e)}"
        elif isinstance(e, httpx.TimeoutException):
            error_msg = f"Request Timeout: {str(e)}"
        else:
            error_msg = f"Unexpected Error: {str(e)}"

        return {
            "success": False,
            "status_code": None,
            "data": None,
            "headers": None,
            "error": error_msg,
        }

In [7]:
async with HTTPXClient() as client:
    response = await client.get(
        "https://www.bbc.com/sport/football/articles/cwy543n274wo"
    )
    print(response)



In [8]:
response["data"]



In [9]:
from markdownify import markdownify as md

console.print(md(response["data"])[3000:5000])

<br>

# RAG Pipeline


## Step 0

- Download and prepare your documents.

In [10]:
import re
from pathlib import Path
from typing import Any

from bs4 import BeautifulSoup
from markdownify import markdownify as md


def clean_xbrl_noise(text: str) -> str:
    """Aggressively remove XBRL noise while preserving document structure.

    This function removes all XBRL/XML metadata and keeps only the meaningful
    HTML content that can be converted to readable markdown.
    """

    body_match = re.search(r"<body[^>]*>(.*)</body>", text, re.DOTALL | re.IGNORECASE)
    if body_match:
        text = "<body>" + body_match.group(1) + "</body>"

    try:
        soup = BeautifulSoup(text, "html.parser")

        # Remove <head> entirely - it contains most XBRL metadata
        for head in soup.find_all("head"):
            head.decompose()

        # Remove all script and style tags
        for tag in soup(["script", "style", "meta", "link"]):
            tag.decompose()

        # Remove XML/XBRL namespaced elements (tags with colons)
        for tag in soup.find_all():
            if tag.name and ":" in tag.name:
                tag.decompose()

        # Remove hidden XBRL data elements (usually display:none or specific XBRL classes)
        for tag in soup.find_all(style=re.compile(r"display:\s*none", re.I)):
            tag.decompose()

        for tag in soup.find_all(class_=re.compile(r"xbrl|hidden", re.I)):
            tag.decompose()

        # Remove specific XBRL attribute clutter
        for tag in soup.find_all():
            if tag.name:
                # Remove XBRL attributes
                attrs_to_remove = []
                for attr in tag.attrs:
                    if (
                        ":" in attr
                        or attr.startswith("xmlns")
                        or attr in ["contextref", "unitref", "decimals"]
                    ):
                        attrs_to_remove.append(attr)  # noqa: PERF401
                for attr in attrs_to_remove:
                    del tag[attr]

        # Get the cleaned HTML
        cleaned: str = str(soup)

    except Exception as e:
        print(f"Warning: HTML parsing failed: {e}")
        cleaned = text

    # Post-processing regex cleanup for any remaining XBRL noise

    # Remove namespace URLs that got left behind
    cleaned = re.sub(
        r'http://[^\s<>"]+(?:xbrl|fasb|sec\.gov)[^\s<>"]*', "", cleaned, flags=re.I
    )

    # Remove XBRL namespace tokens (us-gaap:Something, iso4217:USD, etc.)
    cleaned = re.sub(
        r"\b(?:us-gaap|nvda|srt|stpr|fasb|xbrli|iso4217|xbrl|dei|ix|country|xbrldi|link):[A-Za-z0-9_\-:()]+(?:Member)?\b",
        "",
        cleaned,
        flags=re.I,
    )

    # Remove long numeric strings (CIK numbers, etc.) - 10+ digits
    cleaned = re.sub(r"\b\d{10,}\b", "", cleaned)
    # Remove date patterns that are concatenated without separators (2023-01-292022-01-30)
    cleaned = re.sub(r"(?:\d{4}-\d{2}-\d{2}){2,}", "", cleaned)
    # Remove very long alphanumeric strings (40+ chars) that indicate concatenated tags
    cleaned = re.sub(r"\b[A-Za-z0-9_\-]{40,}\b", "", cleaned)
    # Remove XML/namespace declarations
    cleaned = re.sub(r'xmlns[:\w]*="[^"]*"', "", cleaned)
    cleaned = re.sub(r'xml:\w+="[^"]*"', "", cleaned)
    # Remove "pure" standalone (XBRL unit)
    cleaned = re.sub(r"\bpure\b(?!\s+\w)", "", cleaned)
    # Clean up multiple colons and extra punctuation
    cleaned = re.sub(r":{2,}", ":", cleaned)
    return re.sub(r"\s*:\s*:\s*", " ", cleaned)


async def download_and_parse_data(
    url: str,
    raw_doc_path: Path | str,
    cleaned_doc_path: Path | str,
    force_download: bool = False,
) -> None:
    """Download and parse HTML/XBRL documents with aggressive noise removal.

    Parameters
    ----------
        url : str
            The remote URL to download
        raw_doc_path : Path | str
            Output path for the raw bytes/text
        cleaned_doc_path : Path | str
            Output path for the cleaned markdown/text
        force_download : bool, default=False
            When True, re-download and re-clean even if file(s) exist

    Returns
    -------
        None
    """
    if isinstance(raw_doc_path, str):
        raw_doc_path = Path(raw_doc_path)
    if isinstance(cleaned_doc_path, str):
        cleaned_doc_path = Path(cleaned_doc_path)

    # Safe, identifiable user agent:
    USER_AGENT: str = (
        "MyCompany MyDownloader/1.0 (+https://mycompany.example; dev@mycompany.example)"
    )
    headers: dict[str, str] = {"User-Agent": USER_AGENT, "Accept": "application/json"}

    # If raw document exists and we are not forcing re-download
    if raw_doc_path.exists() and raw_doc_path.is_file() and not force_download:
        print(f"Raw file already exists: {raw_doc_path}. Skipping download.")
    else:
        # Ensure the path exists
        raw_doc_path.parent.mkdir(parents=True, exist_ok=True)

        async with HTTPXClient() as client:
            response: dict[str, Any] = await client.get(url, headers=headers)

        if not response["success"]:
            print(f"Failed to download {url}: {response.get('error')}")
            return

        # Response data may be a dict or string; store as text
        raw_content: Any = response["data"]
        if not isinstance(raw_content, str):
            # Coerce to text safely
            try:
                raw_content = json.dumps(raw_content, ensure_ascii=False)
            except Exception:
                raw_content = str(raw_content)

        raw_doc_path.write_text(raw_content, encoding="utf-8")
        print(f"Saved raw content to {raw_doc_path}")

    # Convert the raw HTML/text into a cleaned markdown or plain text
    raw_text: str = raw_doc_path.read_text(encoding="utf-8")

    # Use the aggressive cleaner to remove XBRL noise
    cleaned_html = clean_xbrl_noise(raw_text)

    # For HTML content, convert to markdown with better formatting
    try:
        # Configure markdownify to preserve more structure
        cleaned_text: str = md(
            cleaned_html,
            heading_style="ATX",  # Use # for headers
            bullets="-",  # Use - for bullet points
            strong_em_symbol="**",  # Use ** for bold
            strip=["script", "style"],  # Remove script and style tags
        )
    except Exception as e:
        # If markdownify fails, try basic text extraction
        print(f"Warning: Markdown conversion failed: {e}")
        try:
            soup = BeautifulSoup(cleaned_html, "html.parser")
            cleaned_text = soup.get_text("\n", strip=True)
        except Exception:
            cleaned_text = cleaned_html

    # Post-processing cleanup on the markdown text
    # Remove lines that are mostly XBRL noise (lots of colons, short tokens)
    lines: list[str] = cleaned_text.split("\n")
    cleaned_lines: list[str] = []
    for line in lines:
        # Skip lines with excessive XBRL patterns
        if len(line) < 10:  # Keep very short lines (might be intentional)
            cleaned_lines.append(line)
            continue

        # Count suspicious patterns
        colon_count = line.count(":")
        token_count = len(
            re.findall(r"\b[A-Z][a-z]+(?:[A-Z][a-z]+)+\b", line)
        )  # CamelCase tokens

        # If line has too many colons or camelCase tokens relative to length, skip it
        if colon_count > len(line) / 20 or (token_count > 5 and len(line.split()) < 20):
            continue

        cleaned_lines.append(line)

    cleaned_text = "\n".join(cleaned_lines)

    # Remove excessive blank lines (more than 2 consecutive)
    cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text)

    # Remove leading/trailing whitespace from each line
    cleaned_text = "\n".join(line.strip() for line in cleaned_text.split("\n"))

    # Final whitespace cleanup
    cleaned_text = cleaned_text.strip()

    # Ensure the path exists
    cleaned_doc_path.parent.mkdir(parents=True, exist_ok=True)
    cleaned_doc_path.write_text(cleaned_text, encoding="utf-8")

    print(f"Saved cleaned content to {cleaned_doc_path}")
    return

In [11]:
url: str = "https://www.sec.gov/Archives/edgar/data/1045810/000104581023000017/nvda-20230129.htm"

await download_and_parse_data(
    url=url, raw_doc_path="raw_doc.txt", cleaned_doc_path="cleaned_doc.txt"
)

Raw file already exists: raw_doc.txt. Skipping download.
Saved cleaned content to cleaned_doc.txt
Saved cleaned content to cleaned_doc.txt


In [12]:
fp: str = "cleaned_doc.txt"

with Path(fp).open("r", encoding="utf-8") as file:
    cleaned_doc = file.read()

In [13]:
console.print(cleaned_doc[500:1_500])

<br>

## Step 1
- Split documents into chunks and create embeddings.

In [14]:
# from langchain_community.document_loaders import CSVLoader
from langchain_community.document_loaders import TextLoader

loader = TextLoader(fp)  # Integration-specific parameters here

# Load all documents
documents = loader.load()

# For large datasets, lazily load documents
# for document in loader.lazy_load():
#     print(document)

In [15]:
len(documents)

1

In [16]:
from re import Match, Pattern

# Extract 10-K sections with title and content separately (line-by-line comments)
# Get the entire document text from the TextLoader's first document
raw_text: str = documents[0].page_content  # the string to search for ITEM headers

# Header pattern: match 'ITEM 1.' or 'ITEM 1A.' etc. at the beginning of a line
# ^\s*            -> allow leading whitespace before the header
# ITEM\s+         -> the literal word ITEM followed by at least one space
# \d+             -> the item number (one or more digits)
# [A-Z]?           -> optional letter (A, B, etc.) after the number
# \.               -> period following the number (escaped dot)
# [\t ]+          -> at least one whitespace char (tab/space) after the dot
# [^\n\r]*        -> the remainder of the heading line (until newline)
# re.MULTILINE     -> ^ anchors at the beginning of each line
header_pattern: Pattern[str] = re.compile(
    r"^\s*(ITEM\s+\d+[A-Z]?\.[\t ]+[^\n\r]*)", re.MULTILINE
)

# run finditer which returns match objects with start()/end() locations
matches: list[Match[str]] = list(
    header_pattern.finditer(raw_text)
)  # convert to list for indexing

# Prepare lists to hold the results
section_titles: list[str] = []  # will store the header lines like 'ITEM 1. BUSINESS'
# will store the textual content of each section (no header)
section_content: list[str] = []

# Walk through each header match, capturing both title and the content after it
for i, match in enumerate(matches):
    title: str = match.group(1).strip()  # capture the heading text and strip whitespace
    section_titles.append(title)

    # The content begins right after the matched heading line
    start_pos: int = match.end()  # numeric index where this header finishes

    # Determine where this section ends: next header start or the end of the document
    if i + 1 < len(matches):
        end_pos: int = matches[i + 1].start()  # next header's start position
    else:
        end_pos: int = len(raw_text)  # or EOF if this is the last header

    # Use the start/end slices to get the body text and strip leading/trailing whitespace
    content: str = raw_text[start_pos:end_pos].strip()  # remove extra whitespace
    section_content.append(content)  # store the cleaned body in the sections list

# Confirmation print for quick inspection when the cell runs
print(f"Found {len(section_titles)} ITEM sections.")

Found 21 ITEM sections.


### Create Metadata-rich Chunks

In [126]:
from uuid import uuid4

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1_000,  # chunk size (characters)
    chunk_overlap=50,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

doc_chunks_with_metadata: list[Document] = []

# Loop thru each section's content and its title
for content, title in zip(section_content, section_titles):
    section_chunks: list[str] = text_splitter.split_text(content)

    # Loop thru each chunk to add metadata
    for chunk in section_chunks:
        chunk_id: str = str(uuid4())  # unique ID for this chunk
        doc_chunks_with_metadata.append(
            Document(
                page_content=chunk,
                metadata={
                    "source_doc": fp,  # original document path
                    "section": title,  # section header/title
                    "chunk_id": chunk_id,  # unique chunk ID
                },
            )
        )

print(f"Created {len(doc_chunks_with_metadata)} document chunks with metadata.")

Created 371 document chunks with metadata.


In [127]:
console.print(doc_chunks_with_metadata[51])

In [131]:
section_titles

['ITEM 1. BUSINESS',
 'ITEM 1A. RISK FACTORS',
 'ITEM 1B. UNRESOLVED STAFF COMMENTS',
 'ITEM 2. PROPERTIES',
 'ITEM 3. LEGAL PROCEEDINGS',
 'ITEM 4. MINE SAFETY DISCLOSURES',
 'ITEM 5. MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES',
 'ITEM 6. [RESERVED]',
 'ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS',
 'ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK',
 'ITEM\xa08. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA',
 'ITEM\xa09. CHANGES IN AND DISAGREEMENTS WITH ACCOUNTANTS ON ACCOUNTING AND FINANCIAL DISCLOSURE',
 'ITEM\xa09A. CONTROLS AND PROCEDURES',
 'ITEM 9C. DISCLOSURE REGARDING FOREIGN JURISDICTIONS THAT PREVENT INSPECTIONS',
 'ITEM 10. DIRECTORS, EXECUTIVE OFFICERS AND CORPORATE GOVERNANCE',
 'ITEM 11. EXECUTIVE COMPENSATION',
 'ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL OWNERS AND MANAGEMENT AND RELATED STOCKHOLDER MATTERS',
 'ITEM 13. CERTAIN 

In [139]:
# Test the Metadata-aware chunking: e.g. 'Risk Factors' should be in the section
sample_chunk = (
    chunk
    for chunk in doc_chunks_with_metadata
    if "risk factors" in chunk.metadata.get("section", "").lower()
)
console.print(next(sample_chunk))

In [119]:
import os
from typing import Any

from langchain_core.embeddings import Embeddings
from langchain_core.utils import convert_to_secret_str
from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    SecretStr,
    model_validator,
)

from src.utilities.openrouter.client import AsyncOpenRouterClient, OpenRouterClient


def set_openrouter_api(value: str | None = None) -> SecretStr:
    """Set the OpenRouter API key"""
    if value is None:
        return convert_to_secret_str(os.getenv("OPENROUTER_API_KEY", ""))
    return convert_to_secret_str(value)


class OpenRouterEmbeddings(BaseModel, Embeddings):
    """Using Field with default_factory for automatic client creation."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    client: OpenRouterClient = Field(default_factory=OpenRouterClient)
    aclient: AsyncOpenRouterClient = Field(default_factory=AsyncOpenRouterClient)

    openrouter_api_key: SecretStr = Field(default_factory=set_openrouter_api)
    model: str = Field(default="sentence-transformers/paraphrase-minilm-l6-v2")

    @model_validator(mode="after")
    def validate_environment(self) -> "OpenRouterEmbeddings":
        """Validate the environment and set up the OpenRouter client."""
        _api_key: SecretStr | str = self.openrouter_api_key or os.getenv(
            "OPENROUTER_API_KEY", ""
        )
        if not _api_key:
            raise ValueError(
                "OpenRouter API key not found. Please set the OPENROUTER_API_KEY environment variable."
            )

        if isinstance(_api_key, str):
            _api_key = convert_to_secret_str(_api_key)

        # Set up the OpenRouter client if not already set
        self.client = OpenRouterClient(
            api_key=_api_key.get_secret_value(),  # type: ignore
            default_model=self.model,
        )
        self.aclient = AsyncOpenRouterClient(
            api_key=_api_key.get_secret_value(),  # type: ignore
            default_model=self.model,
        )
        return self

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed search docs."""
        response: dict[str, Any] = self.client.embeddings.create(
            input=texts, model=self.model
        )
        return [emb["embedding"] for emb in response["data"]]

    def embed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return self.embed_documents([text])[0]

    async def aembed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed search docs."""
        response: dict[str, Any] = await self.aclient.aembeddings.create(
            input=texts, model=self.model
        )

        return [emb["embedding"] for emb in response["data"]]

    async def aembed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return (await self.aembed_documents([text]))[0]


embeddings = OpenRouterEmbeddings()
result = await embeddings.aembed_documents(texts=["Hello there!"])

In [150]:
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

client = QdrantClient(":memory:")

vector_size = len(await embeddings.aembed_query("sample text"))
collection_name: str = "smart_rag_collection"

if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )
vectorstore = QdrantVectorStore(
    client=client,
    collection_name=collection_name,
    embedding=embeddings,
)
# Embed all the documents
document_ids: list[str] = await vectorstore.aadd_documents(
    documents=doc_chunks_with_metadata
)
print(document_ids[:3])

['f8b6ffbb6a3f49fbbabb647d70c70e43', '6097c626f8e94748a15014944780ebc8', '99327f7abf584fdbb33aa73e32d61cd7']


In [148]:
doc_chunks_with_metadata[0].model_dump()

{'id': None,
 'metadata': {'source_doc': 'cleaned_doc.txt',
  'section': 'ITEM 1. BUSINESS',
  'chunk_id': 'c1cb8dcb-9d52-49ba-890e-05fedbe38d83'},
 'page_content': 'Our Company\n\nNVIDIA pioneered accelerated computing to help solve the most challenging computational problems. Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields. Fueled by the sustained demand for exceptional 3D graphics and the scale of the gaming market, NVIDIA has leveraged its GPU architecture to create platforms for scientific computing, artificial intelligence, or AI, data science, autonomous vehicles, or AV, robotics, metaverse and 3D internet applications.',
 'type': 'Document'}

In [168]:
from qdrant_client import QdrantClient, models

query: str = "Why was the 2023 effective tax rate lower than 2022?"

retrieved_docs = vectorstore.similarity_search(
    query,
    k=2,
    filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.section",
                match=models.MatchValue(value="ITEM 1. BUSINESS"),
            )
        ]
    ),
)
formatted_docs: str = "\n\n".join(
    (f"Source: {doc.metadata}\nContent: {doc.page_content}") for doc in retrieved_docs
)

console.print(formatted_docs)

### Create States

<br>

#### 1.) Step

- The smallest unit. Multiple Steps make up a Plan.
- A Step has:
  - question: The question being asked.
  - rationale: The reasoning behind the question.
  - tool: The tool to be used to answer the question. (e.g. web_search or vectorstore_lookup)
  - search_keywords: Keywords to use for searching.
  - target_section: The section of the document to focus on. (Only for vectorstore_lookup tool)


In [None]:
from typing import TypedDict


class Step(BaseModel):
    """A single step in the multi-step reasoning process."""

    question: str = Field(..., description="The question to be answered by the step.")
    rationale: str = Field(..., description="The brief reasoning behind the question.")
    tool: Literal["web_search", "vector_store", "hybrid_search"] = Field(
        ..., description="The tool to use for this step."
    )
    search_keywords: list[str] = Field(
        ...,
        description="Critical keywords and phrases to use for web search or vector store "
        "retrieval to ensure quality results are returned.",
    )
    target_section: str | None = Field(
        default=None,
        description="The target section in the document to focus on. This is ONLY required when "
        "the tool is 'vector_store'. e.g., 'ITEM 1A. RISK FACTORS'.",
    )

#### 2.) Plan

- A Plan is a sequence of Steps to achieve a goal.


#### 2.b) PastStep

- This is used to store the history of executed steps in the plan.


#### 3.) State

- A State represents the current status of the RAG process. It includes:
  - original_question: The initial question posed by the user.
  - plan: The current plan being executed.
  - past_steps: A list of PastStep objects representing the history of executed steps.
  - current_step_index: The index of the current step in the plan.
  - retrieved_docs: A list of Document objects that have been retrieved so far.
  - reranked_docs: A list of Document objects that have been reranked based on relevance.
  - synthesized_content: The content synthesized from the reranked documents.
  - final_answer: The final answer generated for the original question.

In [None]:
class Plan(BaseModel):
    """A multi-step plan for answering a complex question."""

    steps: list[Step] = Field(
        ..., description="A list of steps to execute in the plan."
    )


class PastStep(TypedDict):
    """Record of a completed step in the multi-step plan."""

    step_index: int  # Index of the step in the plan
    question: str  # The question asked in this step
    retrieved_documents: list[Document]  # Documents retrieved for this step
    summary: str  # Summary of the step's findings


class RetrievalDecision(BaseModel):
    """Decision made during document retrieval."""

    tool: Literal["web_search", "vector_store", "hybrid_search"] = Field(
        ..., description="Tool used for retrieval"
    )
    rationale: str = Field(
        ..., description="The brief reason for the retrieval decision"
    )


class State(TypedDict):
    """State of the multi-step reasoning process."""

    original_question: str  # The original complex question
    plan: Plan  # The multi-step plan
    past_steps: list[PastStep]  # List of completed steps
    current_step_index: int  # Index of the current step being executed
    retrieved_documents: list[Document]  # Documents retrieved in the current step
    reranked_documents: list[Document]  # Documents reranked based on relevance
    synthesized_context: str  # Synthesized context from reranked documents
    final_answer: str  # The final answer to the original question

In [190]:
"""This module contains prompt templates used for various interactions within the application."""

planner_prompt: str = """
<SYSTEM>
    <ROLE>
        You are an expert query planner specializing in multi-step reasoning.
        Your task is to decompose complex user questions into a logical sequence of steps that answer the original query.
        Each step should be clear, actionable, and designed to retrieve relevant information using appropriate tools.
    </ROLE>

    <GUIDELINES>
        <ANALYSIS>
            - Thoroughly analyze the user's question to understand its core intent and information requirements
            - Determine the logical order of information gathering for optimal answer synthesis
        </ANALYSIS>

        <PLANNING>
            - Create a multi-step plan where each step builds upon previous findings
            - Ensure steps are atomic: each should focus on one specific aspect of the query
            - Minimize redundancy: avoid overlapping information retrieval across steps
            - Order steps logically: foundational information first, specific details later
            - Limit plans to 3-5 steps for most queries (use more only if truly necessary)
        </PLANNING>

        <STEP_RULES>
            Each step MUST contain:

            - question: A clear, self-contained sub-question that addresses one aspect of the original query
              * Should be specific and focused
              * Must be answerable with the chosen tool
              * Should build on or complement previous steps

            - rationale: A brief explanation (1-2 sentences) of why this step is necessary and how it contributes to 
            answering the original query

            - tool: The retrieval tool best suited for this step
              * vector_store: Use for {company} document-specific information, facts from the corpus, or domain knowledge
              * web_search: Use for real-time information, external context, or data not in the document corpus
              * hybrid_search: Use when both document corpus and external sources are needed

            - search_keywords: 3-7 highly relevant keywords or phrases that optimize retrieval quality
              * Include domain-specific terms, technical jargon, and key entities
              * Prioritize precision over generic terms

            - target_section: (REQUIRED when tool is "vector_store", null otherwise)
              * Specify the exact document section to search (e.g. "ITEM 1A. RISK FACTORS", "ITEM 7. MANAGEMENT'S DISCUSSION")
              * Use standardized section names that match the document structure
        </STEP_RULES>
    </GUIDELINES>

    <SECTIONS>
        {section_titles}
    </SECTIONS>

    <AVAILABLE_TOOLS>
        - vector_store
        - web_search
        - hybrid_search
    </AVAILABLE_TOOLS>

    <OUTPUT_FORMAT>
        Return a structured Plan object containing a list of Step objects, each with all required fields properly populated.
    </OUTPUT_FORMAT>

</SYSTEM>
"""

In [181]:
hello_prompt: str = """{section_titles}"""
hello_prompt.format(section_titles=" | ".join(section_titles))

'ITEM 1. BUSINESS | ITEM 1A. RISK FACTORS | ITEM 1B. UNRESOLVED STAFF COMMENTS | ITEM 2. PROPERTIES | ITEM 3. LEGAL PROCEEDINGS | ITEM 4. MINE SAFETY DISCLOSURES | ITEM 5. MARKET FOR REGISTRANT’S COMMON EQUITY, RELATED STOCKHOLDER MATTERS AND ISSUER PURCHASES OF EQUITY SECURITIES | ITEM 6. [RESERVED] | ITEM 7. MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION AND RESULTS OF OPERATIONS | ITEM 7A. QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK | ITEM\xa08. FINANCIAL STATEMENTS AND SUPPLEMENTARY DATA | ITEM\xa09. CHANGES IN AND DISAGREEMENTS WITH ACCOUNTANTS ON ACCOUNTING AND FINANCIAL DISCLOSURE | ITEM\xa09A. CONTROLS AND PROCEDURES | ITEM 9C. DISCLOSURE REGARDING FOREIGN JURISDICTIONS THAT PREVENT INSPECTIONS | ITEM 10. DIRECTORS, EXECUTIVE OFFICERS AND CORPORATE GOVERNANCE | ITEM 11. EXECUTIVE COMPENSATION | ITEM 12. SECURITY OWNERSHIP OF CERTAIN BENEFICIAL OWNERS AND MANAGEMENT AND RELATED STOCKHOLDER MATTERS | ITEM 13. CERTAIN RELATIONSHIPS AND RELATED TRANSACTI

In [173]:
# utils.py

from enum import Enum

import instructor
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from openai import AsyncOpenAI

from src.config import app_settings

# from src.schemas.types import OpenRouterModels, PydanticModel


class OpenRouterModels(str, Enum):
    """OpenRouter LLMs."""

    GEMINI_2_0_FLASH_LITE = "google/gemini-2.0-flash-001"
    GEMINI_2_5_FLASH = "google/gemini-2.5-flash"
    GEMINI_2_5_FLASH_LITE = "google/gemini-2.5-flash-lite"
    GPT_OSS_120B = "openai/gpt-oss-120b"
    GPT_OSS_20B = "openai/gpt-oss-20b"
    GPT_5_NANO = "openai/gpt-5-nano"
    LLAMA_3_3_70B_INSTRUCT = "meta-llama/llama-3.3-70b-instruct"
    LLAMA_3_8B_INSTRUCT = "meta-llama/llama-3-8b-instruct"
    NEMOTRON_NANO_9B_V2 = "nvidia/nemotron-nano-9b-v2"
    QWEN3_30B_A3B = "qwen/qwen3-30b-a3b"
    QWEN3_NEXT_80B_A3B_INSTRUCT = "qwen/qwen3-next-80b-a3b-instruct"
    QWEN3_32B = "qwen/qwen3-32b"
    SAO10K_L3_LUNARIS_8B = "sao10k/l3-lunaris-8b"
    X_AI_GROK_4_FAST = "x-ai/grok-4-fast"
    X_AI_GROK_CODE_FAST_1 = "x-ai/grok-code-fast-1"
    Z_AI_GLM_4_5 = "z-ai/glm-4.5"


_async_client = AsyncOpenAI(
    api_key=app_settings.OPENROUTER_API_KEY.get_secret_value(),
    base_url=app_settings.OPENROUTER_URL,
)

aclient = instructor.from_openai(
    _async_client,
    mode=instructor.Mode.OPENROUTER_STRUCTURED_OUTPUTS,
)
type PydanticModel = type[BaseModel]


async def get_structured_output(
    messages: list[dict[str, Any]],
    model: OpenRouterModels | None,
    schema: PydanticModel,
) -> PydanticModel:
    """
    Retrieves structured output from a chat completion model.

    Parameters
    ----------
    messages : list[dict[str, Any]]
        The list of messages to send to the model for the chat completion.
    model : RemoteModel
        The remote model to use for the chat completion (e.g., 'gpt-4o').
    schema : PydanticModel
        The Pydantic schema to enforce for the structured output.

    Returns
    -------
    BaseModel
        An instance of the provided Pydantic schema containing the structured output.

    Notes
    -----
    This is an asynchronous function that awaits the completion of the API call.
    """
    model = model if model else OpenRouterModels.GEMINI_2_0_FLASH_LITE

    return await aclient.chat.completions.create(
        model=model,
        response_model=schema,
        messages=messages,  # type: ignore
        max_retries=5,
    )


def convert_langchain_messages_to_dicts(
    messages: list[HumanMessage | SystemMessage | AIMessage],
) -> list[dict[str, str]]:
    """Convert LangChain messages to a list of dictionaries.

    Parameters
    ----------
    messages : list[HumanMessage | SystemMessage | AIMessage]
        List of LangChain message objects to convert.

    Returns
    -------
    list[dict[str, str]]
        List of dictionaries with 'role' and 'content' keys.
        Roles are mapped as follows:
        - HumanMessage -> "user"
        - SystemMessage -> "system"
        - AIMessage -> "assistant"

    """
    role_mapping: dict[str, str] = {
        "SystemMessage": "system",
        "HumanMessage": "user",
        "AIMessage": "assistant",
    }

    converted_messages: list[dict[str, str]] = []
    for msg in messages:
        message_type: str = msg.__class__.__name__
        # Default to "user" if unknown
        role: str = role_mapping.get(message_type, "user")
        converted_messages.append({"role": role, "content": msg.content})  # type: ignore

    return converted_messages


def append_memory(existing: dict[str, Any], new: dict[str, Any]) -> dict[str, Any]:
    """Merge new memory data into existing memory, appending lists and merging dicts.

    Parameters
    ----------
    existing: dict[str, Any]
        The existing memory data.
    new: dict[str, Any]
        The new memory data to merge.

    Returns
    -------
    dict[str, Any]
        The merged memory data.
    """
    result: dict[str, Any] = existing.copy()

    for key, new_value in new.items():
        # Skip None or empty values
        if new_value is None or new_value == "" or new_value == []:
            continue

        existing_value = result.get(key)

        # If key doesn't exist, just add it
        if existing_value is None:
            result[key] = new_value
            continue

        # Lists: combine and remove duplicates
        if isinstance(new_value, list):
            combined = existing_value + new_value
            # Preserve order, remove duplicates
            # ["a", "b", "a"] -> ["a", "b"]
            result[key] = list(dict.fromkeys(combined))

        # Dicts: merge
        elif isinstance(new_value, dict):
            result[key] = {**existing_value, **new_value}

        # Everything else: new value overwrites
        else:
            result[key] = new_value

    return result

In [191]:
async def generate_plan(state: State) -> State:
    company: str = "NVIDIA"
    user_question: str = state["original_question"]
    user_query: str = f"<USER_QUESTION>{user_question}</USER_QUESTION>"

    query = planner_prompt.format(
        company=company,
        user_question=user_question,
        section_titles=" | ".join(section_titles),
    )
    llm_with_structure = remote_llm.with_structured_output(Plan)
    response = await llm_with_structure.ainvoke(
        [SystemMessage(content=query), HumanMessage(content=user_query)]
    )

    return State(
        original_question=state["original_question"] or "",
        plan=response,
        past_steps=state["past_steps"] or [],
        current_step_index=state["current_step_index"] or 0,
        retrieved_documents=state["retrieved_documents"] or [],
        reranked_documents=state["reranked_documents"] or [],
        synthesized_context=state["synthesized_context"] or "",
        final_answer=state["final_answer"] or "",
    )

In [192]:
user_query: str = "Why are the risks associated with Nvidia in 2025?"
state: State = {
    "original_question": user_query,
    "plan": None,
    "past_steps": [],
    "current_step_index": 0,
    "retrieved_documents": [],
    "reranked_documents": [],
    "synthesized_context": "",
    "final_answer": "",
}
response = await generate_plan(state)

In [193]:
console.print(response["plan"])