In [None]:
# Master switch for the optional sanity-check cells: every `if DEBUG:` cell in
# this notebook is skipped while this is False.
DEBUG = False

In [None]:
import os, torch, json, re, numpy as np

# local model instantiation
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_huggingface import HuggingFacePipeline

# agent
from langchain import hub
from langchain.agents import create_structured_chat_agent, AgentExecutor
from langchain_community.llms.utils import enforce_stop_tokens
from langchain_core.runnables import Runnable


## Qwen3 setup

### What create_react_agent expects 

`create_react_agent` uses LangChain's ReAct output parser, `ReActSingleInputOutputParser`. This parser is very strict: every LLM turn must be structured as one of the following:

Thought: ...
Action: <tool_name>
Action Input: ...
or

Thought: ...
Final Answer: ...
not both. Both Qwen3-1.7B and Qwen3-8B sometimes produce both an Action and Final Answer in one message, causing the parser to throw an OutputParserException. This looks like:

```
OutputParserException: Parsing LLM output produced both a final answer and a parse-able action.
```

### How regex pattern matching works in our pipeline

1. re.escape: Scans a string and prefixes every character that has a special meaning in regular expression syntax (`. ^ $ * + ? { } [ ] \`) with a backslash. This ensures those metacharacters are matched literally instead of being interpreted as regex operators when the pattern is compiled.

*WITHOUT* using re.escape("1+"):

```python
>>> import re
>>> text = "Here’s a math fact: 1+11=2? Cool, right?"
>>> re.split("1+", text) # match one or more "1"s
["Here’s a math fact: ", "+", "=2? Cool, right?"]
```

*USING* re.escape("1+"):

```python
>>> import re
>>> text = "Here’s a math fact: 1+11=2? Cool, right?"
>>> re.split(re.escape("1+"), text) # now we're searching for the literal "1\\+"
["Here’s a math fact: ", "11=2? Cool, right?"]
```

2. ```map(re.escape, self.stop)```: applies re.escape to each stop string
3. ```"|".join(map(re.escape, self.stop))```: joins the escaped stop tokens with |, producing a regex “or” expression. If self.stop = ["Obs:", "Observation:"], you’d get r"Obs:|Observation:"
4. ```re.split(pattern, text)```: actually splits the text at each pattern

Example:

```python
text = (
    "The chemical reaction proceeded without incident. "
    "Observation: the solution turned bright blue. "
    "We collected three samples for further analysis."
)
```
produces

```python
[
  "The chemical reaction proceeded without incident. ",
  " the solution turned bright blue. We collected three samples for further analysis."
]
```

### TL;DR

The below method using the shadow class for HuggingFacePipeline follows the streaming approach:

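In LangChain's streaming path, each stop string is converted to its token ID, wrapped in a custom StoppingCriteria, and passed into model.generate before generation starts. The moment the last token of any stop string appears, generation terminates — no extra tokens are produced, saving compute/time. On top of that, the `hf_stop` class below trims the text itself with a regex: `invoke()` cuts the finished completion at the first stop string, and `stream()` stops yielding chunks once a stop string shows up in the accumulated text.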

<u>Section refs</u>:

https://python.langchain.com/api_reference/huggingface/llms/langchain_huggingface.llms.huggingface_pipeline.HuggingFacePipeline.html

source code for huggingFace pipeline:

https://python.langchain.com/api_reference/_modules/langchain_huggingface/llms/huggingface_pipeline.html

In [None]:
# Location of the local Qwen3-8B checkpoint. Set the QWEN_MODEL_PATH environment
# variable to run the notebook on another machine without editing this cell;
# the default is the original cluster path.
model_path = os.environ.get(
    "QWEN_MODEL_PATH",
    "/storage/ice1/6/7/dharden7/rag_models/Qwen3-8B",
)

# trust_remote_code flag allows from_pretrained to execute custom class
# definitions shipped alongside the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)

# Generation settings; kept as separate names because the DEBUG bind() cell
# further down reuses them.
max_tokens = 512
temp = 0.1
top_p = 0.9

gen_cfg = dict(max_new_tokens=max_tokens, temperature=temp, top_p=top_p)

# create an hf text gen pipeline
hf_pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, **gen_cfg)

# We want the LLM pipeline to stop generating tokens after "Observation:", so that
# the tool call reaches the LLM and is not hallucinated.
STOP_STRINGS = ["\nObservation:", "\n[ANSWER END]"]

# shadow HuggingFacePipeline to fix _trim class - see stack overflow
class hf_stop(HuggingFacePipeline):
    """HuggingFacePipeline subclass that cuts generated text at the first stop string.

    `stop` is a list of literal strings (they are regex-escaped, not treated as
    patterns). `invoke()` trims the finished completion; `stream()` stops
    yielding as soon as a stop string shows up in the accumulated output.
    With no stop strings configured the text passes through unchanged.
    """

    def __init__(self, pipeline, stop=None, **kwargs):
        super().__init__(pipeline=pipeline, **kwargs)

        # Normally, Pydantic complains if we define attributes after super().__init__
        # bypass pydantic and assign stop to underlying dict
        object.__setattr__(self, "stop", stop)

    def _stop_strings(self):
        """Return the configured stop strings as a list, skipping empty entries."""
        return [s for s in (self.stop or ()) if s]

    def _trim(self, text: str) -> str:
        """Return `text` up to, but not including, the first stop string."""
        stops = self._stop_strings()
        if not stops:
            # An empty pattern matches at every position, which would reduce
            # the text to "" — leave it untouched instead.
            return text
        pattern = "|".join(map(re.escape, stops))
        return re.split(pattern, text, maxsplit=1)[0]

    def _partial_stop_len(self, text: str) -> int:
        """Length of the longest suffix of `text` that is a proper prefix of a stop string."""
        longest = 0
        for stop_str in self._stop_strings():
            upper = min(len(stop_str) - 1, len(text))
            # only look for candidates longer than the best one found so far
            for size in range(upper, longest, -1):
                if text.endswith(stop_str[:size]):
                    longest = size
                    break
        return longest

    def invoke(self, input, config=None, **kwargs):
        """Run the wrapped LLM and return its text cut at the first stop string."""
        # forward `config` so callbacks/tags supplied by the caller are not lost
        text = super().invoke(input, config=config, **kwargs)
        return self._trim(text)

    def stream(self, input, config=None, **kwargs):
        """Yield the wrapped LLM's output piece by piece, ending at the first stop string.

        Iterates over each token-chunk produced by the wrapped LLM's own stream()
        method. A chunk here is a string such as "The", "dog", "barked", depending
        on tokenization. Whatever is yielded here is sent to the AgentExecutor.

        A trailing piece of text that could still turn into a stop string (for
        example "\\nObserv" when "\\nObservation:" is a stop string) is held back
        until the next chunk settles it, so a stop string split across chunks is
        never partially emitted. Chunk boundaries may therefore differ from the
        wrapped stream's, but the concatenated text is the same.
        """
        acc = ""       # everything received so far
        emitted = 0    # how many characters of `acc` were already yielded
        for chunk in super().stream(input, config=config, **kwargs):
            acc += chunk
            trimmed = self._trim(acc)
            if trimmed != acc:
                # a stop string is complete: flush what precedes it and finish
                if len(trimmed) > emitted:
                    yield trimmed[emitted:]
                return
            safe_end = len(acc) - self._partial_stop_len(acc)
            if safe_end > emitted:
                yield acc[emitted:safe_end]
                emitted = safe_end
        # stream ended without a stop string: release anything still held back
        if len(acc) > emitted:
            yield acc[emitted:]

# Wrap the HF pipeline so its output is cut at the first occurrence of any string in STOP_STRINGS.
llm = hf_stop(pipeline=hf_pipe, stop=STOP_STRINGS)



In [None]:
# quick sanity test (DEBUG only): print what the stop-trimming wrapper `llm`
# returns for a hand-written ReAct-style prompt
if DEBUG:
    print(llm.invoke("Thought: X\nAction: Y\nAction Input: Z\nObservation: SHOULD NOT APPEAR"))

### Alternative but slower approach to stop_tokens

When you call llm.bind(stop=[...]).invoke(...) (the non-streaming pathway) the model still generates the whole completion; LangChain just slices the returned string afterward. Only the streaming methods (stream()/astream()) actually halt token generation mid-flight.

#### Why this is less efficient

With `llm.bind(...).invoke()` you’re paying for the full 512-token generation — even though you only keep the prefix — whereas with `stream()` you’d stop the GPU as soon as "Observation:" is emitted.

<u>Relevant documentation</u>:

1. https://python.langchain.com/docs/how_to/streaming/
2. https://python.langchain.com/api_reference/core/runnables/langchain_core.runnables.base.Runnable.html
3. https://python.langchain.com/docs/how_to/binding/

In [None]:
if DEBUG:
    stop = ["\nObservation"]

    # HuggingFacePipeline only forwards generation settings that arrive under
    # the `pipeline_kwargs` key; bare max_new_tokens/temperature/top_p keyword
    # arguments are accepted by bind() but never reach the underlying pipeline.
    llm_to_use = llm.bind(
        stop=stop,
        pipeline_kwargs=dict(
            max_new_tokens=max_tokens,
            temperature=temp,
            top_p=top_p,
        ),
    )

In [None]:
# quick sanity test (DEBUG only): same prompt as the earlier check, but sent
# through `llm_to_use`, which is only defined when the DEBUG bind() cell ran
if DEBUG:
    print(llm_to_use.invoke("Thought: X\nAction: Y\nAction Input: Z\nObservation: SHOULD NOT APPEAR"))

## Chroma on disk persistent vector DB

<u>Relevant documentation</u>:

1. https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.huggingface.HuggingFaceEmbeddings.html
2. https://python.langchain.com/api_reference/core/vectorstores/langchain_core.vectorstores.in_memory.InMemoryVectorStore.html

### Imports

In [None]:
# general - DEBUG
# Environment switches intended to turn telemetry off before the vector-store
# libraries below are imported.
# NOTE(review): presumably both libraries read these variables at import/start-up — confirm.

os.environ["ANONYMIZED_TELEMETRY"] = "FALSE"
os.environ["LANGCHAIN_TELEMETRY"] = "false"

# vector store
from langchain_core.vectorstores import InMemoryVectorStore
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import logging, warnings
from chromadb.config import Settings
import chromadb

### What is Telemetry?

In this context, telemetry refers to Chroma’s built-in instrumentation that automatically collects and (normally) reports anonymous usage data back to its maintainers. Typical telemetry events include things like:

* ClientStartEvent – “My library has just been initialized.”

* ClientCreateCollectionEvent – “A new collection (i.e. a new vector index) was created.”

* CollectionQueryEvent, CollectionGetEvent, etc. – “The client ran a query” or “fetched some documents.”

The idea is to help the Chroma developers understand how the library is being used in the wild—e.g. what features are popular, how often people hit the disk, what query volumes look like, and so on.

### How is it used in Chromadb?

**Chromadb ships with a “no-op” telemetry function.**
To avoid pulling in a full telemetry dependency, the Python side of Chromadb provides its own capture function that simply does nothing. However, it was implemented to accept only one argument (say, an event object).

**The Rust backend calls capture() with three arguments.**
When you initialize the client or create a collection, the Rust code does something like:

```python
// pseudocode
Python.capture("ClientStartEvent", payload_dict, metadata_dict)
```

That ends up invoking the Python stub as capture(arg1, arg2, arg3), but since that stub only takes one parameter, Python immediately raises:

```python
capture() takes 1 positional argument but 3 were given
```


### Why we disable it
* Privacy: we don't want our usage patterns sent anywhere.

* Noise: If the telemetry hook is broken (as in this case), it pollutes our console with errors.

* On-premises use: In our environment without internet access, telemetry can’t actually send data and just produces errors.

### How Chroma's local DB works

When we do this:

```python
from langchain_chroma import Chroma
db = Chroma(persist_directory='/path/to/db', embedding_function=embedder)
```

here's what happens under the hood:

1. The langchain_chroma wrapper talks to the chromadb Python package, which in turn uses a Rust library compiled into your Python process (no separate server process by default).
2. That Rust code implements both the on‐disk storage layer and the vector search algorithms.
3. A set of files appears in persist_directory—by default a SQLite (or DuckDB) database plus some binary files for the index.
4. One file holds your documents and metadata (often as JSON blobs or tables), another holds the embeddings and the nearest‐neighbor index (e.g. an HNSW graph).
5. When you first call Chroma(...), it either:
    * Creates a new “collection” by initializing empty tables and index files, or
    * Loads an existing collection by opening those files and preparing the in-memory structures (like loading the HNSW graph into RAM).

#### Additional Notes
* Everything lives on your local disk and memory. Queries (e.g. similarity_search) go directly against the Rust library in your process, which:
    1. Embeds your query via your embedding_function.
    2. Runs a nearest-neighbor search in the HNSW (or flat) index.
    3. Retrieves the top-k document IDs and then returns the corresponding text + metadata from the on-disk tables.
* Persistence on demand
    * When you call db._client.persist() (or the older db.persist()), it flushes any new embeddings, documents, and index updates from memory back to those files.
    * On your next program run, Chroma(...) simply reopens those files and you get instant search without recomputing embeddings.

<u>Relevant documentation</u>:

* Suppressing warnings/logging

1. https://pypdf.readthedocs.io/en/stable/user/suppress-warnings.html
2. https://docs.python.org/3/library/warnings.html

* Local Chromadb client

1. https://docs.trychroma.com/docs/run-chroma/persistent-client
2. https://python.langchain.com/docs/integrations/vectorstores/chroma/

* Recursive text splitting
1. https://python.langchain.com/docs/concepts/text_splitters/
2. https://python.langchain.com/docs/how_to/recursive_text_splitter/

* PyPDF Loader
1. https://python.langchain.com/docs/integrations/document_loaders/pypdfloader/
2. https://python.langchain.com/docs/how_to/document_loader_pdf/

* Sim search/retrieval
1. https://python.langchain.com/docs/how_to/vectorstore_retriever/
2. https://python.langchain.com/docs/tutorials/retrievers/

In [None]:
# (optional) hide any pypdf UserWarnings too
logging.getLogger("pypdf").setLevel(logging.ERROR)
warnings.filterwarnings("ignore", category=UserWarning, module="pypdf")

# any Sentence-Transformers model works; mini-LM is light & decent
# fall back to the CPU so the cell still runs on a machine without a GPU
embedder = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

persist_dir = Path("/home/hice1/dharden7/deepsearch/langchain/chroma_db")
persist_dir.mkdir(parents=True, exist_ok=True)

pdf_dir = Path("/home/hice1/dharden7/deepsearch/langchain/papers")

# One persistent client, pointed at persist_dir and created with telemetry
# disabled, is handed to the LangChain wrapper. (Creating the client without a
# path and not passing it to Chroma would leave `db` on its own default
# settings and create an unrelated store in the working directory.)
client = chromadb.PersistentClient(
    path=str(persist_dir),
    settings=Settings(anonymized_telemetry=False),
)

db = Chroma(
    client=client,
    embedding_function=embedder,
)

# db.get() returns a dict of parallel lists, so len() of the dict itself only
# counts its keys; emptiness has to be judged from the "ids" list.
db_is_empty = len(db.get(limit=1)["ids"]) == 0

if db_is_empty:
    print('DB is empty. Ingesting documents...\n')

    texts, metadata = [], []

    # sorted() makes the ingestion order reproducible across runs
    for pdf_path in sorted(pdf_dir.glob("*.pdf")):
        pages = PyPDFLoader(str(pdf_path)).load_and_split(text_splitter=splitter)
        texts.extend([page.page_content for page in pages])
        metadata.extend([page.metadata for page in pages])

    if texts:
        db.add_texts(texts=texts, metadatas=metadata)
    else:
        # nothing to add: skip add_texts rather than calling it with empty lists
        print(f"No PDF chunks found under {pdf_dir}; nothing was ingested.")

    # Only clients that still expose an explicit persist() need the call;
    # skip it when the method is absent instead of raising AttributeError.
    persist = getattr(db._client, "persist", None)
    if callable(persist):
        persist()
else:
    print('Loaded db from disk.')

# print total chunks in db - DEBUG
all_data = db.get(include=["documents", "metadatas", "embeddings"])
print(f"Num stored chunks: {len(all_data['documents'])}")

In [None]:
# Smoke-test retrieval directly against the vector store.

retriever = db.as_retriever()

query = "Give me a list of everyone on the thesis committee from the paper titled 'COLLISION INDUCED SELF ORGANIZATION IN SHAPE CHANGING ROBOTS'."
results = db.similarity_search(query, k=3)

# For each hit (numbered from 1): a 500-character preview followed by its metadata.
for i, doc in enumerate(results, start=1):
    print(f"\nHit #{i}\n{doc.page_content[:500]} ...\nMetadata: {doc.metadata}")


### Simple ReAct agent



<u>Relevant documentation</u>:

* Creating ReAct agent

1. https://python.langchain.com/docs/tutorials/agents/
2. https://python.langchain.com/api_reference/langchain/agents/langchain.agents.react.agent.create_react_agent.html

* Agent Orchestration (AgentExecutor)

1. https://python.langchain.com/api_reference/langchain/agents/langchain.agents.agent.AgentExecutor.html

* Prompting: Hub + PromptTemplate

1. https://smith.langchain.com/hub/hwchase17/react
2. https://python.langchain.com/api_reference/core/prompts/langchain_core.prompts.prompt.PromptTemplate.html

* Custom agent output parser

1. https://python.langchain.com/docs/how_to/output_parser_custom/
2. https://api.python.langchain.com/en/latest/exceptions/langchain_core.exceptions.OutputParserException.html

* Creating/using tools

1. https://python.langchain.com/docs/concepts/tools/
2. https://python.langchain.com/api_reference/core/tools.html
3. Retriever tool - https://python.langchain.com/api_reference/core/tools/langchain_core.tools.retriever.create_retriever_tool.html

* Using regex for parsing

1. https://docs.python.org/3/library/re.html
2. https://docs.python.org/3/howto/regex.html

In [None]:
from langchain.agents import create_react_agent, AgentExecutor
from langchain import hub  
from langchain.prompts import PromptTemplate

# ReAct parser override
import re
from langchain.schema import AgentAction, AgentFinish
from langchain.agents.agent import AgentOutputParser
from langchain.agents.react.output_parser import ReActOutputParser

In [None]:
class QwenReActParser(AgentOutputParser):
    """Lenient ReAct output parser for Qwen3 completions.

    parse() checks, in this order:
      1. "Final Answer: ... [END ANSWER]"  -> AgentFinish with the enclosed text
      2. "Final Answer: ..." (no marker)   -> AgentFinish with the first line only
      3. "Action: ... / Action Input: ..." -> AgentAction
    and raises OutputParserException when none of them applies. A final answer
    therefore wins when a completion contains both a final answer and an action.
    """

    # "Action:" / "Action Input:" block
    # \s*(\w+) -> skip spaces, capture the tool name (e.g. pdf_search) in group 1
    # \s*[\n\r]+ -> skip any trailing spaces, then at least one newline
    # (.*?)\s*$ -> capture the tool input up to the end of the text in group 2

    _act_re = re.compile(
        r"Action:\s*(\w+)\s*[\n\r]+Action Input:\s*(.*?)\s*$",
        re.I | re.S,
    )

    # "Final Answer:" block
    # non-greedy capture (.*?) between the two markers
    # \s* allows for newline / spaces before the [END ANSWER]
    _final_re = re.compile(
        r"Final Answer:\s*(.*?)\s*\[END ANSWER\]",
        re.I | re.S,
    )

    # if [END ANSWER] is missing grab rest of text
    _fallback_re = re.compile(
        r"Final Answer:\s*(.*)",
        re.I | re.S,
    )

    def parse(self, text: str):
        """Turn one LLM completion into an AgentFinish or an AgentAction.

        Raises:
            OutputParserException: if `text` holds neither a non-empty final
                answer nor an "Action:" / "Action Input:" pair.
        """
        # Imported here so the parser does not depend on a later cell having
        # brought the exception into the notebook namespace.
        from langchain_core.exceptions import OutputParserException

        # block with END marker
        m = self._final_re.search(text)
        if m:
            answer = m.group(1).strip()
            return AgentFinish({"output": answer}, log=text)

        # "Final Answer:" exists but no [END ANSWER] marker -> keep the first line
        m_fb = self._fallback_re.search(text)
        if m_fb:
            # group(1) is empty when nothing follows "Final Answer:"; splitlines()
            # then returns [], so guard before indexing.
            answer_lines = m_fb.group(1).splitlines()
            answer = answer_lines[0].strip() if answer_lines else ""
            if answer:
                return AgentFinish({"output": answer}, log=text)
            # an empty answer is not a usable finish: fall through to the checks below

        # "Action:" / "Action Input:" block
        # i.e group 0: 'Action: pdf_search\nAction Input: "COLLISION INDUCED SELF ORGANIZATION IN SHAPE CHANGING ROBOTS"'
        # group(1): 'pdf_search'
        # group(2): '"COLLISION INDUCED SELF ORGANIZATION IN SHAPE CHANGING ROBOTS"'
        m_act = self._act_re.search(text)
        if m_act:
            return AgentAction(
                tool=m_act.group(1).strip(),
                # drop surrounding double quotes the model tends to add
                tool_input=m_act.group(2).strip().strip('"'),
                log=text,
            )

        raise OutputParserException(f"Could not parse LLM output:\n{text}")


In [None]:
from langchain.tools import Tool
from langchain_core.tools import create_retriever_tool
from langchain_core.exceptions import OutputParserException

# Used when the agent strays from the expected
# Thought - Action - Action Input - Final Answer layout: the text returned here
# reminds it how a tool call and a final answer must each be formatted.
def fix_format(error: Exception) -> str:
    """Return the fixed format reminder shown to the model after a parsing error.

    The `error` argument is accepted but not inspected; the same message is
    returned for every failure.
    """
    tool_turn = "\n".join([
        "When you want to use a tool, reply ONLY with:",
        "Thought: <your thought>",
        "Action: <tool name>",
        "Action Input: <input>",
        "Do NOT put Final Answer in that same response.",
    ])
    final_turn = "\n".join([
        "When you are completely finished and need no more tools, reply ONLY with:",
        "Thought: <your thought>",
        "Final Answer: <answer>",
    ])
    return f"FORMAT ERROR: {tool_turn}\n{final_turn}\n"

# Extra output rules for Qwen3. Python concatenates adjacent string literals
# with nothing in between, so each sentence carries its own trailing space.
extra_rule = (
    "\nIMPORTANT:\n"
    "After you emit a line that begins with 'Final Answer:' **You must output nothing else.** "
    "Do NOT add any further Thought, Action, or Observation. "
    # "NEVER repeat the rules given to you in your Final Answer."
    "Keep your 'Final Answer' as concise as possible while providing all relevant details and explanations. "
    "Once you are satisfied with your final answer, write '\n[END ANSWER]\n[ANSWER END]'."
)

# create base prompt for the ai agent
base_prompt: PromptTemplate = hub.pull("hwchase17/react")

rules = "\nNEVER write “Final Answer:” until AFTER you have read an Observation." + extra_rule

# The hub template is expected to end with the question and the running
# scratchpad ("Begin! ... Thought:{agent_scratchpad}"). Text appended to the very
# end would land after the scratchpad, so the rules are placed before "Begin!"
# and the prompt still ends where the model has to continue. If the marker is
# missing, fall back to appending at the end.
marker = "Begin!"
if marker in base_prompt.template:
    head, tail = base_prompt.template.split(marker, 1)
    strict_template = f"{head.rstrip()}\n{rules}\n\n{marker}{tail}"
else:
    strict_template = base_prompt.template + rules

strict_prompt = PromptTemplate(
    input_variables=base_prompt.input_variables,
    template=strict_template,
)

# setup retrieval tool
# NOTE(review): this retriever-backed tool is not referenced again in this
# notebook — the agent and the executors are given `pdf_search_tool`, which is
# registered under the same "pdf_search" name.
retriever_tool = create_retriever_tool(
    # default knn search
    retriever = db.as_retriever(),
    name = "pdf_search",
    description = (
        "Searches the ingested PDF knowledge base and returns relevant passages with metadata such as source file and page number."
    ),
)

def pdf_search_string(query: str, k: int = 3) -> str:
    """Search the vector store and render the top `k` hits as one string.

    Each hit becomes a "[SOURCE: <file> | page <n>]" header followed by the first
    500 characters of the stripped chunk text and a trailing "...". Hits are
    separated by a lone "-" line. Returns "NO_MATCH" when nothing is found.
    """
    hits = db.similarity_search(query, k=k)
    if not hits:
        return "NO_MATCH"

    def _render(hit) -> str:
        meta = hit.metadata
        # keep only the file name of the stored source path
        filename = meta.get("source", "unknown").split("/")[-1]
        # prefer the printed page label, otherwise the page entry
        page_no = meta.get("page_label") or meta.get("page")
        excerpt = hit.page_content.strip()[:500]
        return f"[SOURCE: {filename} | page {page_no}]\n{excerpt}..."

    return "\n\n-\n\n".join(_render(hit) for hit in hits)

# define the search tool
# Wraps pdf_search_string (defined above) under the tool name "pdf_search".
pdf_search_tool = Tool(
    name="pdf_search",
    func=pdf_search_string,
    description=(
        "Searches the ingested PDF knowledge base and returns up to 3 passages "
        "that mention the query, each prefixed with source file and page."
    ),
)

# create our react agent
# Uses the stop-trimming `llm`, the stricter prompt, and QwenReActParser as the
# output parser.
agent = create_react_agent(
    llm,
    [pdf_search_tool],
    prompt=strict_prompt,
    output_parser=QwenReActParser()
)

# create an agent executor that orchestrates the prompting + tool calling pipeline
agent_exec = AgentExecutor(
    agent=agent, 
    tools=[pdf_search_tool],
    verbose=False,
    # parsing errors are answered with fix_format's reminder text
    handle_parsing_errors=fix_format,
    # results also carry the intermediate steps, not only "output"
    return_intermediate_steps=True
)

### Prompt the agent

In [None]:
# Exercise the agent's retrieval tool end to end with a single question.
# Alternative question kept for reference:
# question = "Give me a list of everyone on the thesis committee from the paper titled 'COLLISION INDUCED SELF ORGANIZATION IN SHAPE CHANGING ROBOTS'."
question = "Give me a list of everyone who approved the paper whose title goes as 'Development of Novel Platforms for Homogeneous and Heterogeneous Catalysis'"

# Run the executor once and show only the final answer.
result = agent_exec.invoke({"input": question})
print(result["output"])

### Conversing with our agent as a chatbot with memory

<u>Relevant documentation</u>:

* Conversation with memory

1. https://api.python.langchain.com/en/latest/memory/langchain.memory.buffer.ConversationBufferMemory.html
2. https://python.langchain.com/docs/versions/migrating_memory/conversation_buffer_memory/
3. https://medium.com/%40danushidk507/memory-in-langchain-iii-f0a226f5eb65

* Text coloring w/ANSI

1. https://jakob-bagterp.github.io/colorist-for-python/ansi-escape-codes/standard-16-colors/
2. https://python-reference.readthedocs.io/en/latest/docs/file/flush.html


In [None]:
from langchain.memory import ConversationBufferMemory
import sys, time
from IPython.display import display, Markdown, clear_output

# Silence transformers advisory messages such as "You seem to be using the
# pipelines sequentially on GPU. In order to maximize efficiency please use a dataset"
os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "1"

In [None]:
# Explanations are written as `#` comments: a bare string literal placed between
# keyword arguments is a syntax error, not a comment.
memory = ConversationBufferMemory(
    # The key name under which the memory object will return the stored
    # conversation when a chain/agent loads memory variables
    memory_key="chat_history",

    # Tells the memory which field in the chain's outputs contains the model's
    # reply that should be written back into memory after each run.
    # For instance the chain could store the model's outputs under "Answer" or "Result"
    output_key="output",

    # Controls the format in which past conversation is returned
    # True: You get a list of structured HumanMessage / AIMessage objects (recommended for chat models)
    # False: You get a single concatenated string buffer, which can behave unpredictably with chat-oriented LLMs
    return_messages=True,
)

# NOTE(review): `agent` is built on strict_prompt, which is derived from the hub
# ReAct template; that template presumably has no {chat_history} placeholder, in
# which case the stored history never reaches the model — confirm.
chatbot = AgentExecutor(
    agent=agent,
    tools=[pdf_search_tool],
    memory=memory,
    verbose=False,
    # same recovery hook as agent_exec, so one malformed completion does not
    # abort the whole chat session
    handle_parsing_errors=fix_format,
    return_intermediate_steps=False,
)


In [None]:
# ANSI escape codes for coloring terminal text
BLUE   = "\033[94m"
GREEN  = "\033[92m"
DEF  = "\033[0m"

# Interactive chat: type "exit" or "quit" (any case, surrounding spaces ignored)
# to leave the loop.
while True:
    try:
        user_msg = input(f"{BLUE}You:{DEF} ").strip()
    except (EOFError, KeyboardInterrupt):
        # closed input stream or an interrupted kernel: end the chat without a traceback
        break

    if user_msg.lower() in {"exit", "quit"}:
        break
    if not user_msg:
        # nothing typed: ask again instead of sending an empty question to the agent
        continue

    # show the user’s message right away
    print(f"{BLUE}You:{DEF} {user_msg}", flush=True)

    response = chatbot.invoke({"input": user_msg})
    agent_reply = response["output"]

    print(f"{GREEN}Agent:{DEF} {agent_reply}\n", flush=True)

    # Give Jupyter a moment to push the buffer to the UI, so the reply shows up
    # before the next input prompt is drawn.
    sys.stdout.flush()
    time.sleep(0.05)
