In [None]:
# %pip install -U langchain langchain_community langchain-google-genai faiss-cpu sentence-transformers pypdf python-dotenv
# %pip install numexpr
# %pip install -U duckduckgo-search

# Pip install

In [None]:
# %pip install -U langchain langchain_community langchain-google-genai faiss-cpu sentence-transformers pypdf python-dotenv
# %pip install numexpr
# %pip install -U duckduckgo-search

# Load Libraries

In [18]:
import os
import glob

from pprint import pprint
from dotenv import load_dotenv
from typing import List

from sentence_transformers import SentenceTransformer

from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain_community.tools import DuckDuckGoSearchRun
from langchain.tools import tool
from langchain.agents import load_tools

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import AgentExecutor, create_react_agent, initialize_agent, AgentType
from langchain_core.documents import Document
from langchain.memory import ConversationBufferMemory

# Load variables from a .env file, if one is found, into the process environment
# (GOOGLE_API_KEY is read from os.environ in the config cell).
load_dotenv()

True

# Main

In [19]:
# 0) Config
# Filesystem locations: the source documents and the persisted FAISS index.
DOCS_DIR = "./docs"
INDEX_DIR = "./faiss_index"

# The Gemini key is taken from the environment; stop early when it is missing.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
assert GOOGLE_API_KEY, "Set GOOGLE_API_KEY env var"

In [20]:
# 1) Load & chunk documents
def load_docs(path: str) -> "List[Document]":
    """Load every supported document found under ``path`` (searched recursively).

    Supported files are ``.pdf`` (via ``PyPDFLoader``) and ``.txt`` / ``.md``
    (via ``TextLoader`` with UTF-8 encoding); the extension match is
    case-insensitive. Anything else is skipped, including directories whose
    name happens to end in a supported extension.

    Args:
        path: Root directory to search.

    Returns:
        The loaded documents, in sorted path order. The list is empty when
        ``path`` does not exist or contains no supported files.
    """
    docs = []
    # Sort so the document (and therefore chunk) order is reproducible across runs.
    for f in sorted(glob.glob(os.path.join(path, "**/*"), recursive=True)):
        # "**/*" also yields directories; only regular files can be loaded.
        if not os.path.isfile(f):
            continue
        lowered = f.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(f)
        elif lowered.endswith((".txt", ".md")):
            loader = TextLoader(f, encoding="utf-8")
        else:
            continue
        # Report only the files that are actually ingested.
        print(f)
        docs.extend(loader.load())
    return docs

raw_docs = load_docs(DOCS_DIR)
if not raw_docs:
    # Report the configured directory rather than a hard-coded path.
    print(
        f"No docs found in {DOCS_DIR}. Nothing new can be indexed, so RAG can only "
        f"answer from an index saved earlier in {INDEX_DIR}."
    )

# Chunking parameters passed to the splitter; tune them here.
CHUNK_SIZE = 1200
CHUNK_OVERLAP = 150

splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = splitter.split_documents(raw_docs)

./docs\1706.03762v7.pdf
./docs\2106.09685v2.pdf
./docs\2501.12948v1.pdf


In [21]:
# Show a count and a short preview instead of dumping every chunk into the output.
print(f"{len(chunks)} chunks")
chunks[:3]

[Document(metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': './docs\\1706.03762v7.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Bra

In [22]:
# 2) Embeddings + VectorStore (persisted FAISS)
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# A single embedding wrapper is enough. The separate SentenceTransformer instance
# that used to be created here was never used and loaded the same model twice.
emb = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# save_local() writes "index.faiss" / "index.pkl" by default; check for the index
# file itself so that unrelated files in INDEX_DIR are not mistaken for an index.
index_file = os.path.join(INDEX_DIR, "index.faiss")

if os.path.isfile(index_file):
    # A saved index is reused as-is: delete INDEX_DIR after changing the files in
    # DOCS_DIR to force a rebuild.
    # NOTE(security): load_local unpickles the stored docstore, hence the explicit
    # opt-in flag. Only load index folders that this notebook created.
    vs = FAISS.load_local(INDEX_DIR, emb, allow_dangerous_deserialization=True)
elif chunks:
    vs = FAISS.from_documents(chunks, emb)
    vs.save_local(INDEX_DIR)
else:
    # Building an index from zero documents fails with an obscure error deep in
    # the vector store, so fail here with an actionable message instead.
    raise ValueError(
        f"No saved index in {INDEX_DIR} and no document chunks to index; "
        f"add .pdf/.txt/.md files to {DOCS_DIR} and re-run."
    )

retriever = vs.as_retriever(search_kwargs={"k": 4})

In [23]:
# 3) Gemini chat model plus the stock ReAct prompt
llm = ChatGoogleGenerativeAI(
    temperature=0.3,
    model="gemini-2.5-pro",
)
# Standard ReAct prompt template pulled from the hub.
# NOTE(review): the initialize_agent(...) call used to build the executor does not
# take this prompt; it is only needed for a create_react_agent-style setup.
prompt = hub.pull("hwchase17/react")



In [24]:
# 4) Tools exposed to the agent

# Web search through DuckDuckGo
search_tool = DuckDuckGoSearchRun()

# Built-in math calculator: load_tools returns a list, of which the first entry is used
calculator_tool = load_tools(["llm-math"], llm=llm)[0]

# Retrieval over the local vector store (the RAG tool)
rag_tool = create_retriever_tool(
    retriever=retriever,
    name="local_corpus_search",
    description="Search the local knowledge base built from files in ./docs.",
)

tools = [rag_tool, calculator_tool, search_tool]

In [25]:
# 5) Agent
# Conversation memory shared by every call to the executor. `output_key` names
# the result field to store in the history, so the memory stays unambiguous
# when the executor returns more than one key (output + intermediate_steps).
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
    output_key="output",
)

# return_intermediate_steps is an executor-level setting: it has to be given
# here, because passing it as a keyword to executor.invoke(...) has no effect.
agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    verbose=True,
    memory=memory,
    handle_parsing_errors=True,
    return_intermediate_steps=True,
)
# initialize_agent already returns an executor; keep the `executor` name that
# the query cells call.
executor = agent

In [26]:
# # 5) Simple CLI loop
# SYSTEM_HINT = (
#     "You are an agent with a local RAG tool. "
#     "When questions may be answered from the local corpus, ALWAYS call the tool. "
#     "If the tool returns context, synthesize a concise answer and cite snippets."
# )

# print("Gemini + LangChain RAG Agent. Type 'exit' to quit.")
# while True:
#     q = input("\nUser: ").strip()
#     if q.lower() in {"exit", "quit"}:
#         break
#     try:
#         out = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {q}"})
#         print("\nAssistant:", out["output"])
#     except Exception as e:
#         print("Error:", e)

In [34]:
# 6) Instructions prepended to every question sent to the agent.
# The pieces below are adjacent string literals, which Python joins with no
# separator, so each one must carry its own trailing space or newline.
SYSTEM_HINT = (
    "You are an AI agent with access to three tools: "
    "1) Local RAG search for answering from a local document corpus. "
    "2) Math calculator for numerical computations. "
    "3) Web search for retrieving up-to-date online information. "
    "Use the most relevant tool depending on the query. "
    "If local RAG provides context, synthesize a concise answer and cite relevant snippets. "
    "If the answer is not in the local corpus, search the web. "
    "If a calculation is needed, use the calculator. "
    "When answering, include the tool you used in the format:\n"
    "Tool used: <tool_name>\n"
    "Answer: <answer>"
)

# SYSTEM_HINT = """
# You are a ReAct agent. Always follow this format:

# Thought: reasoning here
# Action: tool_name
# Action Input: input for the tool
# ... (repeat as needed)
# Observation: result of action
# Thought: more reasoning
# Final Answer: the answer to the user

# Never omit 'Action:' when using a tool.
# """

In [35]:
query = "Summarize the main points from 'attention is all you need' paper."
# Options such as return_intermediate_steps are executor settings chosen when the
# executor is built; as an extra keyword to invoke() it was silently ignored.
result = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {query}"})
print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "local_corpus_search",
    "action_input": "attention is all you need"
}
```[0m
Observation: [36;1m[1;3mto investigate local, restricted attention mechanisms to efficiently handle large inputs and outputs
such as images, audio and video. Making generation less sequential is another research goals of ours.
The code we used to train and evaluate our models is available at https://github.com/
tensorflow/tensor2tensor.
Acknowledgements We are grateful to Nal Kalchbrenner and Stephan Gouws for their fruitful
comments, corrections and inspiration.
References
[1] Jimmy Lei Ba, Jamie Ryan Kiros, and Geoffrey E Hinton. Layer normalization. arXiv preprint
arXiv:1607.06450, 2016.
[2] Dzmitry Bahdanau, Kyunghyun Cho, and Yoshua Bengio. Neural machine translation by jointly
learning to align and translate. CoRR, abs/1409.0473, 2014.
[3] Denny Britz, Anna Goldie, Minh-Thang Luong, and Quoc V . Le. Massive explora

In [37]:
# Pretty-print the whole dict returned by the most recent executor.invoke(...) call.
pprint(result)

{'chat_history': [HumanMessage(content="You are an AI agent with access to three tools: 1) Local RAG search for answering from a local document corpus. 2) Math calculator for numerical computations. 3) Web search for retrieving up-to-date online information. Use the most relevant tool depending on the query. If local RAG provides context, synthesize a concise answer and cite relevant snippets. If the answer is not in the local corpus, search the web. If a calculation is needed, use the calculator.\n\nQuestion: Summarize the main points from 'attention is all you need' paper.", additional_kwargs={}, response_metadata={}),
                  AIMessage(content='Based on the provided document snippets, the paper presents a model centered around an attention mechanism. Here are the main points:\n\n1.  **Attention Function**: The core of the model is an attention function described as "mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all

In [38]:
query = "Who are the authors of the lora paper?."
# Options such as return_intermediate_steps are executor settings chosen when the
# executor is built; as an extra keyword to invoke() it was silently ignored.
result = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {query}"})
print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3m```json
{
    "action": "duckduckgo_search",
    "action_input": "authors of LoRA paper"
}
```[0m

  with DDGS() as ddgs:



Observation: [38;5;200m[1;3mO WhatsApp Web pode apresentar alguns erros de conectividade com o aplicativo para celular, e, assim, apresentar lentidão ao carregar as mensagens. A primeira sugestão que damos é … Se o problema ocorrer apenas na versão web do WhatsApp, entre em contato com o suporte do WhatsApp para obter assistência. Em relação a esse problema, você pode primeiro tentar … Galera, como usar o WhatsApp Web no PC sem o QR Code ou sem câmera? Meu celular quebrou e não liga mais. Como não consigo ligar, não tenho como pegar o código. Desde que migrei do chrome para o microsoft edge novo não recebo notificações na area de trabalho de todos os sites, básicamente só do whatsapp web. Já chequei se estão ativas e … May 27, 2025 · Outline · [ Standard ] · Linear+ Saifuddin's Whatsapp kena hack 107.5k views ... « Next Oldest · Kopitiam · Next Newest »[0m
Thought:[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting

  with DDGS() as ddgs:



Observation: [38;5;200m[1;3mWelcome to the first chapter of The Things Fundamentals on LoRaWAN. In this section, you’ll learn why LoRaWAN is so awesome, hear about some great LoRaWAN use cases, and learn the difference between LoRa and LoRaWAN. LoRa LoRa is a wireless modulation technique derived from Chirp Spread Spectrum (CSS) technology. It encodes information on radio waves using … The LoRa Alliance’s unwavering commitment to advancing LPWAN technology has empowered us to deliver state-of-the-art, long-range, and low-power IoT solutions that are redefining industries and driving sustainability. May 9, 2019 · LoRaWAN LoRaWAN is a high capacity, Long Range, open, Low Power Wide Area Network (LPWAN) standard designed for LoRa Powered IoT Solutions by the LoRa Alliance. It is a bi-directional protocol which takes full advantage of all the features of the LoRa technology to deliver services including reliable message delivery, end to end security, location and multicast … LoRa vs LoR

  with DDGS() as ddgs:



Observation: [38;5;200m[1;3mOct 5, 2024 · In this work, we propose a higher-order Candecomp/Parafac (CP) decomposition, enabling a more compact and flexible representation compared to existing matrix and tensor … Jan 22, 2025 · Summary of Paper’s Contributions: This paper provides a theoretical analysis of low-rank adaptation techniques (LoRA and GaLore) for large language model (LLM) training … May 1, 2025 · We identify significant progress, clarify current methodological challenges, and propose future research directions, providing a concrete and actionable guide for enhancing … Jul 17, 2025 · In recent years, we have seen models like Large Language Models (LLMs) such as GPT, LLaMA, and Falcon have shown incredible capabilities in writing human-like text, … Aug 5, 2025 · Lora: Low-rank adaptation of large language models, E. J. Hu, Y. Shen, P. Wallis, Z. Allen-Zhu, Y. Li, S. Wang, L. Wang, and W. Chen. 2...[0m
Thought:

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 58
}
].
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 56
}
].


[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 48
}
].


[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Based on information found online, the authors of the paper 'LoRA: Low-Rank Adaptation of Large Language Models' are: E. J. Hu, Y. Shen, P. Wallis, Z. Allen-Zhu, Y. Li, S. Wang, L. Wang, and W. Chen."
}
```[0m

[1m> Finished chain.[0m
Based on information found online, the authors of the paper 'LoRA: Low-Rank Adaptation of Large Language Models' are: E. J. Hu, Y. Shen, P. Wallis, Z. Allen-Zhu, Y. Li, S. Wang, L. Wang, and W. Chen.


In [39]:
query = "What is deepseek about?"
# Options such as return_intermediate_steps are executor settings chosen when the
# executor is built; as an extra keyword to invoke() it was silently ignored.
result = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {query}"})
print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3m```json
{
    "action": "local_corpus_search",
    "action_input": "deepseek"
}
```[0m
Observation: [36;1m[1;3m32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community.
1.2. Summary of Evaluation Results
• Reasoning tasks: (1) DeepSeek-R1 achieves a score of 79.8% Pass@1 on AIME 2024, slightly
surpassing OpenAI-o1-1217. On MATH-500, it attains an impressive score of 97.3%,
performing on par with OpenAI-o1-1217 and significantly outperforming other models. (2)
On coding-related tasks, DeepSeek-R1 demonstrates expert level in code competition tasks,
as it achieves 2,029 Elo rating on Codeforces outperforming 96.3% human participants in
the competition. For engineering-related tasks, DeepSeek-R1 perform

In [40]:
query = "What is LORA?"
# Options such as return_intermediate_steps are executor settings chosen when the
# executor is built; as an extra keyword to invoke() it was silently ignored.
result = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {query}"})
print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "local_corpus_search",
    "action_input": "LORA"
}
```[0m
Observation: [36;1m[1;3mzero for B, so ∆W = BAis zero at the beginning of training. We then scale ∆Wx by α
r, where α
is a constant in r. When optimizing with Adam, tuning αis roughly the same as tuning the learning
rate if we scale the initialization appropriately. As a result, we simply set α to the ﬁrst r we try
and do not tune it. This scaling helps to reduce the need to retune hyperparameters when we vary
r(Yang & Hu, 2021).
A Generalization of Full Fine-tuning. A more general form of ﬁne-tuning allows the training of
a subset of the pre-trained parameters. LoRA takes a step further and does not require the accumu-
lated gradient update to weight matrices to have full-rank during adaptation. This means that when
applying LoRA to all weight matrices and training all biases 2, we roughly recover the expressive-
ness of full ﬁne-tuning by 

In [44]:
query = "Search the web for the latest news about Claude"
# Options such as return_intermediate_steps are executor settings chosen when the
# executor is built; as an extra keyword to invoke() it was silently ignored.
result = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {query}"})
print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3m```json
{
    "action": "duckduckgo_search",
    "action_input": "latest news Anthropic Claude"
}
```[0m

  with DDGS() as ddgs:



Observation: [38;5;200m[1;3mAug 5, 2025 · GitHub notes that Claude Opus 4.1 improves across most capabilities relative to Opus 4, with particularly notable performance gains in multi-file code refactoring. Rakuten …The upgraded Claude 3.5 Sonnet is now available for all users. Starting today, developers can build with the computer use beta on the Anthropic … Aug 5, 2025 · Anthropic’s latest model is out today for Claude customers and through Claude Code, through Anthropic’s API, Amazon Bedrock, and Google Cloud’s Vertex AI. Aug 5, 2025 · Anthropic's Claude Opus 4.1 achieves 74.5% on coding benchmarks, leading the AI market, but faces risk as nearly half its $3.1B API revenue depends on just two customers. 2 days ago · Claude Sonnet 4 now supports up to 1 million tokens of context on the Anthropic API—a 5x increase that lets you process entire codebases with over 75,000 lines of code or … Jul 15, 2025 · Today, we're introducing a comprehensive solution for financial analysis that tra

In [42]:
query = "What is 10+10?"
# Options such as return_intermediate_steps are executor settings chosen when the
# executor is built; as an extra keyword to invoke() it was silently ignored.
result = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {query}"})
print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3m```json
{
    "action": "Calculator",
    "action_input": "10+10"
}
```[0m
Observation: [33;1m[1;3mAnswer: 20[0m
Thought:[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: 
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/OUTPUT_PARSING_FAILURE [0m
Observation: Invalid or incomplete res

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised InternalServerError: 500 An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting.
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised InternalServerError: 500 An internal error has occurred. Please retry or report in https://developers.generativeai.google/guide/troubleshooting.


[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Tool used: Calculator\nAnswer: 20"
}
```[0m

[1m> Finished chain.[0m
Tool used: Calculator
Answer: 20


In [43]:
query = "What is e^10*pi?"
# Options such as return_intermediate_steps are executor settings chosen when the
# executor is built; as an extra keyword to invoke() it was silently ignored.
result = executor.invoke({"input": f"{SYSTEM_HINT}\n\nQuestion: {query}"})
print(result["output"])



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Calculator",
    "action_input": "math.exp(10) * math.pi"
}
```[0m
Observation: [33;1m[1;3mAnswer: 69198.18312551164[0m
Thought:

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 48
}
].


[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Tool used: Calculator\nAnswer: 69198.18312551164"
}
```[0m

[1m> Finished chain.[0m
Tool used: Calculator
Answer: 69198.18312551164
