In [1]:
#!/usr/bin/env python3
"""
Finance Research Copilot — Agentic AI (LangChain + LangGraph)
=============================================================

What this is
------------
An intermediate-complexity **agentic AI application** that orchestrates planning, tool-use, retrieval,
analysis, and reporting for an equity/sector research task in **finance** using **LangChain** and **LangGraph**.

It demonstrates:
- A **planner → router → tool-use → retrieval → analysis → report** loop.
- LangGraph **state machine** with typed state and conditional edges.
- Tool calling (calculator, simple web search, CSV/PDF loaders) via LangChain Tools.
- Local **vector store (Chroma)** with OpenAI embeddings (configurable provider) for RAG.
- A **guardrail** self-check pass and a **memory** summary store (JSONL) for continuity.

Run it
------
1) Create a virtualenv and install deps (see `requirements` string near bottom for a ready list):

   ```bash
   python -m venv .venv && source .venv/bin/activate
   pip install -U pip
   pip install -r requirements.txt
   ```

2) Export your LLM/Embeddings key(s):

   ```bash
   export OPENAI_API_KEY=sk-...  # or set in your shell profile
   ```

   (You can swap providers in code if you prefer — see notes in `make_llm()` and `make_embeddings()`.)

3) Run the app:

   ```bash
   python agent_finance_copilot.py --query "Analyze TCS Q1 FY26 results vs Infosys; compute YoY growth and give risks/opportunities"
   ```

4) Optional: first run will auto-create a small demo dataset under `./data/` (CSV + MD). You can drop your
   own PDFs/CSVs/MD into `./data/` and re-run to enrich retrieval.

Outputs
-------
- A **final research brief** printed to stdout.
- A **scratch/plan** section and **tool traces** in the logs.
- `./memory/session_memory.jsonl` with rolling summaries.

Note
----
This is a single-file demo for clarity. In production, you’d split into modules and add structured logging,
robust evals, and proper error handling.
"""



In [2]:
from __future__ import annotations

import argparse
import json
import math
import os
import random
import re
import sys
import textwrap
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, TypedDict

In [3]:
!pip install langgraph
!pip install langchain
!pip install langchain_community
!pip install langchain_core
!pip install langchain_text_splitters
!pip install duckduckgo_search

Collecting langgraph
  Downloading langgraph-0.6.7-py3-none-any.whl.metadata (6.8 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.1.0 (from langgraph)
  Downloading langgraph_checkpoint-2.1.1-py3-none-any.whl.metadata (4.2 kB)
Collecting langgraph-prebuilt<0.7.0,>=0.6.0 (from langgraph)
  Downloading langgraph_prebuilt-0.6.4-py3-none-any.whl.metadata (4.5 kB)
Collecting langgraph-sdk<0.3.0,>=0.2.2 (from langgraph)
  Downloading langgraph_sdk-0.2.9-py3-none-any.whl.metadata (1.5 kB)
Collecting ormsgpack>=1.10.0 (from langgraph-checkpoint<3.0.0,>=2.1.0->langgraph)
  Downloading ormsgpack-1.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Downloading langgraph-0.6.7-py3-none-any.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading langgraph_chec

In [4]:
!pip install langchain_openai

Collecting langchain_openai
  Downloading langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
Downloading langchain_openai-0.3.33-py3-none-any.whl (74 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.0/75.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: langchain_openai
Successfully installed langchain_openai-0.3.33


In [5]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.9 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.37.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pypika>=0.48.9 (from chromadb)
  Downloading PyPika-0.48.9.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?

In [15]:
from google.colab import userdata
import os

# Retrieve the API key from Colab secrets
openai_api_key = userdata.get('OPENAI_API_KEY')

# Set the environment variable
os.environ['OPENAI_API_KEY'] = openai_api_key

In [6]:
# --- LangChain / LangGraph imports ---
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage, ToolMessage
from langchain_core.runnables import RunnableLambda
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
from langgraph.prebuilt import ToolNode

# LLMs / Embeddings providers
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Vector store (Chroma)
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Optional utility
from duckduckgo_search import DDGS  # lightweight search (no API key)

In [7]:
# -----------------------
# Config & Utilities
# -----------------------
import os
import random # Import random
from pathlib import Path

ROOT = Path(os.getcwd())
DATA_DIR = ROOT / "data"
DB_DIR = ROOT / "chroma_db"
MEM_DIR = ROOT / "memory"
MEM_DIR.mkdir(parents=True, exist_ok=True)
MEM_FILE = MEM_DIR / "session_memory.jsonl"

RANDOM_SEED = 37
random.seed(RANDOM_SEED)


def hrule(title: str = "") -> str:
    line = "\n" + ("=" * 80)
    return f"{line}\n{title}\n{line}\n"

In [8]:
# -----------------------
# LLMs & Embeddings
# -----------------------

def make_llm(model: str = "gpt-4o-mini", temperature: float = 0.2):
    """Return an LLM. Swap provider here if needed.
    - For OpenAI: set OPENAI_API_KEY in environment.
    - You could adapt this to Azure OpenAI (langchain-openai supports it) or Anthropic (use langchain-anthropic).
    """
    return ChatOpenAI(model=model, temperature=temperature)


def make_embeddings(model: str = "text-embedding-3-large"):
    return OpenAIEmbeddings(model=model)


In [16]:
# -----------------------
# Demo Data Bootstrap
# -----------------------

def bootstrap_demo_corpus() -> None:
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    # Simple CSV with quarterly numbers (illustrative synthetic values)
    csv_path = DATA_DIR / "it_services_q_results.csv"
    if not csv_path.exists():
        csv_path.write_text(
            """company,quarter,fy,rev_inr_cr,profit_inr_cr,yoy_rev_growth_pct
TCS,Q1,26,64000,12800,7.5
Infosys,Q1,26,38000,7200,5.2
HCLTech,Q1,26,28000,4200,6.1
Wipro,Q1,26,22000,3100,3.9
"""
        )

    # Short markdown notes to be retrievable
    md_path = DATA_DIR / "sector_notes.md"
    if not md_path.exists():
        md_path.write_text(
            textwrap.dedent(
                """
                # India IT Services Sector — Quick Notes (FY26 Q1)

                - Demand steady in BFSI and healthcare; telecom muted.
                - Cost optimization continues; vendor consolidation favors top-3 players.
                - GenAI pilots moving to production in customer support and code modernization.
                - Currency tailwinds mixed; cross-currency impact ~(-0.4%) for Q1.
                - Risks: prolonged US slowdown, pricing pressure, large deal ramp-downs.
                - Opportunities: cloud modernization, vendor consolidation, GenAI productivity deals.
                """
            ).strip()
        )

    # Tiny PDF (single-page) with a mock excerpt
    pdf_path = DATA_DIR / "mock_investor_update.pdf"
    if not pdf_path.exists():
        # Generate a simple PDF from text (fallback as .txt if PyPDF2 not present) — but we'll ship as a text stub
        # so loader can still demonstrate. For reliability, just create a text that we load via TextLoader.
        (DATA_DIR / "mock_investor_update.txt").write_text(
            "Investor Update: Tier-1 IT firms report stable margins; deal pipeline healthy; GenAI backlog building."
        )



In [9]:
# -----------------------
# Document Ingestion & Vector Store
# -----------------------
from typing import List, Any # Import List and Any

def load_documents() -> List[Any]:
    docs: List[Any] = []
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=120)
    for path in DATA_DIR.glob("**/*"):
        if path.is_dir():
            continue
        try:
            if path.suffix.lower() in {".md", ".txt"}:
                loader = TextLoader(str(path))
                docs.extend(splitter.split_documents(loader.load()))
            elif path.suffix.lower() == ".csv":
                loader = CSVLoader(str(path))
                docs.extend(splitter.split_documents(loader.load()))
            elif path.suffix.lower() == ".pdf":
                # If you add a real PDF, this loader will work
                loader = PyPDFLoader(str(path))
                docs.extend(splitter.split_documents(loader.load()))
        except Exception as e:
            print(f"[WARN] Skipping {path.name}: {e}")
    return docs


def build_or_load_vectorstore(emb_model: str = "text-embedding-3-large") -> Chroma:
    embeddings = make_embeddings(model=emb_model)
    if DB_DIR.exists() and any(DB_DIR.iterdir()):
        db = Chroma(collection_name="finance_copilot", persist_directory=str(DB_DIR), embedding_function=embeddings)
    else:
        docs = load_documents()
        db = Chroma.from_documents(
            docs,
            embeddings,
            collection_name="finance_copilot",
            persist_directory=str(DB_DIR),
        )
    return db

In [10]:
# -----------------------
# LangChain Tools
# -----------------------

@tool("calc")
def calc(expression: str) -> str:
    """Safely evaluate a simple math expression. Supports +,-,*,/,**,(), and decimals.
    Example: "(64000-38000)/38000".
    """
    # Basic safety: restrict to allowed characters
    if not re.fullmatch(r"[0-9+\-*/(). %**]+", expression.replace(" ", "")):
        return "Error: unsupported characters in expression."
    try:
        # Evaluate with restricted globals
        value = eval(expression, {"__builtins__": {}}, {"math": math})
        return str(value)
    except Exception as e:
        return f"Error: {e}"


@tool("ddg_news")
def ddg_news(query: str, max_results: int = 5) -> str:
    """Lightweight web search via DuckDuckGo for recent news headlines/snippets.
    Returns a JSON string list of {title, href, snippet}.
    """
    results: List[Dict[str, str]] = []
    try:
        with DDGS() as ddgs:
            for r in ddgs.news(query, max_results=max_results):
                results.append({"title": r.get("title", ""), "href": r.get("url", ""), "snippet": r.get("body", "")})
    except Exception as e:
        results.append({"title": "(search error)", "href": "", "snippet": str(e)})
    return json.dumps(results, ensure_ascii=False)


@tool("tabular_lookup")
def tabular_lookup(company: str, quarter: str = "Q1", fy: str = "26") -> str:
    """Look up synthetic quarterly metrics from the demo CSV. Returns a JSON dict with fields
    {company, quarter, fy, rev_inr_cr, profit_inr_cr, yoy_rev_growth_pct} if found, else {}.
    """
    csv_path = DATA_DIR / "it_services_q_results.csv"
    if not csv_path.exists():
        return json.dumps({})
    rows = [line.strip().split(",") for line in csv_path.read_text().splitlines()]
    header = rows[0]
    for r in rows[1:]:
        row = dict(zip(header, r))
        if row["company"].lower() == company.lower() and row["quarter"].upper() == quarter.upper() and row["fy"] == fy:
            return json.dumps(row)
    return json.dumps({})


TOOLS = [calc, ddg_news, tabular_lookup]
TOOL_NODE = ToolNode(tools=TOOLS)



In [11]:
# -----------------------
# LangGraph State
# -----------------------
from typing import List, Any, TypedDict # Import TypedDict

class GraphState(TypedDict):
    query: str
    plan: str
    context_snippets: List[str]
    tool_calls: List[str]
    analysis: str
    report: str
    guard_feedback: str


# -----------------------
# Nodes
# -----------------------

def node_planner(state: GraphState) -> GraphState:
    llm = make_llm()
    sys_prompt = (
        "You are a senior equity research analyst. Break the user query into a short numbered plan: "
        "1) clarify intent (if needed), 2) data to fetch (tools), 3) retrieval queries, 4) calculations, "
        "5) risks/opportunities, 6) output format with headings. Keep it under 120 words."
    )
    msgs = [SystemMessage(content=sys_prompt), HumanMessage(content=state["query"])]
    out = llm.invoke(msgs)
    return {**state, "plan": out.content}


def node_router(state: GraphState) -> GraphState:
    # A tiny heuristic router: decide whether we need tools or retrieval based on keywords
    q = state["query"].lower()
    needs_news = any(k in q for k in ["news", "latest", "today", "headline"]) or "vs" in q
    needs_calc = any(k in q for k in ["growth", "cagr", "difference", "%", "yoy", "compute", "calculate"])
    needs_tabular = any(k in q for k in ["revenue", "profit", "q1", "fy26", "results", "numbers"]) or "vs" in q

    tool_calls = []
    if needs_news:
        tool_calls.append("ddg_news")
    if needs_calc:
        tool_calls.append("calc")
    if needs_tabular:
        tool_calls.append("tabular_lookup")

    return {**state, "tool_calls": tool_calls}


def node_retrieve(state: GraphState) -> GraphState:
    db = build_or_load_vectorstore()
    retriever = db.as_retriever(search_kwargs={"k": 4})
    # Build a synthetic retrieval query from the plan + original query
    q = f"{state['query']}\nContext needed: sector risks, demand trends, and genAI themes from notes"
    docs = retriever.get_relevant_documents(q)
    snippets = [f"[{d.metadata.get('source','')}] {d.page_content[:400]}" for d in docs]
    return {**state, "context_snippets": snippets}


def node_tool_use(state: GraphState) -> GraphState:
    # Let the LLM decide how to call tools using function-calling, given our Tools list.
    llm = make_llm()
    llm_with_tools = llm.bind_tools(TOOLS)

    tool_context = (
        "Available tools: calc(expression), ddg_news(query, max_results=5), tabular_lookup(company, quarter, fy).\n"
        "When comparing companies (e.g., 'A vs B'), call tabular_lookup for each to fetch numbers, then calc for ratios."
    )

    msgs = [
        SystemMessage(content=tool_context + " Return concise JSON per tool call, then a 2-3 line interim note."),
        HumanMessage(content=f"User query: {state['query']}\nPlan: {state['plan']}")
    ]

    # First pass: ask model if it wants to call tools
    first = llm_with_tools.invoke(msgs)
    tool_traces: List[str] = []
    tool_msgs: List[Any] = []

    if hasattr(first, "tool_calls") and first.tool_calls:
        for tc in first.tool_calls:
            name = tc["name"]
            args = tc.get("args", {})
            # Execute tool
            result = None
            if name == "calc":
                result = calc.invoke(args)
            elif name == "ddg_news":
                result = ddg_news.invoke(args)
            elif name == "tabular_lookup":
                result = tabular_lookup.invoke(args)
            tool_traces.append(f"TOOL {name}({args}) -> {result[:240]}...")
            tool_msgs.append(ToolMessage(tool_call_id=tc["id"], name=name, content=str(result)))

    # Second pass: give tool outputs back to model
    follow = llm.invoke(msgs + [first] + tool_msgs)
    interim = follow.content

    return {**state, "analysis": interim, "tool_calls": state["tool_calls"] + tool_traces}


def node_analyze_and_write(state: GraphState) -> GraphState:
    llm = make_llm(temperature=0.2)
    sys_prompt = (
        "Compose a crisp equity research brief with sections: SUMMARY, KEY METRICS, DRIVERS, RISKS, OPPORTUNITIES, "
        "ACTIONABLE INSIGHTS. Use bullet points, cite any numbers you computed, and keep to ~300-450 words."
    )
    ctx = "\n\n".join(state.get("context_snippets", []))
    tool_log = "\n".join(state.get("tool_calls", []))

    msgs = [
        SystemMessage(content=sys_prompt),
        HumanMessage(
            content=f"User query: {state['query']}\n\nContext from RAG:\n{ctx}\n\nTool trace:\n{tool_log}\n\nDo the final brief now."
        ),
    ]
    out = llm.invoke(msgs)
    return {**state, "report": out.content}


def node_guardrail(state: GraphState) -> GraphState:
    llm = make_llm(temperature=0)
    sys_prompt = (
        "You are a meticulous research QA assistant. Check the brief for: unsupported claims, unclear sources, "
        "and missing assumptions. Reply with a short bullet list of corrections or 'LGTM' if fine."
    )
    msgs = [SystemMessage(content=sys_prompt), HumanMessage(content=state["report"])]
    fb = llm.invoke(msgs).content
    return {**state, "guard_feedback": fb}


def node_memory(state: GraphState) -> GraphState:
    rec = {
        "timestamp": datetime.utcnow().isoformat() + "Z",
        "query": state["query"],
        "plan": state.get("plan", ""),
        "key_points": state.get("analysis", "")[:800],
        "summary": state.get("report", "")[:1200],
    }
    with open(MEM_FILE, "a", encoding="utf-8") as f:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    return state


# -----------------------
# Graph Wiring
# -----------------------

def build_graph():
    g = StateGraph(GraphState)
    g.add_node("planner", node_planner)
    g.add_node("router", node_router)
    g.add_node("retrieve", node_retrieve)
    g.add_node("tool_use", node_tool_use)
    g.add_node("write", node_analyze_and_write)
    g.add_node("guard", node_guardrail)
    g.add_node("memory", node_memory)

    g.set_entry_point("planner")
    g.add_edge("planner", "router")

    # Always retrieve some context
    g.add_edge("router", "retrieve")

    # If router decided tools are relevant, still run the tool node — it will decide if any tool calls happen.
    g.add_edge("retrieve", "tool_use")

    # Then synthesize report
    g.add_edge("tool_use", "write")

    # Guardrail check, then persist memory and end
    g.add_edge("write", "guard")
    g.add_edge("guard", "memory")
    g.add_edge("memory", END)

    return g.compile()

In [12]:
# -----------------------
# CLI Entrypoint
# -----------------------
import argparse

def main(query: str = "Analyze TCS Q1 FY26 results vs Infosys; compute YoY growth and give risks/opportunities"):
    parser = argparse.ArgumentParser(description="Finance Research Copilot — Agentic AI (LangChain + LangGraph)")
    parser.add_argument("--query", required=False, help="User research question / task")
    args, unknown = parser.parse_known_args() # Use parse_known_args to ignore unknown arguments

    # Use the provided query argument if available, otherwise use the default
    user_query = args.query if args.query is not None else query

    bootstrap_demo_corpus()

    app = build_graph()
    initial: GraphState = {
        "query": user_query,
        "plan": "",
        "context_snippets": [],
        "tool_calls": [],
        "analysis": "",
        "report": "",
        "guard_feedback": "",
    }

    print(hrule("START RUN"))
    final: GraphState = app.invoke(initial)

    print(hrule("PLAN"))
    print(final.get("plan", ""))

    print(hrule("TOOL TRACE (truncated)"))
    for t in final.get("tool_calls", [])[:12]:
        print("-", t)

    print(hrule("CONTEXT SNIPPETS"))
    for s in final.get("context_snippets", [])[:4]:
        print("*", s[:240], "...")

    print(hrule("RESEARCH BRIEF"))
    print(final.get("report", ""))

    print(hrule("GUARDRAIL FEEDBACK"))
    print(final.get("guard_feedback", ""))

    print(hrule("DONE"))

In [13]:
if __name__ == "__main__":
    # If running as a script, we also emit a handy requirements.txt next to the file
    import textwrap # Import textwrap
    requirements = textwrap.dedent(
        """
        # Core
        langchain>=0.2.11
        langgraph>=0.2.13
        langchain-openai>=0.1.23
        langchain-community>=0.2.10
        langchain-text-splitters>=0.2.2

        # Vector store
        chromadb>=0.5.3

        # Loaders
        pypdf>=4.2.0

        # Optional util
        duckduckgo-search>=6.2.10

        # Misc
        tiktoken>=0.7.0
        numpy>=1.26.0
        pandas>=2.2.2
        """
    ).strip()

    req_path = ROOT / "requirements.txt"
    try:
        if not req_path.exists():
            req_path.write_text(requirements)
    except Exception as e:
        print(f"[WARN] Could not write requirements.txt: {e}")

In [17]:
main()


START RUN




  docs = retriever.get_relevant_documents(q)



PLAN


1) Clarify Intent: Confirm if the user wants a comparative analysis of TCS and Infosys Q1 FY26 results, focusing on YoY growth.

2) Data to Fetch: Financial statements for TCS and Infosys Q1 FY26, including revenue, net income, and key metrics.

3) Retrieval Queries: Search for TCS Q1 FY26 results and Infosys Q1 FY26 results from financial databases or company filings.

4) Calculations: Compute YoY growth rates for revenue and net income for both companies.

5) Risks/Opportunities: Identify potential risks (e.g., market competition, regulatory changes) and opportunities (e.g., new markets, technological advancements) for both companies.

6) Output Format: 
   - Title: Comparative Analysis of TCS and Infosys Q1 FY26 Results
   - Section 1: YoY Growth Comparison
   - Section 2: Risks
   - Section 3: Opportunities

TOOL TRACE (truncated)


- ddg_news
- calc
- tabular_lookup
- TOOL tabular_lookup({'company': 'TCS', 'quarter': 'Q1', 'fy': '26'}) -> {"company": "TCS", "quarter": "Q1"

  "timestamp": datetime.utcnow().isoformat() + "Z",
