# Private Acquisition Research Automation
## Use Case
This project automates small business acquisition research by parsing OCR’d PDF text from business-for-sale listings, using Generative AI with @tool stubs to extract and analyze financial data, and outputting structured JSON with detailed insights.

## Problem
Manual research of business listings is slow, and web scraping faces blocks; parsing PDF text provides a reliable data source for Gen AI analysis.

## Solution
We use:
1. **Data Collection**: Read PDF text from Kaggle input (Class Element: Data Collection).
2. **Gen AI Extraction Tool**: `@tool extract_financials` stub to parse financials (Class Element: Gen AI Extraction).
3. **Gen AI Analysis**: Agent analyzes extracted data, computes metrics, and generates insights (Class Element: Gen AI Application).

In [4]:
## Setup
!pip install -qU langchain-google-genai==2.1.2 langgraph==0.3.21 PyPDF2

In [23]:
import json
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.tools import tool
from langgraph.graph import StateGraph, END
from typing import Annotated, Dict, List, TypedDict
from langgraph.graph.message import add_messages
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage
from IPython.display import Markdown, display
from kaggle_secrets import UserSecretsClient
from datetime import datetime
import PyPDF2

# Configuration
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GOOGLE_API_KEY, max_tokens=500)
#PDF_PATH = "/kaggle/input/test-pdf-data/nemtb-test.pdf"
#PDF_PATH = "/kaggle/input/test-pdf-data/SpaMassage.pdf"
PDF_PATH = "/kaggle/input/test-pdf-data/stem.pdf"

# Function to read PDF text
def read_pdf(file_path: str) -> str:
    print(f"DEBUG: Reading PDF from {file_path}")
    try:
        with open(file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            text = ""
            for page in reader.pages:
                text += page.extract_text() or ""
            print(f"DEBUG: Extracted PDF text: {text[:50]}...")
            return text
    except Exception as e:
        print(f"DEBUG: PDF read error: {e}")
        return ""

# Helper function (logic outside @tool)
def perform_extraction(text: str) -> Dict:
    print(f"DEBUG: Extracting financials from: {text[:50]}...")
    prompt = f"""
    Extract the following details from this text: "{text}".
    - Business Name (typically the title or a prominent or catchy descriptive phrase at the start of the listing; if not provided, use "N/A")
    - Asking Price (if not provided, use "N/A")
    - Revenue (if not provided, use "N/A")
    - Cash Flow/EBITDA (if not provided, use "N/A")
    - Location (if not provided, use "N/A")
    - Cash Needed (if not provided, use "N/A"; infer from financing if available)
    - Industry (infer from text if not explicit, use "N/A" if unclear)
    Return the result in JSON format like:
    ```json
    {{
      "business_name": "Business Name",
      "asking_price": "$XXX",
      "revenue": "$XXX",
      "cash_flow": "$XXX",
      "location": "City, State",
      "cash_needed": "$XXX or TBD",
      "industry": "Industry Name"
    }}
    ```
    """
    try:
        response = llm.invoke(prompt)
        result = json.loads(response.content.strip().replace("```json", "").replace("```", ""))
        print(f"DEBUG: Extraction result: {json.dumps(result, indent=2)}")
        return result
    except Exception as e:
        print(f"DEBUG: Extraction error: {e}")
        return {
            "business_name": "N/A",
            "asking_price": "N/A",
            "revenue": "N/A",
            "cash_flow": "N/A",
            "location": "N/A",
            "cash_needed": "N/A",
            "industry": "N/A"
        }

# Tools (stubs)
@tool
def extract_financials(text: str) -> Dict:
    """Stub to extract financial details and business name from OCR’d PDF text using Gen AI."""
    return perform_extraction(text)

# Define state
class ResearchState(TypedDict):
    ocr_text: str
    financials: Dict
    analysis: Dict
    messages: Annotated[List, add_messages]
    finished: bool
    quota_exceeded: bool

# Nodes
def agent_node(state: ResearchState) -> ResearchState:
    print(f"DEBUG: Entering agent_node")
    if not state["messages"]:
        return state | {"messages": [AIMessage(content="Starting research. Extracting financials and business name from PDF text.")], "quota_exceeded": False}
    
    last_msg = state["messages"][-1].content.lower()
    if "extracting financials" in last_msg:
        return state | {"messages": [AIMessage(content="Calling extract_financials tool.", tool_calls=[{"name": "extract_financials", "args": {"text": state["ocr_text"]}, "id": "extract_1"}])]}
    elif state["financials"] and not state["analysis"]:
        prompt = f"""
        Given this business listing:
        - Business Name: "{state['financials']['business_name']}"
        - Location: "{state['financials']['location']}"
        - Revenue: "{state['financials']['revenue']}"
        - EBITDA: "{state['financials']['cash_flow']}"
        - Asking Price: "{state['financials']['asking_price']}"
        - Cash Needed: "{state['financials']['cash_needed']}"
        - Industry: "{state['financials']['industry']}"
        - Description: "{state['ocr_text']}"
        
        Compute:
        - Cash Flow Multiple (asking price / EBITDA, if both available, else "N/A")
        - Revenue Multiple (asking price / revenue, if both available, else "N/A")
        - Optimization Potential (rate 1-5; 1 = low, 5 = very high, based on growth potential in text)
        - Notes (combine listing details and analysis, e.g., growth opportunities, valuation notes)
        Return in JSON:
        ```json
        {{
          "cash_flow_multiple": "X.XXx or N/A",
          "revenue_multiple": "X.XXx or N/A",
          "optimization_potential": "X",
          "notes": "Detailed notes"
        }}
        ```
        """
        print(f"DEBUG: Analyzing with prompt: {prompt[:100]}...")
        try:
            response = llm.invoke(prompt)
            analysis = json.loads(response.content.strip().replace("```json", "").replace("```", ""))
            print(f"DEBUG: Analysis result: {json.dumps(analysis, indent=2)}")
            return state | {"analysis": analysis, "finished": True, "messages": [AIMessage(content=f"Analysis complete: {json.dumps(analysis, indent=2)}")]}
        except Exception as e:
            print(f"DEBUG: Analysis error: {e}")
            if "429" in str(e):
                state["quota_exceeded"] = True
            analysis = {
                "cash_flow_multiple": "N/A",
                "revenue_multiple": "N/A",
                "optimization_potential": "N/A",
                "notes": "Analysis failed due to quota limit or error."
            }
            return state | {"analysis": analysis, "finished": True, "messages": [AIMessage(content=f"Analysis failed: {str(e)}")]}
    elif state["analysis"]:
        return state | {"finished": True, "messages": [AIMessage(content="Research complete. Saving results.")]}
    elif state["quota_exceeded"]:
        return state | {"finished": True, "messages": [AIMessage(content="Research halted due to quota limit. Saving partial results.")]}
    return state | {"messages": [AIMessage(content="No valid data to process, ending research.")], "finished": True}

def tool_node(state: ResearchState) -> ResearchState:
    print(f"DEBUG: Entering tool_node")
    last_msg = state["messages"][-1]
    outbound_msgs = []
    for tool_call in getattr(last_msg, "tool_calls", []):
        if tool_call["name"] == "extract_financials":
            result = extract_financials.invoke({"text": tool_call["args"]["text"]})
            state["financials"] = result
            outbound_msgs.append(ToolMessage(content=json.dumps(result), name="extract_financials", tool_call_id=tool_call["id"]))
    return state | {"messages": outbound_msgs}

def save_node(state: ResearchState) -> ResearchState:
    print(f"DEBUG: Entering save_node")
    output_file = "/kaggle/working/business_listings.json"
    listing_data = {
        "Business Name": state["financials"].get("business_name", "N/A"),
        "Location": state["financials"].get("location", "N/A"),
        "Revenue": state["financials"].get("revenue", "N/A"),
        "EBITDA": state["financials"].get("cash_flow", "N/A"),
        "Asking Price": state["financials"].get("asking_price", "N/A"),
        "Cash Needed": state["financials"].get("cash_needed", "N/A"),
        "Industry": state["financials"].get("industry", "N/A"),
        "cash_flow_multiple": state["analysis"].get("cash_flow_multiple", "N/A") if state["analysis"] else "N/A",
        "revenue_multiple": state["analysis"].get("revenue_multiple", "N/A") if state["analysis"] else "N/A",
        "Optimization Potential": state["analysis"].get("optimization_potential", "N/A") if state["analysis"] else "N/A",
        "Notes": state["analysis"].get("notes", "N/A") if state["analysis"] else "N/A",
        "raw_text": state["ocr_text"]
    }
    listings = [listing_data]
    try:
        with open(output_file, "w") as f:
            json.dump(listings, f, indent=2)
        print(f"Saved listing to {output_file}")
    except Exception as e:
        print(f"Error saving listings: {e}")
    return state

# Routing logic
def route_research(state: ResearchState) -> str:
    print(f"DEBUG: Routing state - Finished: {state.get('finished', False)}, Last message: {state['messages'][-1].content[:50]}...")
    if state.get("finished", False):
        return "save"
    last_msg = state["messages"][-1]
    if isinstance(last_msg, ToolMessage):
        return "agent"
    if isinstance(last_msg, AIMessage) and hasattr(last_msg, "tool_calls") and last_msg.tool_calls:
        return "tool"
    return "agent"

# Build the graph
workflow = StateGraph(ResearchState)
workflow.add_node("agent", agent_node)
workflow.add_node("tool", tool_node)
workflow.add_node("save", save_node)

workflow.set_entry_point("agent")
workflow.add_conditional_edges("agent", route_research)
workflow.add_conditional_edges("tool", route_research)
workflow.add_edge("save", END)

# Compile the graph
app = workflow.compile()

# Run the research
initial_state = {
    "ocr_text": read_pdf(PDF_PATH),
    "financials": {},
    "analysis": {},
    "messages": [],
    "finished": False,
    "quota_exceeded": False
}
config = {"recursion_limit": 100}
print("Starting acquisition research with PDF text. Please wait...")
state = app.invoke(initial_state, config)

# Display final output
output_file = "/kaggle/working/business_listings.json"
if os.path.exists(output_file):
    with open(output_file, "r") as f:
        data = json.load(f)
        print("Final listing with AI analysis:")
        print(json.dumps(data[0] if data else {}, indent=2))
else:
    print("No output file generated.")

DEBUG: Reading PDF from /kaggle/input/test-pdf-data/stem.pdf
DEBUG: PDF read error: [Errno 2] No such file or directory: '/kaggle/input/test-pdf-data/stem.pdf'
Starting acquisition research with PDF text. Please wait...
DEBUG: Entering agent_node
DEBUG: Routing state - Finished: False, Last message: Starting research. Extracting financials and busin...
DEBUG: Entering agent_node
DEBUG: Routing state - Finished: False, Last message: Calling extract_financials tool....
DEBUG: Entering tool_node
DEBUG: Extracting financials from: ...
DEBUG: Extraction error: Expecting value: line 1 column 1 (char 0)
DEBUG: Routing state - Finished: False, Last message: {"business_name": "N/A", "asking_price": "N/A", "r...
DEBUG: Entering agent_node
DEBUG: Analyzing with prompt: 
        Given this business listing:
        - Business Name: "N/A"
        - Location: "N/A"
     ...
DEBUG: Analysis result: {
  "cash_flow_multiple": "N/A",
  "revenue_multiple": "N/A",
  "optimization_potential": "N/A",
  "not