# Private Acquisition Research Automation
## Use Case
This project automates small-business acquisition research: it parses OCR’d PDF text from business-for-sale listings, uses Generative AI with @tool stubs to extract and analyze the financial data, and writes the results, with detailed insights, as structured JSON or CSV.

## Problem
Manual research of business listings is slow, and web scraping is frequently blocked by listing sites; parsing the text of saved PDF listings provides a more reliable data source for Gen AI analysis.

## Solution
We use:
1. **Data Collection**: Read PDF text from Kaggle input (Class Element: Data Collection).
2. **Gen AI Extraction Tool**: `@tool extract_financials` stub to parse financials (Class Element: Gen AI Extraction).
3. **Gen AI Analysis**: Agent analyzes extracted data, computes metrics, and generates insights (Class Element: Gen AI Application).

In [None]:
## Setup
!pip install -qU langchain-google-genai==2.1.2 PyPDF2

In [None]:
## Setup again. Occasionally, pip throws an error on the first run
## because of an issue with Kaggle.
!pip install -qU langchain-google-genai==2.1.2 PyPDF2

In [None]:
import json
import os
import csv
from langchain_google_genai import ChatGoogleGenerativeAI
from IPython.display import FileLink, display
from kaggle_secrets import UserSecretsClient
import PyPDF2
import re
from typing import Dict, List
from datetime import datetime

# Configuration
# The API key is read from Kaggle's secret store under the name "GOOGLE_API_KEY".
GOOGLE_API_KEY = UserSecretsClient().get_secret("GOOGLE_API_KEY")
# Shared LLM client used by extract_financials and analyze_financials.
# NOTE(review): max_tokens=500 presumably caps the reply length; a truncated
# reply would not parse as JSON in those callers - confirm the limit is enough.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GOOGLE_API_KEY, max_tokens=500)
# Directory scanned for PDFs when MODE is "all".
INPUT_DIR = "/kaggle/input/test-pdf-data/"
# The one PDF that is processed when MODE is "single".
PDF_PATH = os.path.join(INPUT_DIR, "nemtb-test.pdf")

# "all" or "single"
# "single" processes only PDF_PATH; "all" processes every PDF found in INPUT_DIR.
MODE = "single"

# "csv" or "json"
# Selects the writer in save_results and the extension of the output file.
OUTPUT_FORMAT = "csv"

# Utility Function: Parses numeric strings into integers
# Purpose: Converts financial values (e.g., "$350,000", "$75K+", "$1.2M") to integers for computation
# Usage: Called by formatting tools to standardize numeric fields
def parse_numeric(value: str) -> int:
    """Convert a currency-style value into a whole number.

    The first number found in *value* is used. Thousands separators are
    dropped, a decimal part is honoured (so "$350,000.00" is 350000, not
    35000000), and a trailing magnitude such as "K", "M", "B", "thousand",
    "million" or "billion" scales the amount. Any sign is ignored.

    Args:
        value: Text such as "$350,000", "$75K+" or "N/A". Numbers that are
            already int/float (e.g. decoded straight from JSON) and None
            are accepted as well.

    Returns:
        The parsed amount rounded to an integer, or 0 when *value* is None,
        "N/A", or contains no digits at all (e.g. "TBD").
    """
    if value is None:
        return 0
    if isinstance(value, (int, float)):
        return round(value)

    multipliers = {
        "k": 1_000,
        "thousand": 1_000,
        "m": 1_000_000,
        "million": 1_000_000,
        "b": 1_000_000_000,
        "billion": 1_000_000_000,
    }
    # The \b after the magnitude keeps ordinary words from being read as a
    # suffix, e.g. the "m" of "monthly" in "350,000 monthly".
    match = re.search(
        r'(\d[\d,]*(?:\.\d+)?)(?:\s*(thousand|million|billion|k|m|b)\b)?',
        str(value),
        re.IGNORECASE,
    )
    if match is None:
        # Covers "N/A", "TBD", empty strings and any other digit-free text.
        return 0

    amount = float(match.group(1).replace(",", ""))
    suffix = (match.group(2) or "").lower()
    return round(amount * multipliers.get(suffix, 1))

# Core Function: Reads text from a PDF file
# Purpose: Pulls the raw text out of every page so it can be sent to the LLM
# Usage: Used in process_pdfs to get OCR'd text from each PDF
def read_pdf(file_path: str) -> str:
    """Return the text of all pages of the PDF at *file_path*, concatenated.

    Progress and error messages are printed rather than raised.

    Args:
        file_path: Path of the PDF to read.

    Returns:
        The page texts joined in page order with no separator, or "" when
        the file does not exist or anything goes wrong while reading it.
    """
    print(f"Reading PDF: {file_path}")

    if os.path.exists(file_path):
        try:
            with open(file_path, "rb") as pdf_handle:
                pdf_pages = PyPDF2.PdfReader(pdf_handle).pages
                # A page with no extractable text contributes an empty string.
                content = "".join((page.extract_text() or "") for page in pdf_pages)
            print(f"Extracted text: {content[:50]}...")
            return content
        except Exception as e:
            print(f"PDF read error: {e}")
            return ""

    print(f"Error: File not found - {file_path}")
    return ""

# Tool Function: Formats raw financial data extracted by LLM
# Purpose: Ensures consistent numeric parsing and cash_needed format
# Usage: Called after extract_financials to standardize output
def format_financials(raw_financials: Dict, pdf_name: str) -> Dict:
    """Return a normalized copy of the LLM's raw financial fields.

    * "asking_price", "revenue" and "cash_flow" become integers unless they
      are "N/A". Values that are already numbers (JSON numbers) are kept as
      integers, and JSON null is mapped to "N/A", instead of crashing.
    * "cash_needed" gets " or TBD" appended unless it is "N/A" or already
      mentions "TBD"; non-string values are converted to text first.
    * "pdf_name" is added so the record can be traced back to its source.

    Args:
        raw_financials: Mapping decoded from the LLM reply; not modified.
        pdf_name: File name of the PDF the data came from.

    Returns:
        A new dict with the normalized values plus "pdf_name".
    """
    financials = raw_financials.copy()

    for key in ("asking_price", "revenue", "cash_flow"):
        if key not in financials:
            continue
        value = financials[key]
        if value is None:
            financials[key] = "N/A"
        elif isinstance(value, (int, float)):
            # Already numeric: there is no currency string to parse.
            financials[key] = int(value)
        elif value != "N/A":
            financials[key] = parse_numeric(value)

    if "cash_needed" in financials:
        cash_needed = financials["cash_needed"]
        if cash_needed is None:
            financials["cash_needed"] = "N/A"
        elif cash_needed != "N/A":
            cash_needed = str(cash_needed)
            if "TBD" not in cash_needed:
                cash_needed += " or TBD"
            financials["cash_needed"] = cash_needed

    financials["pdf_name"] = pdf_name
    return financials

# Tool Function: Formats raw analysis data from LLM
# Purpose: Adds "x" to multiples for consistent formatting
# Usage: Called after analyze_financials to standardize output
def format_analysis(raw_analysis: Dict) -> Dict:
    """Return a copy of the analysis with multiples rendered like "3.50x".

    Only "cash_flow_multiple" and "revenue_multiple" are touched:

    * a value that already contains a lowercase "x" is left unchanged;
    * a numeric value (number or numeric text, optionally ending in an
      upper-case "X") is formatted with two decimals and an "x" suffix;
    * anything else - "N/A", None, empty or free text - becomes "N/A"
      rather than raising, so one odd value does not discard the whole
      analysis in the caller.

    Args:
        raw_analysis: Mapping decoded from the LLM reply; not modified.

    Returns:
        A new dict with the two multiple fields normalized.
    """
    analysis = raw_analysis.copy()

    for key in ("cash_flow_multiple", "revenue_multiple"):
        if key not in analysis:
            continue
        raw_value = analysis[key]
        text = "" if raw_value is None else str(raw_value).strip()
        if "x" in text:
            # Already carries the suffix; keep the LLM's formatting.
            continue
        number_text = text.rstrip("X").strip()
        try:
            analysis[key] = f"{float(number_text):.2f}x"
        except ValueError:
            analysis[key] = "N/A"

    return analysis

# Core Function: Extracts financial data using LLM
# Purpose: Gets raw financial details from PDF text
# Usage: Called in process_pdfs to extract initial data, then formatted
def extract_financials(text: str, pdf_name: str) -> Dict:
    """Ask the LLM for the listing's key fields and return them formatted.

    The reply is expected to contain one JSON object. Rather than stripping
    Markdown fences by string replacement, the object is located by its
    outermost braces, so a reply with prose around the JSON or a differently
    cased fence still parses.

    Args:
        text: Raw text of the listing (interpolated into the prompt).
        pdf_name: File name recorded in the result as "pdf_name".

    Returns:
        The output of format_financials for the decoded reply. If the call,
        the parsing or the formatting fails, the error is printed and a
        record with every field set to "N/A" is returned instead.
    """
    print(f"Extracting financials from: {text[:50]}...")
    prompt = f"""
    Extract the following details from this text: "{text}".
    - Business Name (exact full title or prominent phrase at start, including all qualifiers and location if part of the title, e.g., 'Spa & Massage - 30% Repeat Clientele' or 'Coin Laundry Business in The Fan District, Richmond'; do not truncate any part; if not found, use 'N/A')
    - Asking Price (if not provided, use "N/A")
    - Revenue (if not provided, use "N/A")
    - Cash Flow/EBITDA (if not provided, use "N/A")
    - Location (if not provided, use "N/A")
    - Cash Needed (cash beyond loans/seller financing; infer from 'TBD' or '$75K+', e.g., '$75K+'; use "N/A" if no hint)
    - Industry (infer from text, use "N/A" if unclear)
    Return in JSON:
    ```json
    {{
      "business_name": "Business Name",
      "asking_price": "$XXX",
      "revenue": "$XXX",
      "cash_flow": "$XXX",
      "location": "City, State",
      "cash_needed": "$XXX",
      "industry": "Industry Name"
    }}
    ```
    """
    try:
        response = llm.invoke(prompt)
        content = response.content
        # Keep only the outermost {...}; this drops code fences and any
        # commentary the model placed before or after the object.
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end <= start:
            raise ValueError("no JSON object found in LLM response")
        raw_result = json.loads(content[start:end + 1])
        if not isinstance(raw_result, dict):
            raise ValueError("LLM response is not a JSON object")
        result = format_financials(raw_result, pdf_name)
        print(f"Extraction result: {json.dumps(result, indent=2)}")
        return result
    except Exception as e:
        # Best-effort step: report the problem and fall back to an all-"N/A"
        # record so the rest of the run can continue.
        print(f"Extraction error: {e}")
        return format_financials({
            "business_name": "N/A",
            "asking_price": "N/A",
            "revenue": "N/A",
            "cash_flow": "N/A",
            "location": "N/A",
            "cash_needed": "N/A",
            "industry": "N/A"
        }, pdf_name)

# Core Function: Analyzes financial data using LLM
# Purpose: Computes multiples and generates insights from extracted data
# Usage: Called in process_pdfs to analyze financials, then formatted
def analyze_financials(financials: Dict, text: str) -> Dict:
    """Ask the LLM for valuation multiples, a 1-5 rating and notes.

    Fields the extraction step did not return are sent as "N/A" instead of
    raising KeyError while the prompt is built (that code runs outside the
    try block). The JSON object in the reply is located by its outermost
    braces, so code fences or surrounding prose do not break parsing.

    Args:
        financials: Formatted listing fields, as produced by
            extract_financials.
        text: Raw listing text, passed to the LLM as the description.

    Returns:
        The output of format_analysis for the decoded reply. If the call,
        the parsing or the formatting fails, the error is printed and a
        record with "N/A" values and an explanatory "notes" entry is
        returned instead.
    """
    print(f"Analyzing financials for {financials.get('pdf_name', 'N/A')}...")
    field_names = (
        "business_name", "location", "revenue", "cash_flow",
        "asking_price", "cash_needed", "industry",
    )
    fields = {name: financials.get(name, "N/A") for name in field_names}
    prompt = f"""
    Given this business listing:
    - Business Name: "{fields['business_name']}"
    - Location: "{fields['location']}"
    - Revenue: "{fields['revenue']}"
    - EBITDA: "{fields['cash_flow']}"
    - Asking Price: "{fields['asking_price']}"
    - Cash Needed: "{fields['cash_needed']}"
    - Industry: "{fields['industry']}"
    - Description: "{text}"
    
    Compute:
    - Cash Flow Multiple (asking price / EBITDA, if both available, else "N/A")
    - Revenue Multiple (asking price / revenue, if both available, else "N/A")
    - Optimization Potential (rate 1-5; 1 = low, 5 = high, based on growth potential)
    - Notes (combine details and analysis, e.g., growth opportunities)
    Return in JSON:
    ```json
    {{
      "cash_flow_multiple": "X.XX",
      "revenue_multiple": "X.XX",
      "optimization_potential": "X",
      "notes": "Detailed notes"
    }}
    ```
    """
    try:
        response = llm.invoke(prompt)
        content = response.content
        # Keep only the outermost {...}; this drops code fences and any
        # commentary the model placed before or after the object.
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end <= start:
            raise ValueError("no JSON object found in LLM response")
        raw_analysis = json.loads(content[start:end + 1])
        if not isinstance(raw_analysis, dict):
            raise ValueError("LLM response is not a JSON object")
        analysis = format_analysis(raw_analysis)
        print(f"Analysis result: {json.dumps(analysis, indent=2)}")
        return analysis
    except Exception as e:
        # Best-effort step: report the problem and return a placeholder so
        # the listing still appears in the output.
        print(f"Analysis error: {e}")
        return format_analysis({
            "cash_flow_multiple": "N/A",
            "revenue_multiple": "N/A",
            "optimization_potential": "N/A",
            "notes": f"Analysis failed: {str(e)}"
        })

# Core Function: Saves results to a file in specified format
# Purpose: Writes aggregated results to JSON or CSV and tells the user where to find them
# Usage: Called in process_pdfs to persist and share output
def save_results(results: List[Dict], output_file: str, output_format: str) -> None:
    """Write *results* to *output_file* as JSON or CSV.

    Args:
        results: One dict per processed listing.
        output_file: Destination path.
        output_format: "json" or "csv". Any other value prints an error and
            writes nothing, instead of announcing a file that was never
            created.

    For CSV the header is the union of the keys of all rows, in order of
    first appearance, so a row with an extra or missing field no longer
    aborts the write; missing values are left empty. An empty *results*
    list produces no CSV file, because there would be no header to write.
    """
    if output_format not in ("json", "csv"):
        print(f"Unsupported output format '{output_format}'. Use 'csv' or 'json'.")
        return
    if output_format == "csv" and not results:
        print("No results to save.")
        return

    print(f"Saving results to {output_file}")
    if output_format == "json":
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
    else:
        # dict.fromkeys de-duplicates while keeping first-seen order.
        fieldnames = list(dict.fromkeys(key for row in results for key in row))
        with open(output_file, "w", newline='', encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
    #display(FileLink(output_file, result_html_prefix=f"Download {output_format.upper()} file: "))
    print(f"Download results from Notebook Output: {output_file}")

# Core Function: Evaluates accuracy of extracted data against raw text
# Purpose: Compares LLM output to regex-derived values from the OCR'd text for validation
# Usage: Called in process_pdfs to assess extraction quality
def evaluate_accuracy(financials: Dict, raw_text: str) -> Dict:
    """Score the extracted fields against values found in *raw_text* by regex.

    A reference value is derived for each field with a regular expression
    ("N/A" when the pattern is absent) and compared with the extracted one:

    * asking_price / revenue / cash_flow are compared by their digits,
      because the extracted values are integers (350000) while the text
      carries currency formatting ("$350,000");
    * cash_needed matches when the extracted value, without the " or TBD"
      suffix added by format_financials, equals the reference, or when
      both mention "TBD";
    * industry matches when the reference keyword occurs in the extracted
      industry, ignoring case;
    * every other field requires exact equality.

    Args:
        financials: Formatted listing fields, as produced by
            extract_financials.
        raw_text: The listing text the fields were extracted from.

    Returns:
        {"accuracy_percentage": float in [0, 100],
         "field_matches": {field: 1 or 0}}.
    """
    print(f"Evaluating accuracy for {financials.get('pdf_name', 'N/A')}")
    patterns = {
        # First non-blank line; leading blank lines are skipped.
        "business_name": r"^\s*(\S.*)$",
        "asking_price": r"Asking Price:\s*(\$\d+(?:,\d+)*)",
        "revenue": r"Gross Revenue:\s*(\$\d+(?:,\d+)*)",
        "cash_flow": r"Cash Flow:\s*(\$\d+(?:,\d+)*)",
        "location": r"Location:\s*([^,\n]+,\s*[A-Z]{2})",
        # Capture only the value, not the "Cash Required:" / "Financing:" label.
        "cash_needed": r"(?:Cash Required|Financing):\s*(TBD|\$[\d,]+K?\+?)",
        "industry": r"(Laundry|Medical Transport|Spa|Restaurant|Retail)"
    }
    numeric_keys = {"asking_price", "revenue", "cash_flow"}
    tbd_suffix = " or TBD"

    ground_truth = {}
    for key, pattern in patterns.items():
        match = re.search(pattern, raw_text, re.MULTILINE)
        ground_truth[key] = match.group(1).strip() if match else "N/A"

    accuracy = {}
    for key, truth in ground_truth.items():
        extracted = str(financials.get(key, "N/A"))
        if extracted == truth:
            matched = True
        elif "N/A" in (extracted, truth):
            # Only one side is missing, so they cannot agree.
            matched = False
        elif key in numeric_keys:
            digits = re.sub(r"\D", "", extracted)
            matched = bool(digits) and digits == re.sub(r"\D", "", truth)
        elif key == "cash_needed":
            base = extracted[:-len(tbd_suffix)] if extracted.endswith(tbd_suffix) else extracted
            matched = base == truth or ("TBD" in extracted and "TBD" in truth)
        elif key == "industry":
            matched = truth.lower() in extracted.lower()
        else:
            matched = False
        accuracy[key] = 1 if matched else 0
        print(f"{key} - Extracted: {extracted}, Truth: {truth}, Match: {accuracy[key]}")

    total_fields = len(accuracy)
    correct_fields = sum(accuracy.values())
    accuracy_percentage = (correct_fields / total_fields) * 100 if total_fields > 0 else 0
    return {"accuracy_percentage": accuracy_percentage, "field_matches": accuracy}

# Core Function: Orchestrates PDF processing and analysis
# Purpose: Manages the full workflow from PDF reading to result saving
# Usage: Main entry point, called with mode, path, and format
def process_pdfs(mode: str, pdf_path: str, output_format: str) -> List[Dict]:
    """Run read -> extract -> analyze -> evaluate for one PDF or a directory.

    Args:
        mode: "single" processes only *pdf_path*; "all" processes every PDF
            in INPUT_DIR (the extension check ignores case, and files are
            handled in sorted order so runs are reproducible).
        pdf_path: Path of the PDF used in "single" mode.
        output_format: "csv" or "json"; passed to save_results and used as
            the extension of the output file.

    Returns:
        One merged dict (financial fields plus analysis) per PDF that
        yielded text. Empty when the mode is invalid, the input directory
        cannot be listed, or no PDF produced any text.
    """
    if mode == "single":
        input_dir, pdf_name = os.path.split(pdf_path)
        pdf_files = [pdf_name]
    elif mode == "all":
        input_dir = INPUT_DIR
        try:
            entries = os.listdir(input_dir)
        except OSError as e:
            # Report and stop, in the same style read_pdf uses for a missing file.
            print(f"Error: Cannot list input directory - {e}")
            return []
        pdf_files = sorted(f for f in entries if f.lower().endswith(".pdf"))
    else:
        print(f"Invalid mode '{mode}'. Use 'single' or 'all'.")
        return []

    results = []
    for pdf_file in pdf_files:
        full_path = os.path.join(input_dir, pdf_file)
        text = read_pdf(full_path)
        if not text:
            # Nothing to analyze; read_pdf has already reported why.
            continue

        financials = extract_financials(text, pdf_file)
        analysis = analyze_financials(financials, text)
        results.append({**financials, **analysis})

        accuracy = evaluate_accuracy(financials, text)
        print(f"Accuracy for {pdf_file}: {accuracy['accuracy_percentage']:.2f}%")
        print(f"Field matches: {accuracy['field_matches']}")

    if results:
        # The timestamp keeps successive runs from overwriting each other.
        output_file = f"/kaggle/working/business_listings_{datetime.now().strftime('%Y%m%d_%H%M%S')}.{output_format}"
        save_results(results, output_file, output_format)

    return results

# Run the pipeline with the configured mode, input path and output format.
results = process_pdfs(MODE, PDF_PATH, OUTPUT_FORMAT)

# Echo the merged records, or say so when nothing was produced.
if not results:
    print("No results generated.")
else:
    print("Final analysis results:")
    print(json.dumps(results, indent=2))