# KPI Extractor   
### A comprehensive workflow to extract important KPIs from financial documents.

Install Required Libraries

In [134]:
# !pip install --quiet -U langchain_openai langchain_core langchain_community langgraph pytesseract pdf2image pillow pandas==2.2.2 openpyxl tesseract pypdf

In [110]:
# !apt-get install -y tesseract-ocr

Import Required Libraries

In [137]:
import os
import re
import sys
import json
from pprint import pprint
from google.colab import userdata
from IPython.display import display, Image
from typing import Dict, List, Any
from typing_extensions import TypedDict, Annotated, Union

# PDF processing libraries
import io
import pytesseract
import pandas as pd
from pdf2image import convert_from_path

# other libraries
from pydantic import BaseModel, field_validator, ValidationError, Field

# LangChain and LangGraph
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.messages import AIMessage, HumanMessage, AnyMessage, SystemMessage, ToolMessage, RemoveMessage
from langchain.schema.runnable import RunnableConfig

import langgraph
from langgraph.graph import StateGraph, MessagesState, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langgraph.checkpoint.memory import MemorySaver

Setup Required Variables

In [138]:
GIRU_OPENAI_API_KEY = userdata.get('GIRU_OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = GIRU_OPENAI_API_KEY

pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/pytesseract'

Initial Checks & Auths

In [139]:
# checking openai llm
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
result = llm.invoke("Hi, are you deepseek?")

# printing results
print("==="*25)
print(f"RESPONSE\n{result.content}")
print("==="*25)
print("METADATA")
print(f"model: {result.response_metadata['model_name']}")
for k, v in result.response_metadata['token_usage'].items():
  print(f"{k}: {v}")

RESPONSE
No, I'm not DeepSeek. I'm an AI language model created by OpenAI. How can I assist you today?
METADATA
model: gpt-4o-mini-2024-07-18
completion_tokens: 25
prompt_tokens: 14
total_tokens: 39
completion_tokens_details: {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}
prompt_tokens_details: {'audio_tokens': 0, 'cached_tokens': 0}


In [140]:
class KPIData(TypedDict):
    lp_contribution: str
    gp_contribution: str
    lp_distribution: str
    gp_distribution: str
    net_irr: str
    net_interest: str
    lp_nav: str
    gp_nav: str
    net_debt: str

In [141]:
class WorkflowState(TypedDict):
    file_path: str
    extracted_text: str
    kpi_data: KPIData
    human_feedback: str
    status: str

In [142]:
def load_pdf(state: WorkflowState) -> WorkflowState:
    try:
        file_path = state.get("file_path")
        print("inside load_pdf\nprinting file path: ", file_path)

        loader = PyPDFLoader(file_path)
        docs = loader.load()
        basic_text = "\n".join([doc.page_content for doc in docs])
        print(f"loaded pdf successfully!\nbasic text content: {basic_text[:500]}")

        return {**state, "extracted_text": basic_text, "status": "pdf_loaded"}

    except Exception as e:
        return {**state, "error": f"Error loading PDF: {str(e)}", "status": "error"}

In [135]:
def load_pdf(path):
    file_path = path
    print("inside load_pdf\nprinting file path: ", file_path)

    loader = PyPDFLoader(file_path)
    docs = loader.load()
    basic_text = "\n".join([doc.page_content for doc in docs])
    print(f"loaded pdf successfully!\nbasic text content: {basic_text[:500]}")

load_pdf("/content/sample1.pdf")

inside load_pdf
printing file path:  /content/sample1.pdf
loaded pdf successfully!
basic text content: JMI EQUITY FUND IX-A, L.P.     
JMI EQUITY FUND IX-B, L.P. 
 
 
 
 
 
First Quarter Report 
March 31, 2023
JMI EQUITY FUND IX-A, L.P.  
JMI EQUITY FUND IX-B, L.P. 
First Quarter Report 
 
 
Table of Contents 
 
 
 
I. Summary of Fund Activity 
 
II. Portfolio Company Summaries 
 
III. Financial Statements 
 
 
 
 
 
 
 
   
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
THIS INFORMATION IS CONFIDENTIAL AND PROPRIETARY AND MUST BE MAINTAINED IN 
CONFIDENCE AND IN ACCORDANCE WITH THE CONFIDENTIALITY OBLIGATIONS 


In [143]:
def extract_pdf_text(state: WorkflowState) -> WorkflowState:
    try:
        file_path = state.get("file_path")
        basic_text = state.get("extracted_text", "")

        # Use OCR for images in PDF
        images = convert_from_path(file_path)
        ocr_text = ""

        for i, image in enumerate(images):
            text = pytesseract.image_to_string(image)
            ocr_text += f"\n--- Page {i+1} OCR ---\n{text}"

        print(f"completed ocr\nocr text content: {ocr_text[:300]}")

        # Combine both extraction methods
        combined_text = f"{basic_text}\n\n{ocr_text}"

        print(f"text extracted successfully!\n{combined_text[:500]}")

        return {**state, "extracted_text": combined_text, "status": "text_extracted"}

    except Exception as e:
        return {**state, "error": f"Error extracting text: {str(e)}", "status": "error"}

In [144]:
def extract_kpis(state: WorkflowState) -> WorkflowState:
    try:
        text_content = state.get("extracted_text", "")
        human_feedback = state.get("human_feedback", "")

        # prompt with human feedback
        prompt = f"""
        You are a financial document analysis expert. Extract the following KPIs from the provided text content.
        If a value is not found, mark it as 'Not Found'.

        Text content from PDF:
        ```
        {text_content}
        ```

        Extract the following KPIs:
        1. LP Contribution
        2. GP Contribution
        3. LP Distribution
        4. GP Distribution
        5. Net IRR
        6. Net Interest
        7. LP NAV
        8. GP NAV
        9. Net Debt

        IMPORTANT HUMAN FEEDBACK TO CONSIDER:
        {human_feedback}

        Return your findings which must be inclosed inside ```json\n(your_findings as json dictionary)\n``` as a JSON object with these keys (keys must be inside inverted commas and with the same name as below):
        lp_contribution, gp_contribution, lp_distribution, gp_distribution, net_irr, net_interest, lp_nav, gp_nav, net_debt
        """

        # Extract KPIs using LLM
        response = llm.invoke([HumanMessage(content=prompt)])

        # Try to parse the JSON from the response
        try:
            # Look for JSON in the response
            response_text = response.content

            # Extract JSON part if embedded in text explanation
            json_match = re.search(r'```json\n(.*?)\n```', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Try to find JSON-like structure
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    json_str = json_match.group(0)
                else:
                    json_str = response_text

            kpi_data = json.loads(json_str)

            # Ensure all required keys are present
            required_keys = ["lp_contribution", "gp_contribution", "lp_distribution", "gp_distribution",
                            "net_irr", "net_interest", "lp_nav", "gp_nav", "net_debt"]

            for key in required_keys:
                if key not in kpi_data:
                    kpi_data[key] = "Not Found"

        except Exception as parse_error:
            print(f"Error parsing LLM response: {str(parse_error)}")
            print(f"Raw response: {response.content}")

            # Use previous KPI data if parsing fails
            kpi_data = state.get("kpi_data", {})

        print(f"Extracted KPIs")

        return {**state, "kpi_data": kpi_data, "status": "kpis_reextracted"}
    except Exception as e:
        return {**state, "error": f"Error re-extracting KPIs: {str(e)}", "status": "error"}

In [145]:
def human_validation(state: WorkflowState) -> Union[str, Dict]:
    try:
        kpi_data = state.get("kpi_data", {})

        # Format KPI data for display
        kpi_display = "\n".join([f"{k.upper()}: {v}" for k, v in kpi_data.items()])

        print("\n" + "="*50)
        print("EXTRACTED KPIs FOR VALIDATION:")
        print(kpi_display)
        print("="*50)

        # Get human feedback
        choice = input("\nAccept these KPIs? (yes/no): ").strip().lower()

        if choice == "yes":
            print("✅ KPIs accepted!")
            return "save_to_excel"
        else:
            feedback = input("Please provide feedback for improving extraction: ")
            return {"feedback": feedback}

    except Exception as e:
        return {**state, "error": f"Error during human validation: {str(e)}", "status": "error"}

In [146]:
def process_human_feedback(state: WorkflowState, feedback_data: Dict) -> WorkflowState:
    feedback = feedback_data.get("feedback", "")

    # Update state with feedback
    updated_state = {
        **state,
        "human_feedback": feedback,
        "status": "feedback_received"
    }

    print(f"✅ Human feedback recorded: {feedback}")

    return updated_state

In [147]:
def save_to_excel(state: WorkflowState) -> WorkflowState:
    try:
        kpi_data = state.get("kpi_data", {})
        file_path = state.get("file_path", "defaultname")

        # Create filename based on input PDF name
        base_name = os.path.basename(file_path)
        pdf_name = os.path.splitext(base_name)[0]
        excel_path = f"{pdf_name}_KPIs.xlsx"

        # Convert KPI data to DataFrame
        df = pd.DataFrame([kpi_data])

        # Save to Excel
        df.to_excel(excel_path, index=False)

        print(f"✅ Saved KPIs to Excel: {excel_path}")

        return {**state, "status": "completed", "excel_path": excel_path}

    except Exception as e:
        return {**state, "error": f"Error saving to Excel: {str(e)}", "status": "error"}

In [148]:
def build_workflow() -> StateGraph:
    # Initialize the graph
    workflow = StateGraph(WorkflowState)

    # Add nodes
    workflow.add_node("load_pdf", load_pdf)
    workflow.add_node("extract_pdf_text", extract_pdf_text)
    workflow.add_node("extract_kpis", extract_kpis)
    workflow.add_node("human_validation", human_validation)
    workflow.add_node("process_human_feedback", process_human_feedback)
    workflow.add_node("save_to_excel", save_to_excel)

    # Define edges for sequential flow
    workflow.add_edge("load_pdf", "extract_pdf_text")
    workflow.add_edge("extract_pdf_text", "extract_kpis")
    workflow.add_edge("extract_kpis", "human_validation")
    workflow.add_conditional_edges(
        "human_validation",
        {
            "save_to_excel": lambda x: x == "save_to_excel",
            "process_human_feedback": lambda x: isinstance(x, dict) and "feedback" in x
        }
    )
    workflow.add_edge("process_human_feedback", "extract_kpis")
    workflow.add_edge("extract_kpis", "human_validation")
    workflow.add_edge("save_to_excel", END)

    # Set the entry point
    workflow.set_entry_point("load_pdf")

    return workflow

In [149]:
def compile_workflow(pdf_path: str):
    workflow = build_workflow()
    app = workflow.compile()
    # display(Image(app.get_graph(xray=True).draw_mermaid_png()))
    return app

In [150]:
def run_workflow(pdf_path: str):
    app = compile_workflow(pdf_path)

    # Initial state
    initial_state = WorkflowState(
        file_path=pdf_path,
        extracted_text="",
        kpi_data={},
        human_feedback="",
        status="started",
        error=""
    )

    # Execute the workflow
    for output in app.stream(initial_state):
        node = output.get("node")
        if node == "error":
            print(f"❌ Error: {output['state'].get('error')}")
            return output['state']

    # Return the final state
    final_state = output.get("state", {})
    return final_state

In [151]:
# Run the workflow on the uploaded PDF
result = run_workflow("/content/sample1.pdf")

# Check the result
if result.get("status") == "completed":
    print("\n" + "="*50)
    print("✅ WORKFLOW COMPLETED SUCCESSFULLY")
    print(f"📊 KPIs saved to: {result.get('excel_path')}")
    print("="*50)

    # Print extracted KPIs
    print("\nExtracted KPIs:")
    kpi_data = result.get("kpi_data", {})
    for key, value in kpi_data.items():
        print(f"  - {key.upper()}: {value}")
else:
    print("\n" + "="*50)
    print(f"❌ WORKFLOW FAILED: {result.get('error')}")
    print("="*50)

inside load_pdf
printing file path:  /content/sample1.pdf
loaded pdf successfully!
basic text content: JMI EQUITY FUND IX-A, L.P.     
JMI EQUITY FUND IX-B, L.P. 
 
 
 
 
 
First Quarter Report 
March 31, 2023
JMI EQUITY FUND IX-A, L.P.  
JMI EQUITY FUND IX-B, L.P. 
First Quarter Report 
 
 
Table of Contents 
 
 
 
I. Summary of Fund Activity 
 
II. Portfolio Company Summaries 
 
III. Financial Statements 
 
 
 
 
 
 
 
   
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
THIS INFORMATION IS CONFIDENTIAL AND PROPRIETARY AND MUST BE MAINTAINED IN 
CONFIDENCE AND IN ACCORDANCE WITH THE CONFIDENTIALITY OBLIGATIONS 
Extracted KPIs

EXTRACTED KPIs FOR VALIDATION:
LP_CONTRIBUTION: 1273050007
GP_CONTRIBUTION: 11658721
LP_DISTRIBUTION: 484721771
GP_DISTRIBUTION: Not Found
NET_IRR: 26
NET_INTEREST: 309555
LP_NAV: 1724498074
GP_NAV: 312035219
NET_DEBT: 31556080

Accept these KPIs? (yes/no): yes
✅ KPIs accepted!


InvalidUpdateError: Expected dict, got save_to_excel
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/INVALID_GRAPH_NODE_RETURN_VALUE