Testing

In [2]:
# !pip install --quiet -U langchain_openai langchain_core langchain_community langgraph

In [88]:
import os
import re
import random
from pprint import pprint
from google.colab import userdata
from IPython.display import display, Image
from operator import add
from typing import List, Literal, Annotated, Optional
from typing_extensions import TypedDict
from pydantic import BaseModel, field_validator, ValidationError, Field
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage, AnyMessage, SystemMessage, ToolMessage, RemoveMessage
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.prompts import PromptTemplate
from langgraph.graph import StateGraph, MessagesState, START, END
from langgraph.graph.message import add_messages
from langgraph.prebuilt import ToolNode, tools_condition
from langgraph.checkpoint.memory import MemorySaver

In [6]:
GIRU_OPENAI_API_KEY = userdata.get('GIRU_OPENAI_API_KEY')
os.environ["OPENAI_API_KEY"] = GIRU_OPENAI_API_KEY

In [7]:
# checking openai llm
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
result = llm.invoke("Hi, are you deepseek?")

# printing results
print("==="*25)
print(f"RESPONSE\n{result.content}")
print("==="*25)
print("METADATA")
print(f"model: {result.response_metadata['model_name']}")
for k, v in result.response_metadata['token_usage'].items():
  print(f"{k}: {v}")

RESPONSE
No, I'm not DeepSeek. I'm an AI language model created by OpenAI. How can I assist you today?
METADATA
model: gpt-4o-mini-2024-07-18
completion_tokens: 25
prompt_tokens: 14
total_tokens: 39
completion_tokens_details: {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}
prompt_tokens_details: {'audio_tokens': 0, 'cached_tokens': 0}


In [12]:
class KPIDetail(BaseModel):
    name: str = Field(..., description="Exact name of the KPI")
    description: str = Field(..., description="Detailed explanation of what the KPI measures")
    value: Optional[str] = Field(default="Not specified", description="Extracted numeric or textual value of the KPI")
    confidence_score: float = Field(default=0.0, description="Confidence of extraction (0.0-1.0)")
    source_context: Optional[str] = Field(default=None, description="Contextual snippet supporting the KPI extraction")

In [13]:
class KPIExtraction(BaseModel):
    kpis: List[KPIDetail] = Field(..., description="List of extracted KPIs")
    extraction_notes: Optional[str] = Field(default=None, description="Additional notes about the extraction process")

In [15]:
# prompt for KPI extraction
kpi_extraction_prompt = PromptTemplate(
    input_variables=["document_text", "target_kpis", "parser"],
    template="""🔍 Advanced KPI Extraction Protocol 🔍

Objective: Perform a comprehensive, precise extraction of Key Performance Indicators from the provided document.

Extraction Criteria:
1. Target KPIs to Extract:
\"\"\"
{target_kpis}
\"\"\"

Extraction Guidelines:
- Mandatory Inclusion: Every target KPI MUST be addressed, even if not directly found
- Exhaustive Search: Scan the entire document thoroughly
- Contextual Awareness: Provide surrounding context for extracted values
- Confidence Assessment: Rate the reliability of each extraction

Detailed Extraction Requirements:
✅ KPI Name: Exact, verbatim name
✅ Description: Comprehensive explanation of the KPI's significance
✅ Value:
   - Precise numeric or textual representation
   - Use "Not specified" if no clear value is found
   - Include units of measurement if present
✅ Confidence Score:
   - 1.0: Extremely confident (direct, unambiguous extraction)
   - 0.7-0.9: High confidence (strong contextual evidence)
   - 0.4-0.6: Moderate confidence (partial or inferential evidence)
   - 0.1-0.3: Low confidence (weak or speculative extraction)
   - 0.0: No evidence found
✅ Source Context: Brief text snippet supporting the extraction

Prohibited Actions:
❌ Do NOT fabricate or invent KPI values
❌ Do NOT omit any target KPI from the response
❌ Avoid generic or vague descriptions

Response Format Mandate:
{parser}

Source Document:
\"\"\"
{document_text}
\"\"\"

Special Instructions:
- If a KPI requires complex interpretation, explain your reasoning
- Highlight any unusual or noteworthy observations
- Be academically rigorous and precise
"""
)

In [None]:


# Define the schema for state data
class KPIState(State):
    file_path: str
    extracted_text: str = ""
    kpi_data: Dict[str, Any] = {}
    validated_data: Dict[str, Any] = {}

# Node 1: Loader Node
def load_pdf_node(state: KPIState) -> KPIState:
    from PyPDF2 import PdfReader
    reader = PdfReader(state.file_path)
    text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    state.extracted_text = text
    return state

# Node 2: Extractor Node
def extractor_node(state: KPIState) -> KPIState:
    if state.extracted_text.strip():
        return state
    else:
        import pytesseract
        from PIL import Image
        text = pytesseract.image_to_string(Image.open(state.file_path))
        state.extracted_text = text
        return state

# Node 3: KPI Extractor Node
def kpi_extractor_node(state: KPIState) -> KPIState:
    llm = ChatOpenAI(model="gpt-4o-mini")
    prompt = f"""
    Extract the following KPIs from the provided text:
    - LP Contribution
    - GP Contribution
    - LP Distribution
    - GP Distribution
    - Net IRR
    - Net Interest
    - LP NAV
    - GP NAV
    - Net Debt

    Format as JSON.

    Text:
    {state.extracted_text}
    """
    response = llm.predict(prompt)
    state.kpi_data = response
    return state

# Node 4: Human Validation Node
def human_validation_node(state: KPIState) -> KPIState:
    print("Extracted KPIs:", state.kpi_data)
    decision = input("Accept or Reject? (a/r): ")
    if decision.lower() == 'a':
        state.validated_data = state.kpi_data
    else:
        feedback = input("Enter your feedback or corrections: ")
        state.kpi_data = kpi_extractor_node(KPIState(file_path=state.file_path, extracted_text=feedback)).kpi_data
        state.validated_data = state.kpi_data
    return state

# Node 5: Save to Excel Node
def save_excel_node(state: KPIState) -> KPIState:
    df = pd.DataFrame([state.validated_data])
    output_file = os.path.splitext(state.file_path)[0] + "_output.xlsx"
    df.to_excel(output_file, index=False)
    return state

# Define the LangGraph workflow
graph = StateGraph(schema=KPIState)
graph.add_node("loader", load_pdf_node)
graph.add_node("extractor", extractor_node)
graph.add_node("kpi_extractor", kpi_extractor_node)
graph.add_node("human_validation", human_validation_node)
graph.add_node("save_excel", save_excel_node)

graph.set_entry_point("loader")
graph.connect("loader", "extractor")
graph.connect("extractor", "kpi_extractor")
graph.connect("kpi_extractor", "human_validation")
graph.connect("human_validation", "save_excel")
graph.set_exit_point("save_excel")

# Run the workflow
def run_workflow(file_path: str):
    initial_state = KPIState(file_path=file_path)
    graph.run(initial_state)

# Example usage
# run_workflow("sample.pdf")


Installing Libraries

In [54]:
# !pip install pytesseract pdf2image
# !pip install pillow pandas openpyxl
# !pip install tesseract
# !apt-get install -y tesseract-ocr

Importing Libraries

In [89]:
import os
import sys
import json
from IPython.display import display, Image
from typing import Dict, List, Any
from typing_extensions import TypedDict, Annotated, Union

# PDF processing libraries
import pytesseract
from pdf2image import convert_from_path
import io
import pandas as pd

# LangChain and LangGraph
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain.schema import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.messages import HumanMessage, AIMessage
from langchain.schema.runnable import RunnableConfig

import langgraph
from langgraph.graph import StateGraph, END

In [90]:
class KPIData(TypedDict):
    lp_contribution: str
    gp_contribution: str
    lp_distribution: str
    gp_distribution: str
    net_irr: str
    net_interest: str
    lp_nav: str
    gp_nav: str
    net_debt: str

In [91]:
class WorkflowState(TypedDict):
    file_path: str
    extracted_text: str
    kpi_data: KPIData
    human_feedback: str
    status: str
    error: str

In [92]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

In [93]:
def load_pdf(state: WorkflowState) -> WorkflowState:
    """Load PDF document from the provided file path."""
    try:
        file_path = state.get("file_path")
        if not file_path:
            return {**state, "error": "No file path provided", "status": "error"}

        if not os.path.exists(file_path):
            return {**state, "error": f"File not found: {file_path}", "status": "error"}

        loader = PyPDFLoader(file_path)
        docs = loader.load()

        # Basic extraction without OCR
        basic_text = "\n".join([doc.page_content for doc in docs])
        print(f"✅ Loaded PDF: {file_path}")

        return {**state, "extracted_text": basic_text, "status": "pdf_loaded"}
    except Exception as e:
        return {**state, "error": f"Error loading PDF: {str(e)}", "status": "error"}

In [94]:
def extract_pdf_text(state: WorkflowState) -> WorkflowState:
    """Extract text from PDF using both regular extraction and OCR."""
    try:
        file_path = state.get("file_path")
        basic_text = state.get("extracted_text", "")

        # Use OCR for images in PDF
        images = convert_from_path(file_path)
        ocr_text = ""

        for i, image in enumerate(images):
            # Perform OCR
            text = pytesseract.image_to_string(image)
            ocr_text += f"\n--- Page {i+1} OCR ---\n{text}"

        # Combine both extraction methods
        combined_text = f"{basic_text}\n\n{ocr_text}"

        print(f"✅ Extracted text from PDF (Regular + OCR)")

        return {**state, "extracted_text": combined_text, "status": "text_extracted"}
    except Exception as e:
        return {**state, "error": f"Error extracting text: {str(e)}", "status": "error"}

In [95]:
def extract_kpis(state: WorkflowState) -> WorkflowState:
    """Extract KPIs considering human feedback."""
    try:
        text_content = state.get("extracted_text", "")
        human_feedback = state.get("human_feedback", "")

        # prompt with human feedback
        prompt = f"""
        You are a financial document analysis expert. Extract the following KPIs from the provided text content.
        If a value is not found, mark it as 'Not Found'.

        Text content from PDF:
        ```
        {text_content}  # Limiting text length for model context
        ```

        Extract the following KPIs:
        1. LP Contribution
        2. GP Contribution
        3. LP Distribution
        4. GP Distribution
        5. Net IRR
        6. Net Interest
        7. LP NAV
        8. GP NAV
        9. Net Debt

        IMPORTANT HUMAN FEEDBACK TO CONSIDER:
        {human_feedback}

        Return your findings as a JSON object with these keys:
        lp_contribution, gp_contribution, lp_distribution, gp_distribution, net_irr, net_interest, lp_nav, gp_nav, net_debt
        """

        # Extract KPIs using LLM
        response = llm.invoke([HumanMessage(content=prompt)])

        # Try to parse the JSON from the response
        try:
            # Look for JSON in the response
            response_text = response.content

            # Extract JSON part if embedded in text explanation
            json_match = re.search(r'```json\n(.*?)\n```', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
            else:
                # Try to find JSON-like structure
                json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
                if json_match:
                    json_str = json_match.group(0)
                else:
                    json_str = response_text

            kpi_data = json.loads(json_str)

            # Ensure all required keys are present
            required_keys = ["lp_contribution", "gp_contribution", "lp_distribution", "gp_distribution",
                            "net_irr", "net_interest", "lp_nav", "gp_nav", "net_debt"]

            for key in required_keys:
                if key not in kpi_data:
                    kpi_data[key] = "Not Found"

        except Exception as parse_error:
            print(f"Error parsing LLM response: {str(parse_error)}")
            print(f"Raw response: {response.content}")

            # Use previous KPI data if parsing fails
            kpi_data = state.get("kpi_data", {})

        print(f"✅ Re-extracted KPIs with human feedback")

        return {**state, "kpi_data": kpi_data, "status": "kpis_reextracted"}
    except Exception as e:
        return {**state, "error": f"Error re-extracting KPIs: {str(e)}", "status": "error"}

In [96]:
def human_validation(state: WorkflowState) -> Union[str, Dict]:
    """Present extracted KPIs to human for validation."""
    try:
        kpi_data = state.get("kpi_data", {})

        # Format KPI data for display
        kpi_display = "\n".join([f"{k.upper()}: {v}" for k, v in kpi_data.items()])

        print("\n" + "="*50)
        print("EXTRACTED KPIs FOR VALIDATION:")
        print(kpi_display)
        print("="*50)

        # Get human feedback
        choice = input("\nAccept these KPIs? (yes/no): ").strip().lower()

        if choice == "yes":
            print("✅ KPIs accepted!")
            return "save_to_excel"
        else:
            feedback = input("Please provide feedback for improving extraction: ")
            return {"feedback": feedback}
    except Exception as e:
        return {**state, "error": f"Error during human validation: {str(e)}", "status": "error"}

In [97]:
def process_human_feedback(state: WorkflowState, feedback_data: Dict) -> WorkflowState:
    """Process human feedback and update state."""
    feedback = feedback_data.get("feedback", "")

    # Update state with feedback
    updated_state = {
        **state,
        "human_feedback": feedback,
        "status": "feedback_received"
    }

    print(f"✅ Human feedback recorded: {feedback}")

    return updated_state

In [98]:
def save_to_excel(state: WorkflowState) -> WorkflowState:
    """Save extracted KPIs to Excel file."""
    try:
        kpi_data = state.get("kpi_data", {})
        file_path = state.get("file_path", "unknown")

        # Create filename based on input PDF name
        base_name = os.path.basename(file_path)
        pdf_name = os.path.splitext(base_name)[0]
        excel_path = f"{pdf_name}_KPIs.xlsx"

        # Convert KPI data to DataFrame
        df = pd.DataFrame([kpi_data])

        # Save to Excel
        df.to_excel(excel_path, index=False)

        print(f"✅ Saved KPIs to Excel: {excel_path}")

        return {**state, "status": "completed", "excel_path": excel_path}
    except Exception as e:
        return {**state, "error": f"Error saving to Excel: {str(e)}", "status": "error"}

In [99]:
def build_workflow() -> StateGraph:
    # Initialize the graph
    workflow = StateGraph(WorkflowState)

    # Add nodes
    workflow.add_node("load_pdf", load_pdf)
    workflow.add_node("extract_pdf_text", extract_pdf_text)
    workflow.add_node("extract_kpis", extract_kpis)
    workflow.add_node("human_validation", human_validation)
    workflow.add_node("process_human_feedback", process_human_feedback)
    workflow.add_node("save_to_excel", save_to_excel)

    # Define edges for sequential flow
    workflow.add_edge("load_pdf", "extract_pdf_text")
    workflow.add_edge("extract_pdf_text", "extract_kpis")
    workflow.add_edge("extract_kpis", "human_validation")
    workflow.add_conditional_edges(
        "human_validation",
        {
            "save_to_excel": lambda x: x == "save_to_excel",
            "process_human_feedback": lambda x: isinstance(x, dict) and "feedback" in x
        }
    )
    workflow.add_edge("process_human_feedback", "extract_kpis")
    workflow.add_edge("extract_kpis", "human_validation")
    workflow.add_edge("save_to_excel", END)

    # Set the entry point
    workflow.set_entry_point("load_pdf")

    return workflow

In [103]:
def compile_workflow(pdf_path: str):
    workflow = build_workflow()
    app = workflow.compile()
    # display(Image(app.get_graph(xray=True).draw_mermaid_png()))
    return app

In [104]:
def run_workflow(pdf_path: str):
    pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/pytesseract'
    app = compile_workflow(pdf_path)

    # Initial state
    initial_state = WorkflowState(
        file_path=pdf_path,
        extracted_text="",
        kpi_data={},
        human_feedback="",
        status="started",
        error=""
    )

    # Execute the workflow
    for output in app.stream(initial_state):
        node = output.get("node")
        if node == "error":
            print(f"❌ Error: {output['state'].get('error')}")
            return output['state']

    # Return the final state
    final_state = output.get("state", {})
    return final_state

In [105]:
# Run the workflow on the uploaded PDF
result = run_workflow("/content/sample1.pdf")

# Check the result
if result.get("status") == "completed":
    print("\n" + "="*50)
    print("✅ WORKFLOW COMPLETED SUCCESSFULLY")
    print(f"📊 KPIs saved to: {result.get('excel_path')}")
    print("="*50)

    # Print extracted KPIs
    print("\nExtracted KPIs:")
    kpi_data = result.get("kpi_data", {})
    for key, value in kpi_data.items():
        print(f"  - {key.upper()}: {value}")
else:
    print("\n" + "="*50)
    print(f"❌ WORKFLOW FAILED: {result.get('error')}")
    print("="*50)

✅ Re-extracted KPIs with human feedback

EXTRACTED KPIs FOR VALIDATION:
LP_CONTRIBUTION: Not Found
GP_CONTRIBUTION: Not Found
LP_DISTRIBUTION: Not Found
GP_DISTRIBUTION: Not Found
NET_IRR: Not Found
NET_INTEREST: Not Found
LP_NAV: Not Found
GP_NAV: Not Found
NET_DEBT: Not Found

Accept these KPIs? (yes/no): no
Please provide feedback for improving extraction: Find all the details inside pdf

❌ WORKFLOW FAILED: None
