# KPI Extraction Workflow

In [None]:
# !pip install pandas
# !pip install numpy
# !pip install openai
# !pip install langchain
# !pip install langgraph
# !pip install langchain-community
# !pip install -qU langchain-openai
# !pip install streamlit
# !pip install tiktoken
# !pip install sqlparse
# !pip install sqlglot black
# !pip install sqlfluff
# !pip install matplotlib
# !pip install pip-system-certs
# !pip install -qU pypdf
# !pip install PyMuPDF
# !pip install easyocr
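# NOTE: do not run "pip install fitz" -- the `fitz` module imported below is provided by PyMuPDF (installed above); the unrelated PyPI package named "fitz" shadows it and breaks the import.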

In [2]:
!pip install pytesseract

Collecting tesseract
  Downloading tesseract-0.1.3.tar.gz (45.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 45.6/45.6 MB 19.4 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: tesseract
  Building wheel for tesseract (setup.py) ... done
  Created wheel for tesseract: filename=tesseract-0.1.3-py3-none-any.whl size=45562552 sha256=ba868731da2f24ab139fc77a88c5befafd3cef87be305d0427c7615682353f05
  Stored in directory: /root/.cache/pip/wheels/b4/47/6e/bb7543eee5b12cf0bbeedd33b40886429a79aef0b03d76e051
Successfully built tesseract
Installing collected packages: tesseract
Successfully installed tesseract-0.1.3


In [None]:
import os
import time
import openai
import pytesseract
import fitz  # PyMuPDF
import pandas as pd
from langchain import OpenAI
from langgraph.graph import StateGraph, State

# ----------- CONFIGURATION ----------- #
# Number of attempts made against the remote OCR API before a file is skipped.
MAX_API_RETRIES = 2
# Mistral's public API is served from api.mistral.ai; OCR lives under /v1/ocr.
MISTRAL_API_URL = "https://api.mistral.ai/v1/ocr"
# The key is read from the environment so it never has to be stored in the notebook.
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
# Location of the tesseract binary. Set the TESSERACT_CMD environment variable
# to override the default instead of editing this cell on every machine.
TESSERACT_CMD = os.getenv("TESSERACT_CMD", r'/usr/local/bin/tesseract')
pytesseract.pytesseract.tesseract_cmd = TESSERACT_CMD

# KPI names requested from the LLM; they are also the keys looked up in its
# JSON answer.
KPIS = [
    "LP Contribution", "GP Contribution", "LP Distribution", "GP Distribution",
    "Net IRR", "Net Interest", "LP NAV", "GP NAV", "Net Debt"
]

# ----------- UTILITIES ----------- #
def extract_text_local(pdf_path):
    """Extract text from a PDF using its text layer plus Tesseract OCR.

    For every page the embedded text layer is read with PyMuPDF, and the
    rendered page is additionally passed through Tesseract so that text that
    only exists inside images is captured as well.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages ("" for a document without pages).
    """
    import numpy as np  # local import: only needed for the pixmap conversion

    chunks = []
    with fitz.open(pdf_path) as doc:
        for page in doc:
            chunks.append(page.get_text())
            # The default pixmap is RGB without alpha, i.e. pix.n == 3.
            pix = page.get_pixmap()
            # pytesseract accepts PIL images, NumPy arrays or file paths but
            # rejects raw bytes, so expose the samples as an H x W x C array.
            image = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
                pix.height, pix.width, pix.n
            )
            chunks.append(pytesseract.image_to_string(image))
    # Join once instead of growing a string inside the loop.
    return "".join(chunks)

def extract_text_api(pdf_path):
    """Extract text from a PDF through the Mistral OCR REST API.

    The PDF is sent inline as a base64 data URL to ``MISTRAL_API_URL`` and
    the markdown returned for each page is concatenated. The call is
    best-effort: after ``MAX_API_RETRIES`` failed attempts the error is
    printed and an empty string is returned so the caller can carry on with
    the remaining files.

    Args:
        pdf_path: Path of the PDF file to upload.

    Returns:
        The recognised text, or "" when the API could not be used.

    Raises:
        OSError: If ``pdf_path`` cannot be opened or read.
    """
    import base64
    import http.client
    import json
    import urllib.request

    # Read the file once, up front, so every retry sends the full document.
    with open(pdf_path, 'rb') as file:
        encoded_pdf = base64.b64encode(file.read()).decode("ascii")

    if not MISTRAL_API_KEY:
        print("API Error: MISTRAL_API_KEY is not set. Skipping file.")
        return ""

    payload = json.dumps({
        "model": "mistral-ocr-latest",
        "document": {
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{encoded_pdf}",
        },
    }).encode("utf-8")
    headers = {
        "Authorization": f"Bearer {MISTRAL_API_KEY}",
        "Content-Type": "application/json",
    }

    for attempt in range(MAX_API_RETRIES):
        request = urllib.request.Request(
            MISTRAL_API_URL, data=payload, headers=headers, method="POST"
        )
        try:
            with urllib.request.urlopen(request, timeout=60) as response:
                body = json.load(response)
        # OSError covers URLError/HTTPError and socket timeouts; ValueError
        # covers a response body that is not valid JSON.
        except (OSError, ValueError, http.client.HTTPException) as e:
            if attempt == MAX_API_RETRIES - 1:
                print(f"API Error: {e}. Skipping file.")
            else:
                time.sleep(2)
            continue
        pages = body.get("pages") if isinstance(body, dict) else None
        return "\n\n".join(
            page.get("markdown", "")
            for page in pages or []
            if isinstance(page, dict)
        )
    return ""

def convert_to_markdown(text):
    """Wrap *text* in a fenced code block tagged ``markdown``.

    This is only a placeholder conversion: the text itself is passed through
    unchanged between an opening and a closing fence.
    """
    opening_fence = "```markdown"
    closing_fence = "```"
    return "\n".join((opening_fence, f"{text}", closing_fence))

def extract_kpis(markdown_text, llm):
    """Ask the LLM for the configured KPIs and return them as a dict.

    The prompt requests a JSON object, so the model's answer is parsed here
    rather than handed back as raw text: callers look KPIs up with ``.get()``
    and write the result as a spreadsheet row, both of which need a mapping.

    Args:
        markdown_text: Document text to search for KPI values.
        llm: A LangChain model (anything exposing ``invoke``) or a plain
            callable that takes the prompt and returns the answer.

    Returns:
        The parsed JSON object, or an empty dict when the answer contains no
        valid JSON object.
    """
    import json

    prompt = (
        f"Extract the following KPIs from the text:\n"
        f"{', '.join(KPIS)}\n"
        f"Text:\n{markdown_text}\n"
        "Return as a JSON object."
    )
    # Prefer the Runnable interface; calling the model object directly is
    # deprecated in LangChain. Plain callables are still supported.
    invoke = getattr(llm, "invoke", None)
    response = invoke(prompt) if callable(invoke) else llm(prompt)
    if isinstance(response, dict):
        return response

    # Chat models return a message object whose text lives in `.content`.
    raw = getattr(response, "content", response)
    if not isinstance(raw, str):
        return {}

    # Models often surround the JSON with prose or code fences, so parse only
    # the span between the outermost braces.
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        return {}
    try:
        parsed = json.loads(raw[start:end + 1])
    except json.JSONDecodeError:
        return {}
    return parsed if isinstance(parsed, dict) else {}

def save_to_excel(file_name, kpis):
    """Write the KPI mapping as a single spreadsheet row.

    The workbook is written to ``<file_name>.xlsx`` without an index column;
    the keys of *kpis* become the column headers.
    """
    output_path = f"{file_name}.xlsx"
    single_row = [kpis]
    pd.DataFrame(single_row).to_excel(output_path, index=False)

# ----------- STATE MANAGEMENT ----------- #
from dataclasses import dataclass, field


@dataclass
class KPIsState:
    """Workflow state shared by all graph nodes.

    A dataclass is used because LangGraph accepts dataclasses as state
    schemas and it keeps both attribute access (``state.files``) and keyword
    construction (``KPIsState(files=...)``) working.
    """

    # Maps each PDF path to its bookkeeping record with the keys
    # 'text_extracted', 'kpis_extracted', 'retry_count', 'kpis' and, once the
    # text has been extracted, 'markdown'.
    files: dict = field(default_factory=dict)

# ----------- LANGGRAPH NODES ----------- #
def start_node(state: KPIsState):
    """Entry node of the workflow; hands the state on unchanged."""
    return state

def text_extraction_node(state: KPIsState):
    """Extract text for every file that still needs it.

    On the first pass the local extractor is tried and the OCR API is used
    only when it yields nothing. On a retry pass (``retry_count`` > 0) the
    local text has already failed to produce all KPIs and repeating it would
    return the same text, so the OCR API is used directly.

    Args:
        state: Workflow state; the per-file records in ``state.files`` are
            updated in place ('markdown' and 'text_extracted').

    Returns:
        The same state object.
    """
    for file, data in state.files.items():
        if data['text_extracted']:
            continue
        is_retry = data.get('retry_count', 0) > 0
        text = "" if is_retry else extract_text_local(file)
        if not text:
            text = extract_text_api(file)
        # If a retry produced nothing, keep the markdown from the first pass
        # rather than overwriting it with an empty document.
        if text or 'markdown' not in data:
            data['markdown'] = convert_to_markdown(text)
        data['text_extracted'] = True
    return state

def kpi_extraction_node(state: KPIsState):
    """Run KPI extraction on every file that has not been processed yet.

    If KPIs are missing after the first attempt, the file is flagged for one
    more round of text extraction. After that retry the result is accepted
    as is, complete or not.

    Args:
        state: Workflow state; the per-file records in ``state.files`` are
            updated in place.

    Returns:
        The same state object.
    """
    llm = OpenAI(temperature=0)
    for file, data in state.files.items():
        if data['kpis_extracted']:
            continue
        result = extract_kpis(data['markdown'], llm)
        # Only an absent, null or empty value counts as missing; a KPI that
        # is legitimately 0 (e.g. "Net Debt": 0) must not trigger a retry.
        missing_kpis = [kpi for kpi in KPIS if result.get(kpi) in (None, "")]
        if missing_kpis and data['retry_count'] == 0:
            data['retry_count'] += 1
            data['text_extracted'] = False  # Trigger DeepOCR
        else:
            data['kpis'] = result
            data['kpis_extracted'] = True
    return state

def save_results_node(state: KPIsState):
    """Write the KPIs of every file to its own Excel workbook.

    Returns the state unchanged.
    """
    for path in state.files:
        save_to_excel(path, state.files[path]['kpis'])
    return state

def end_node(state: KPIsState):
    """Final node: report completion and hand the state back unchanged."""
    completion_message = "Workflow completed."
    print(completion_message)
    return state

# ----------- WORKFLOW SETUP ----------- #
from langgraph.graph import END, START  # entry/exit sentinels of a graph


def _route_after_kpi_extraction(state):
    """Choose the node that follows KPI extraction.

    Returns "extract_text" while any file still lacks accepted KPIs (it was
    flagged for a retry) and "save_results" once every file is done.
    """
    # Works whether the state arrives as a mapping or as an object with
    # attributes.
    files = state["files"] if isinstance(state, dict) else state.files
    pending = any(not data['kpis_extracted'] for data in files.values())
    return "extract_text" if pending else "save_results"


# StateGraph has to be told which state schema it manages.
graph = StateGraph(KPIsState)

graph.add_node("start", start_node)
graph.add_node("extract_text", text_extraction_node)
graph.add_node("extract_kpis", kpi_extraction_node)
graph.add_node("save_results", save_results_node)
graph.add_node("end", end_node)

graph.add_edge(START, "start")
graph.add_edge("start", "extract_text")
graph.add_edge("extract_text", "extract_kpis")
# Exactly one successor must be chosen here: loop back if retries are needed,
# otherwise continue to saving.
graph.add_conditional_edges(
    "extract_kpis",
    _route_after_kpi_extraction,
    {"extract_text": "extract_text", "save_results": "save_results"},
)
graph.add_edge("save_results", "end")
graph.add_edge("end", END)

# compile() returns the runnable graph; the builder itself cannot be executed.
workflow = graph.compile()


def run_workflow(files):
    """Run the KPI extraction workflow for the given PDF files.

    Args:
        files: Iterable of PDF paths.

    Returns:
        The final workflow state as returned by the compiled graph.
    """
    initial_state = {"files": {file: {
        'text_extracted': False,
        'kpis_extracted': False,
        'retry_count': 0,
        'kpis': {}
    } for file in files}}

    return workflow.invoke(initial_state)

# Example Usage:
# run_workflow(['filea.pdf', 'fileb.pdf'])
