In [1]:
from pathlib import Path
import sys
import os
from dotenv import load_dotenv

# Notebook bootstrap: locate the project root, expose <root>/src for imports,
# and load environment variables from <root>/.env.

# 1. Start from current working directory
cwd = Path().resolve()

# 2. Walk upwards until we find the project root (has pyproject.toml).
#    The search covers cwd itself and every ancestor directory.
ROOT = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "pyproject.toml").exists()
    ),
    None,
)
if ROOT is None:
    # Without this guard the walk would silently end at the filesystem root,
    # so SRC and .env would be resolved against "/". Fall back to the working
    # directory and make the problem visible instead.
    print(
        "WARNING: no pyproject.toml found above",
        cwd,
        "- falling back to the current working directory",
        file=sys.stderr,
    )
    ROOT = cwd

print("Detected project root:", ROOT)

# 3. Add src/ to sys.path (only once, so re-running the cell adds no duplicates)
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

print("Using src path:", SRC)

# 4. Load .env from project root
load_dotenv(ROOT / ".env")
# Only reports whether the key is present; the key itself is never printed.
print("API loaded:", "OPENAI_API_KEY" in os.environ)

Detected project root: /Users/golibsanaev/Library/CloudStorage/Dropbox/GitHub_gsanaev/esg-llm-platform
Using src path: /Users/golibsanaev/Library/CloudStorage/Dropbox/GitHub_gsanaev/esg-llm-platform/src
API loaded: True


In [2]:
from esg_user.extractors.table_extractor_fitz import extract_kpis_from_fitz

# Siemens 2023 sustainability report, addressed relative to the detected project root.
pdf_path = ROOT.joinpath("data/raw/SIEMENS_2023_Sustainability.pdf")

# Run the PyMuPDF table extractor; as the last expression, its result dict is displayed.
# NOTE(review): the recorded run logs water_withdrawal value=2023.0 for this file,
# which looks like the report year rather than a volume - verify the extractor.
extract_kpis_from_fitz(str(pdf_path))

2025-11-15 19:16:07,743 | INFO | esg_user.extractors.table_extractor_fitz | Running PyMuPDF table extractor (layout-based)…
2025-11-15 19:16:08,185 | INFO | esg_user.extractors.table_extractor_fitz | PyMuPDF hit water_withdrawal: value=2023.0, unit=m³, page=69


{'total_ghg_emissions': {'value': None, 'unit': None, 'confidence': 0.0},
 'energy_consumption': {'value': None, 'unit': None, 'confidence': 0.0},
 'water_withdrawal': {'value': 2023.0, 'unit': 'm³', 'confidence': 0.88}}

In [3]:
from esg_user.extractors.table_extractor_fitz import extract_kpis_from_fitz

# Small test PDF, addressed relative to the detected project root.
pdf_path = ROOT.joinpath("data/raw/test_table_esg.pdf")

# Run the PyMuPDF table extractor; as the last expression, its result dict is displayed.
# In the recorded run only water_withdrawal was found for this file.
extract_kpis_from_fitz(str(pdf_path))

2025-11-15 19:16:12,935 | INFO | esg_user.extractors.table_extractor_fitz | Running PyMuPDF table extractor (layout-based)…
2025-11-15 19:16:12,941 | INFO | esg_user.extractors.table_extractor_fitz | PyMuPDF hit water_withdrawal: value=500000.0, unit=m3, page=1


{'total_ghg_emissions': {'value': None, 'unit': None, 'confidence': 0.0},
 'energy_consumption': {'value': None, 'unit': None, 'confidence': 0.0},
 'water_withdrawal': {'value': 500000.0, 'unit': 'm3', 'confidence': 0.88}}

In [4]:
from esg_user.extractors.table_extractor_fitz import extract_kpis_from_fitz

# Unilever 2024 annual ESG report, addressed relative to the detected project root.
pdf_path = ROOT.joinpath("data/raw/UNILEVER_2024_Annual_ESG.pdf")

# Run the PyMuPDF table extractor; as the last expression, its result dict is displayed.
# In the recorded run no KPI was found for this file (every value is None).
extract_kpis_from_fitz(str(pdf_path))

2025-11-15 19:16:15,637 | INFO | esg_user.extractors.table_extractor_fitz | Running PyMuPDF table extractor (layout-based)…


{'total_ghg_emissions': {'value': None, 'unit': None, 'confidence': 0.0},
 'energy_consumption': {'value': None, 'unit': None, 'confidence': 0.0},
 'water_withdrawal': {'value': None, 'unit': None, 'confidence': 0.0}}

In [None]:
from esg_user.extractors.table_extractor_fitz import extract_kpis_from_fitz

# NOTE(review): this cell repeats the earlier Siemens extraction cell and has not
# been executed; consider removing it or turning the repeated cells into a loop.
pdf_path = ROOT.joinpath("data/raw/SIEMENS_2023_Sustainability.pdf")

# Run the PyMuPDF table extractor; as the last expression, its result dict is displayed.
extract_kpis_from_fitz(str(pdf_path))

In [None]:
import sys
# Make <project root>/src importable. ROOT is the project root detected in the
# notebook's first cell; anchoring to it (instead of the relative "../src")
# keeps this cell independent of the current working directory, and the
# membership check stops sys.path from growing on every re-run.
src_dir = str(ROOT / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

from esg_user.extractors.table_extractor_fitz import extract_kpis_from_fitz

# Same report as before, resolved from ROOT rather than from the working directory.
pdf = str(ROOT / "data/raw/SIEMENS_2023_Sustainability.pdf")
extract_kpis_from_fitz(pdf)


In [None]:
from esg_user.extractors.table_extractor_fitz import extract_kpis_from_fitz

# in extract_all_kpis(...)
# Merge the Camelot and PyMuPDF table results per KPI code, preferring Camelot.
# NOTE(review): `extract_kpis_from_camelot_filtered` and `kpi_codes` are not
# defined in any cell visible here - this fragment presumably belongs inside
# extract_all_kpis(...) and will raise NameError if run as-is. TODO confirm.
table_res = {}
if pdf_path:
    camelot_res = extract_kpis_from_camelot_filtered(pdf_path)
    fitz_res = extract_kpis_from_fitz(pdf_path)

    for code in kpi_codes:
        # Candidates in priority order.
        # Assumes each entry is a dict with a "value" key, as the fitz
        # extractor returns in the recorded outputs - TODO confirm for Camelot.
        candidates = (camelot_res.get(code), fitz_res.get(code))

        # A "no hit" entry such as {"value": None, ...} is a non-empty dict and
        # therefore truthy, so a plain `a or b` chain would never fall through
        # to the second extractor. Prefer the first entry with a real value.
        hit = next(
            (res for res in candidates if res and res.get("value") is not None),
            None,
        )

        # With no real hit, keep the previous fallback order unchanged.
        table_res[code] = hit or candidates[0] or candidates[1] or {
            "value": None,
            "unit": None,
            "confidence": 0.0,
        }


In [None]:
import fitz

# Debug helper: print the first words of a few pages of the Siemens report.
pdf = "../data/raw/SIEMENS_2023_Sustainability.pdf"

# Try pages near Siemens KPIs (typically around 140–150)
# These are 0-based page indices; the header printed below shows index + 1.
pages_to_check = [120, 130, 140, 145, 150]

# Open the document in a context manager so the file handle is released when
# the cell finishes (previously the document was never closed).
with fitz.open(pdf) as doc:
    for p in pages_to_check:
        # Skip indices beyond the end of the document.
        if p >= len(doc):
            continue
        print("==== PAGE", p+1, "====")
        words = doc[p].get_text("words")
        for w in words[:50]:  # print first 50 words
            print(w)

In [None]:
import fitz

# Debug helper: find pages of the Siemens report that mention KPI-related keywords.
pdf = "../data/raw/SIEMENS_2023_Sustainability.pdf"

# Lower-case keywords matched as substrings of the lower-cased page text.
keywords = (
    "ghg", "co2", "emissions", "scope", "energy", "water", "consumption", "withdrawal"
)

# Print the first 800 characters of every matching page among the first 200
# pages so we can locate KPIs.
# Open the document in a context manager so the file handle is released.
with fitz.open(pdf) as doc:
    # Cap the scan at the real page count: a fixed range(0, 200) indexes past
    # the end of any document shorter than 200 pages.
    for p in range(min(200, len(doc))):
        text = doc[p].get_text()
        lowered = text.lower()  # lower-case once per page, not once per keyword
        if any(k in lowered for k in keywords):
            print("=== CANDIDATE PAGE:", p+1, "===")
            print(text[:800])