In [1]:

from pathlib import Path
from esg_v2.pipeline.pipeline import run_pipeline_v2

# Directory containing the PDF fixtures, and the file names to process in order.
PDF_DIR = Path("data/raw")

TEST_PDFS = [
    "test_table_esg_grid_v3.pdf",
    "test_table_esg_all.pdf",
    "test_table_esg.pdf",
    "test_table_esg2.pdf",
    "esg_report_v1.pdf",
    "esg_report_v2.pdf",
    "esg_report_v3.pdf",
    "SIEMENS_2023_Sustainability.pdf",
]

print("\n=== ESG v2 Pipeline – Integration Test ===\n")

for pdf in TEST_PDFS:
    path = PDF_DIR / pdf

    # A missing fixture is reported and skipped; it does not abort the run.
    if not path.exists():
        print(f"[SKIP] {pdf} — file not found")
        continue

    print(f"\n--- Running pipeline on: {pdf} ---")

    # Only the pipeline call is guarded, so a failure on one document is
    # reported and the loop moves on to the next file.
    try:
        results = run_pipeline_v2(str(path))
    except Exception as exc:
        print(f"ERROR while processing {pdf}: {exc}")
        continue

    # Index the results by their code; if two results share a code,
    # the later one replaces the earlier one.
    lookup = {entry.code: entry for entry in results}

    for code, r in lookup.items():
        # Results without a value get a placeholder row.
        if r.value is None:
            print(f"{code:25}  (no extraction)")
            continue

        # Fixed-width, left-aligned columns; a falsy unit prints as blank
        # and the sources are joined with commas.
        summary = (
            f"{code:25}  "
            f"value={r.value:<12}  "
            f"unit={r.unit or '':<8}  "
            f"conf={r.confidence:<4}  "
            f"src={','.join(r.source)}"
        )
        print(summary)

print("\n=== Completed ===")

2025-11-18 20:20:44,874 | INFO | esg_v2.extractors.table_extractor_v3 | table_v3: extracting from data/raw/test_table_esg_grid_v3.pdf
2025-11-18 20:20:44,889 | INFO | esg_v2.extractors.table_extractor_v2 | table_v2: extracting from data/raw/test_table_esg_grid_v3.pdf
2025-11-18 20:20:44,910 | INFO | esg_v2.extractors.table_extractor_v3 | table_v3: extracting from data/raw/test_table_esg_all.pdf
2025-11-18 20:20:44,919 | INFO | esg_v2.extractors.table_extractor_v2 | table_v2: extracting from data/raw/test_table_esg_all.pdf
2025-11-18 20:20:44,928 | INFO | esg_v2.extractors.table_extractor_v2 | table_v2 hit total_ghg_emissions: 123,400 tCO2e (line='Total GHG emissions (tCO2e) — 123,400')
2025-11-18 20:20:44,928 | INFO | esg_v2.extractors.table_extractor_v2 | table_v2 hit energy_consumption: 500,000 MWh (line='Total energy consumption (MWh) — 500,000')
2025-11-18 20:20:44,929 | INFO | esg_v2.extractors.table_extractor_v2 | table_v2 hit water_withdrawal: 1,200,000 m3 (line='Total water wit


=== ESG v2 Pipeline – Integration Test ===


--- Running pipeline on: test_table_esg_grid_v3.pdf ---
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_v3
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_v3
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_v3

--- Running pipeline on: test_table_esg_all.pdf ---
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.85  src=table_v2
energy_consumption         value=500000.0      unit=MWh       conf=0.85  src=table_v2
water_withdrawal           value=1200000.0     unit=m3        conf=0.85  src=table_v2

--- Running pipeline on: test_table_esg.pdf ---
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_v3
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_v3
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_v3

--- Running pipeline 

2025-11-18 20:20:45,112 | INFO | esg_v2.extractors.table_extractor_v2 | table_v2: extracting from data/raw/esg_report_v2.pdf
2025-11-18 20:20:45,131 | INFO | esg_v2.extractors.regex_extractor_v2 | regex_v2 hit total_ghg_emissions: 123,400 tCO2e
2025-11-18 20:20:45,131 | INFO | esg_v2.extractors.regex_extractor_v2 | regex_v2 hit energy_consumption: 500,000 MWh
2025-11-18 20:20:45,132 | INFO | esg_v2.extractors.regex_extractor_v2 | regex_v2 hit water_withdrawal: 1,200,000 m³
2025-11-18 20:20:45,132 | INFO | esg_v2.extractors.nlp_extractor_v2 | nlp_v2 hit total_ghg_emissions: raw_value='123,400', raw_unit='tCO2e'
2025-11-18 20:20:45,132 | INFO | esg_v2.extractors.nlp_extractor_v2 | nlp_v2 hit energy_consumption: raw_value='500,000', raw_unit='MWh'
2025-11-18 20:20:45,133 | INFO | esg_v2.extractors.nlp_extractor_v2 | nlp_v2 hit water_withdrawal: raw_value='1,200,000', raw_unit='m³'
2025-11-18 20:20:45,153 | INFO | esg_v2.extractors.table_extractor_v3 | table_v3: extracting from data/raw/es

total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_v3
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_v3
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_v3

--- Running pipeline on: esg_report_v3.pdf ---
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_v3
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_v3
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_v3

--- Running pipeline on: SIEMENS_2023_Sustainability.pdf ---


2025-11-18 20:20:53,420 | INFO | esg_v2.extractors.table_extractor_v3 | table_v3: extracting from data/raw/SIEMENS_2023_Sustainability.pdf
2025-11-18 20:21:01,520 | INFO | esg_v2.extractors.table_extractor_v2 | table_v2: extracting from data/raw/SIEMENS_2023_Sustainability.pdf
2025-11-18 20:21:09,996 | INFO | esg_v2.extractors.regex_extractor_v2 | regex_v2 hit energy_consumption: 5,000,000 kWh
2025-11-18 20:21:09,996 | INFO | esg_v2.extractors.regex_extractor_v2 | regex_v2 hit water_withdrawal: 6,000 m3
2025-11-18 20:21:10,022 | INFO | esg_v2.extractors.nlp_extractor_v2 | nlp_v2 hit water_withdrawal: raw_value='14.26 million', raw_unit='m³'
2025-11-18 20:21:10,023 | INFO | esg_v2.pipeline.pipeline | pipeline_v2: 1 KPIs missing after deterministic extractors; using llm_v2 backfill.
2025-11-18 20:21:10,075 | INFO | esg_v2.extractors.llm_extractor_v2 | llm_v2: querying model=gpt-4o-mini
2025-11-18 20:21:18,690 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions 

total_ghg_emissions        (no extraction)
energy_consumption         value=5000.0        unit=MWh       conf=0.6   src=regex_v2
water_withdrawal           value=6000.0        unit=m3        conf=0.6   src=regex_v2

=== Completed ===
