In [1]:
from pathlib import Path
from esg.pipeline.pipeline import run_pipeline
from esg.pipeline.io_utils import save_results_to_csv  # optional export

# Directory holding the sample reports, relative to the working directory.
PDF_DIR = Path("data") / "samples"

# Sample file names, in run order. Every sample follows the pattern
# "esg_<scenario>.pdf", so only the scenario part is spelled out here.
TEST_PDFS = [
    f"esg_{scenario}.pdf"
    for scenario in (
        "simple_text",
        "simple_table",
        "simple_mixed",
        "locale_numbers",
        "messy_units",
        "unstructured_long",
        "ocr_noise",
        "corrupted_table",
        "llm_realistic_1",
        "llm_realistic_2",
    )
]

# Integration driver: run the extraction pipeline on every sample PDF, export
# each result set to CSV under out/, and print one summary line per KPI code.
print("\n=== ESG Pipeline – Integration Test ===\n")

for pdf in TEST_PDFS:
    path = PDF_DIR / pdf
    if not path.exists():
        print(f"[SKIP] {pdf} — file not found")
        continue

    print(f"\n--- Running pipeline on: {pdf} ---")
    try:
        results = run_pipeline(str(path))
    except Exception as e:
        # One failing document must not stop the remaining samples.
        print(f"ERROR while processing {pdf}: {e}")
        continue

    # Optional CSV export.
    # Path.stem removes only the trailing extension, whereas str.replace would
    # also strip a ".pdf" that happens to occur elsewhere in the name.
    out_csv = Path("out") / f"{Path(pdf).stem}.csv"
    try:
        # Make sure the target directory exists before the writer opens the file.
        out_csv.parent.mkdir(parents=True, exist_ok=True)
        save_results_to_csv(results, out_csv)
    except Exception as e:
        # The export is optional: report the failure, then still show the
        # extracted values below and carry on with the next sample.
        print(f"ERROR while exporting {out_csv}: {e}")
    else:
        print(f"[saved] {out_csv}")

    # Convert list → dict for pretty printing (a repeated code keeps its last
    # result, shown at the position of its first occurrence).
    lookup = {r.code: r for r in results}

    for code, r in lookup.items():
        if r.value is not None:
            print(
                f"{code:25}  "
                f"value={r.value:<12}  "
                f"unit={r.unit or '':<8}  "
                f"conf={r.confidence:<4}  "
                f"src={','.join(r.source)}"
            )
        else:
            print(f"{code:25}  (no extraction)")

print("\n=== Completed ===")

2025-11-19 20:31:23,788 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_simple_text.pdf
2025-11-19 20:31:23,793 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_simple_text.pdf
2025-11-19 20:31:23,800 | INFO | esg.pipeline.pipeline | pipeline: 3 KPIs missing after deterministic extractors; using llm backfill.
2025-11-19 20:31:23,849 | INFO | esg.extractors.llm_extractor | llm: querying model gpt-4o-mini



=== ESG Pipeline – Integration Test ===


--- Running pipeline on: esg_simple_text.pdf ---


2025-11-19 20:31:27,267 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 20:31:27,289 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_simple_table.pdf
2025-11-19 20:31:27,297 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_simple_table.pdf
2025-11-19 20:31:27,311 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_simple_mixed.pdf
2025-11-19 20:31:27,324 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_simple_mixed.pdf
2025-11-19 20:31:27,332 | INFO | esg.extractors.table_plain_extractor | table_plain hit total_ghg_emissions: 123,400 tCO2e (line='Total GHG emissions (tCO2e) 123,400')
2025-11-19 20:31:27,333 | INFO | esg.extractors.table_plain_extractor | table_plain hit energy_consumption: 500,000 MWh (line='Total energy consumption (MWh) 500,000')
2025-11

[saved] out/esg_simple_text.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.75  src=llm
energy_consumption         value=500000.0      unit=MWh       conf=0.75  src=llm
water_withdrawal           value=1200000.0     unit=m3        conf=0.75  src=llm

--- Running pipeline on: esg_simple_table.pdf ---
[saved] out/esg_simple_table.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_grid
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_grid
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_grid

--- Running pipeline on: esg_simple_mixed.pdf ---
[saved] out/esg_simple_mixed.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_grid
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_grid
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_grid

--- Running pipeline on: esg

2025-11-19 20:31:29,281 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 20:31:29,338 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_messy_units.pdf
2025-11-19 20:31:29,345 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_messy_units.pdf
2025-11-19 20:31:29,366 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_unstructured_long.pdf
2025-11-19 20:31:29,378 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_unstructured_long.pdf
2025-11-19 20:31:29,390 | INFO | esg.pipeline.pipeline | pipeline: 3 KPIs missing after deterministic extractors; using llm backfill.
2025-11-19 20:31:29,408 | INFO | esg.extractors.llm_extractor | llm: querying model gpt-4o-mini


[saved] out/esg_locale_numbers.csv
total_ghg_emissions        value=1200000.0     unit=tCO2e     conf=0.9   src=table_grid
energy_consumption         (no extraction)
water_withdrawal           (no extraction)

--- Running pipeline on: esg_messy_units.pdf ---
[saved] out/esg_messy_units.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_grid
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_grid
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_grid

--- Running pipeline on: esg_unstructured_long.pdf ---


2025-11-19 20:31:31,889 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 20:31:31,905 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_ocr_noise.pdf
2025-11-19 20:31:31,913 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_ocr_noise.pdf
2025-11-19 20:31:31,922 | INFO | esg.pipeline.pipeline | pipeline: 3 KPIs missing after deterministic extractors; using llm backfill.
2025-11-19 20:31:31,943 | INFO | esg.extractors.llm_extractor | llm: querying model gpt-4o-mini


[saved] out/esg_unstructured_long.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.75  src=llm
energy_consumption         value=500000.0      unit=MWh       conf=0.75  src=llm
water_withdrawal           value=1200000.0     unit=m3        conf=0.75  src=llm

--- Running pipeline on: esg_ocr_noise.pdf ---


2025-11-19 20:31:34,061 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 20:31:34,077 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_corrupted_table.pdf
2025-11-19 20:31:34,091 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_corrupted_table.pdf
2025-11-19 20:31:34,103 | INFO | esg.extractors.table_plain_extractor | table_plain hit total_ghg_emissions: 123,400 tCO2e (line='Total GHG emissions (tCO2e) 123,400')
2025-11-19 20:31:34,103 | INFO | esg.extractors.table_plain_extractor | table_plain hit water_withdrawal: 1,200,000 m3 (line='Total water withdrawal (m3) 1,200,000')
2025-11-19 20:31:34,138 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_llm_realistic_1.pdf
2025-11-19 20:31:34,161 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_llm_realistic_1.p

[saved] out/esg_ocr_noise.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.75  src=llm
energy_consumption         value=500000.0      unit=MWh       conf=0.75  src=llm
water_withdrawal           value=1200000.0     unit=m3        conf=0.75  src=llm

--- Running pipeline on: esg_corrupted_table.pdf ---
[saved] out/esg_corrupted_table.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_grid
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_grid
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_grid

--- Running pipeline on: esg_llm_realistic_1.pdf ---


2025-11-19 20:31:37,180 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 20:31:37,231 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_llm_realistic_2.pdf
2025-11-19 20:31:37,289 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_llm_realistic_2.pdf
2025-11-19 20:31:37,315 | INFO | esg.pipeline.pipeline | pipeline: 3 KPIs missing after deterministic extractors; using llm backfill.
2025-11-19 20:31:37,333 | INFO | esg.extractors.llm_extractor | llm: querying model gpt-4o-mini


[saved] out/esg_llm_realistic_1.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.75  src=llm
energy_consumption         value=500000.0      unit=MWh       conf=0.75  src=llm
water_withdrawal           value=1200000.0     unit=m3        conf=0.85  src=table_plain

--- Running pipeline on: esg_llm_realistic_2.pdf ---


2025-11-19 20:31:39,376 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[saved] out/esg_llm_realistic_2.csv
total_ghg_emissions        (no extraction)
energy_consumption         (no extraction)
water_withdrawal           (no extraction)

=== Completed ===
