In [1]:
# ------------------------------------------------------------
# Evaluate SAMPLED ESG reports
# ------------------------------------------------------------

from pathlib import Path
from esg.pipeline.pipeline import run_pipeline
from esg.pipeline.io_utils import save_results_to_csv  # optional export

# Directory holding the sample ESG reports, and directory receiving CSV exports.
PDF_DIR = Path("data/samples")
OUT_DIR = Path("data/out")

# File names (relative to PDF_DIR) of the sample reports to run through the
# pipeline, in the order they are processed.
TEST_PDFS = [
    "esg_simple_text.pdf",
    "esg_simple_table.pdf",
    "esg_simple_mixed.pdf",
    "esg_locale_numbers.pdf",
    "esg_messy_units.pdf",
    "esg_nlp_test.pdf",
    "esg_unstructured_long.pdf",
    "esg_ocr_noise.pdf",
    "esg_corrupted_table.pdf",
    "esg_llm_realistic_1.pdf",
    "esg_llm_realistic_2.pdf",
]


def csv_path_for(pdf: str) -> Path:
    """Return the CSV export path inside OUT_DIR for a sample PDF file name.

    Only the final extension is dropped (``Path.stem``), so a name that
    happens to contain ".pdf" elsewhere keeps the rest of its name intact.
    """
    return OUT_DIR / f"{Path(pdf).stem}.csv"


def format_result_row(code: str, r) -> str:
    """Format one pipeline result as a fixed-width, single-line summary.

    Reads ``r.value``, ``r.unit``, ``r.confidence``, ``r.source`` and
    ``r.status``. A falsy unit is shown as an empty column, and an empty
    source list is shown as "-".
    """
    return (
        f"{code:25}  "
        f"value={str(r.value):<12}  "
        f"unit={(r.unit or ''):<8}  "
        f"conf={r.confidence:<4}  "
        f"src={','.join(r.source) or '-':<10}  "
        f"status={r.status}"
    )


print("\n=== ESG Pipeline – Integration Test ===\n")

for pdf in TEST_PDFS:
    path = PDF_DIR / pdf
    if not path.exists():
        print(f"[SKIP] {pdf} — file not found")
        continue

    print(f"\n--- Running pipeline on: {pdf} ---")
    try:
        results = run_pipeline(str(path))
    except Exception as e:
        # Top-level boundary: report the failure and move on to the next PDF.
        print(f"ERROR while processing {pdf}: {e}")
        continue

    # Optional CSV export. A failed export is reported but must not stop the
    # extracted values from being printed, nor abort the remaining PDFs.
    out_csv = csv_path_for(pdf)
    try:
        # Create the output directory on demand; harmless if it already exists.
        out_csv.parent.mkdir(parents=True, exist_ok=True)
        save_results_to_csv(results, out_csv)
    except Exception as e:
        print(f"ERROR while saving {out_csv}: {e}")
    else:
        print(f"[saved] {out_csv}")

    # Key the results by code for printing; if a code occurs more than once,
    # only the last result for that code is kept.
    lookup = {r.code: r for r in results}

    # NOTE(review): in the recorded run every row prints status=Not Reported,
    # even rows with an extracted value — confirm how run_pipeline sets status.
    for code, r in lookup.items():
        print(format_result_row(code, r))

print("\n=== Completed ===")

2025-11-20 18:49:50,409 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_simple_text.pdf
2025-11-20 18:49:50,415 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_simple_text.pdf
2025-11-20 18:49:50,421 | INFO | esg.extractors.regex_extractor | regex hit total_ghg_emissions (B paren-unit-value): (tCO2e) 123,400
2025-11-20 18:49:50,422 | INFO | esg.extractors.regex_extractor | regex hit energy_consumption (D paren-unit-near-value): (MWh) ... 500,000
2025-11-20 18:49:50,422 | INFO | esg.extractors.regex_extractor | regex hit water_withdrawal (D paren-unit-near-value): (m3) ... 1,200,000
2025-11-20 18:49:50,429 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_simple_table.pdf
2025-11-20 18:49:50,434 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_simple_table.pdf
2025-11-20 18:49:50,438 | INFO | esg.extractors.regex


=== ESG Pipeline – Integration Test ===


--- Running pipeline on: esg_simple_text.pdf ---
[saved] data/out/esg_simple_text.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.6   src=regex       status=Not Reported
energy_consumption         value=500000.0      unit=MWh       conf=0.6   src=regex       status=Not Reported
water_withdrawal           value=1200000.0     unit=m3        conf=0.6   src=regex       status=Not Reported

--- Running pipeline on: esg_simple_table.pdf ---
[saved] data/out/esg_simple_table.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_grid  status=Not Reported
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_grid  status=Not Reported
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_grid  status=Not Reported

--- Running pipeline on: esg_simple_mixed.pdf ---
[saved] data/out/esg_simple_mixed.csv
total_ghg_emissions        value=123400

2025-11-20 18:49:52,684 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-20 18:49:52,707 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_messy_units.pdf
2025-11-20 18:49:52,714 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_messy_units.pdf
2025-11-20 18:49:52,721 | INFO | esg.extractors.regex_extractor | regex hit water_withdrawal (C unit-value): m³ 1,200,000
2025-11-20 18:49:52,732 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_nlp_test.pdf
2025-11-20 18:49:52,768 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_nlp_test.pdf
2025-11-20 18:49:52,775 | INFO | esg.extractors.regex_extractor | regex hit total_ghg_emissions (A value-unit): 123,400 tCO2e
2025-11-20 18:49:52,776 | INFO | esg.extractors.regex_extractor | regex hit energy_consumption (A value-u

[saved] data/out/esg_locale_numbers.csv
total_ghg_emissions        value=1200000.0     unit=tCO2e     conf=0.9   src=table_grid  status=Not Reported
energy_consumption         value=None          unit=          conf=0.0   src=-           status=Not Reported
water_withdrawal           value=None          unit=          conf=0.0   src=-           status=Not Reported

--- Running pipeline on: esg_messy_units.pdf ---
[saved] data/out/esg_messy_units.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.9   src=table_grid  status=Not Reported
energy_consumption         value=500000.0      unit=MWh       conf=0.9   src=table_grid  status=Not Reported
water_withdrawal           value=1200000.0     unit=m3        conf=0.9   src=table_grid  status=Not Reported

--- Running pipeline on: esg_nlp_test.pdf ---
[saved] data/out/esg_nlp_test.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.6   src=regex       status=Not Reported
energy_consumption         va

2025-11-20 18:49:52,898 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_llm_realistic_1.pdf
2025-11-20 18:49:52,921 | INFO | esg.extractors.table_plain_extractor | table_plain hit water_withdrawal: 1,200,000. m3 (line='(MWh) 500,000; total water withdrawal (m3) 1,200,000.')
2025-11-20 18:49:52,922 | INFO | esg.extractors.regex_extractor | regex hit total_ghg_emissions (B paren-unit-value): (tCO2e) 123,400
2025-11-20 18:49:52,922 | INFO | esg.extractors.regex_extractor | regex hit energy_consumption (B paren-unit-value): (MWh) 500,000
2025-11-20 18:49:52,922 | INFO | esg.extractors.regex_extractor | regex hit water_withdrawal (B paren-unit-value): (m3) 1,200,000.
2025-11-20 18:49:52,948 | INFO | esg.extractors.table_grid_extractor | table_grid: extracting from data/samples/esg_llm_realistic_2.pdf
2025-11-20 18:49:52,970 | INFO | esg.extractors.table_plain_extractor | table_plain: extracting from data/samples/esg_llm_realistic_2.pdf
2025-11-2

[saved] data/out/esg_llm_realistic_1.csv
total_ghg_emissions        value=123400.0      unit=tCO2e     conf=0.6   src=regex       status=Not Reported
energy_consumption         value=500000.0      unit=MWh       conf=0.6   src=regex       status=Not Reported
water_withdrawal           value=1200000.0     unit=m3        conf=0.85  src=table_plain  status=Not Reported

--- Running pipeline on: esg_llm_realistic_2.pdf ---


2025-11-20 18:49:54,801 | INFO | httpx | HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[saved] data/out/esg_llm_realistic_2.csv
total_ghg_emissions        value=None          unit=          conf=0.0   src=-           status=Not Reported
energy_consumption         value=None          unit=          conf=0.0   src=-           status=Not Reported
water_withdrawal           value=None          unit=          conf=0.0   src=-           status=Not Reported

=== Completed ===
