In [1]:
"""
extract_sustainability_kpi.py
==================================
Automatically extract KPI sentences/table rows from Sustainability Report PDF
and compare with manual KPI annotations
--------------------------------------------------
1. pdfplumber extracts text + tables
2. Camelot supplements complex table parsing (optional)
3. Chunking to control tokens
4. OpenAI ChatCompletion API call (GPT-4o / GPT-4 / GPT-3.5)
5. Aggregate, deduplicate, and export to auto_kpi.xlsx
6. Compare with manual_kpi.xlsx for differences
"""



In [2]:
!pip install -q openai

In [3]:
import openai

In [4]:
!pip install openai python-dotenv pdfplumber tiktoken pandas
!sudo apt-get update -y
!sudo apt-get install -y ghostscript
!pip install "camelot-py[cv]"
!pip install PyMuPDF Pillow
!pip install -q transformers pillow torchvision

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Downloading pdfplumber-0.11.7-py3-none-any.whl (60 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [

In [5]:
import os, re, json, time, textwrap, argparse, logging
import pdfplumber, pandas as pd, tiktoken
from openai import OpenAI
from dotenv import load_dotenv
from typing import List, Dict, Optional, Set, Tuple
from pathlib import Path
from difflib import SequenceMatcher
import base64
from io import BytesIO
from PIL import Image
import fitz  # PyMuPDF
import numpy as np
from transformers import CLIPProcessor, CLIPModel
import concurrent.futures
import hashlib
import pickle
# 在现有的导入语句后添加这些新的导入
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from datetime import datetime

In [6]:
# ----------------------------- Configuration -----------------------------
PDF_PATH          = "/content/Test_Unknown_northwest-sustainability-report-2022_fbqow68f-60-74.pdf"
MANUAL_XLSX       = "manual_kpi.xlsx"   # Leave empty if not available
EXPORT_AUTO_XLSX  = "auto_kpi.xlsx"
MODEL_NAME        = "gpt-4o"       # Adjust based on account availability
MAX_TOKENS_CHUNK  = 1500               # Token limit per chunk
SLEEP_SEC         = 0.6                # Rate limiting
ENABLE_QUALITY_VALIDATION = True       # Enable additional quality checks
# -----------------------------------------------------------------

In [7]:
# ============ Fixed initialization part ============
def initialize_environment():
    """Initialize the environment and API client"""
    # Load environment variables
    load_dotenv("ruojia_api_key.env")

    # Initialize OpenAI client
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables!")

    client = OpenAI(api_key=api_key)

    # Initialize tokenizer
    enc = tiktoken.get_encoding("cl100k_base")

    return client, enc

# Initialize global variables
client, enc = initialize_environment()

In [8]:
# ============ Fixed PDF text extraction ============
def pdf_to_text_and_tables(path: str) -> str:
    """Extract text paragraphs and tables using pdfplumber."""
    all_chunks = []

    if not os.path.exists(path):
        raise FileNotFoundError(f"PDF file not found: {path}")

    try:
        with pdfplumber.open(path) as pdf:
            logging.info(f"Processing PDF with {len(pdf.pages)} pages...")

            for page_num, page in enumerate(pdf.pages, 1):
                try:
                    # Extract text
                    text = page.extract_text() or ""
                    if text.strip():
                        all_chunks.append(f"PAGE_{page_num}_TEXT:\n{text}")

                    # Extract tables
                    tables = page.extract_tables()
                    for table_num, tb in enumerate(tables):
                        if tb and len(tb) > 0:
                            try:
                                # Handle table headers safely
                                if tb[0]:
                                    headers = tb[0]
                                else:
                                    headers = [f"Col_{i}" for i in range(len(tb[1]) if len(tb) > 1 else 1)]

                                rows = tb[1:] if len(tb) > 1 else []

                                if rows:
                                    df = pd.DataFrame(rows, columns=headers)
                                    # Clean DataFrame
                                    df = df.dropna(how='all')  # Remove empty rows
                                    if not df.empty:
                                        table_txt = f"TABLE_START_PAGE_{page_num}_{table_num}\n" + df.to_csv(index=False) + "\nTABLE_END"
                                        all_chunks.append(table_txt)
                            except Exception as e:
                                logging.warning(f"Error processing table on page {page_num}: {e}")
                                continue

                except Exception as e:
                    logging.warning(f"Error processing page {page_num}: {e}")
                    continue

        return "\n\n".join(all_chunks)

    except Exception as e:
        logging.error(f"Error opening PDF file: {e}")
        raise

In [9]:
# ============ Fixed Camelot table extraction ============
def generate_table_fingerprint(df: pd.DataFrame) -> str:
    """Generate table fingerprint for deduplication"""
    try:
        fingerprint_parts = []
        fingerprint_parts.append(f"shape_{df.shape[0]}x{df.shape[1]}")

        if not df.columns.empty:
            col_names = [str(col).strip().lower().replace(' ', '') for col in df.columns]
            col_fingerprint = '_'.join(sorted(col_names))
            fingerprint_parts.append(f"cols_{hash(col_fingerprint)}")

        if df.shape[0] > 0:
            numeric_values = []
            for col in df.columns:
                for val in df[col].head(3):
                    if pd.notna(val):
                        numbers = re.findall(r'\d+\.?\d*', str(val))
                        numeric_values.extend(numbers)

            if numeric_values:
                numeric_fingerprint = hash('_'.join(sorted(numeric_values[:10])))
                fingerprint_parts.append(f"nums_{numeric_fingerprint}")

        return '_'.join(fingerprint_parts)

    except Exception as e:
        logging.warning(f"Error generating table fingerprint: {e}")
        return str(hash(df.to_csv()))

def clean_table_data_improved(df: pd.DataFrame) -> pd.DataFrame:
    """Improved table data cleaning"""
    try:
        cleaned_df = df.copy()
        cleaned_df = cleaned_df.dropna(how='all')
        cleaned_df = cleaned_df.dropna(axis=1, how='all')

        for col in cleaned_df.columns:
            if cleaned_df[col].dtype == 'object':
                cleaned_df[col] = cleaned_df[col].astype(str).str.strip()
                cleaned_df[col] = cleaned_df[col].replace(['nan', 'NaN', 'None'], '')

        if not cleaned_df.empty:
            new_columns = []
            for i, col in enumerate(cleaned_df.columns):
                col_str = str(col).strip()
                if col_str in ['nan', 'NaN', 'None', ''] or pd.isna(col):
                    new_columns.append(f'Column_{i}')
                else:
                    new_columns.append(col_str)
            cleaned_df.columns = new_columns

        cleaned_df = cleaned_df.reset_index(drop=True)
        return cleaned_df

    except Exception as e:
        logging.warning(f"Error in table cleaning: {e}")
        return df

def is_valid_table_improved(df: pd.DataFrame) -> bool:
    """Improved table validation"""
    try:
        if df.empty or df.shape[0] < 1 or df.shape[1] < 1:
            return False

        non_null_cells = 0
        total_cells = df.shape[0] * df.shape[1]

        for col in df.columns:
            for val in df[col]:
                if pd.notna(val) and str(val).strip() not in ['', 'nan', 'NaN', 'None']:
                    non_null_cells += 1

        if non_null_cells / total_cells < 0.2:
            return False

        has_meaningful_content = False
        for col in df.columns:
            text_content = ' '.join(df[col].dropna().astype(str))
            if (any(char.isdigit() for char in text_content) or
                '%' in text_content or
                any(keyword in text_content.lower() for keyword in [
                    'rate', 'percentage', 'total', 'number', 'emission', 'energy',
                    'water', 'waste', 'employee', 'year', '2020', '2021', '2022', '2023'
                ])):
                has_meaningful_content = True
                break

        return has_meaningful_content

    except Exception as e:
        logging.warning(f"Error validating table: {e}")
        return True

def format_table_output_improved(df: pd.DataFrame, table_id: str, parsing_report=None) -> str:
    """Improved table output formatting"""
    try:
        table_info = f"TABLE_START_{table_id}\n"
        table_info += f"DIMENSIONS: {df.shape[0]} rows × {df.shape[1]} columns\n"

        col_info = "COLUMNS: " + " | ".join([f"{i}:{col}" for i, col in enumerate(df.columns)])
        table_info += col_info + "\n"

        if df.shape[0] > 0:
            preview_rows = min(2, df.shape[0])
            table_info += f"PREVIEW_FIRST_{preview_rows}_ROWS:\n"
            for i in range(preview_rows):
                row_preview = " | ".join([str(df.iloc[i, j])[:20] for j in range(min(5, df.shape[1]))])
                table_info += f"  Row_{i}: {row_preview}\n"

        if parsing_report:
            try:
                accuracy = getattr(parsing_report, 'accuracy', 'N/A')
                if accuracy != 'N/A':
                    table_info += f"EXTRACTION_ACCURACY: {accuracy:.2f}\n"
            except:
                pass

        table_info += "TABLE_DATA_START\n"
        table_csv = df.to_csv(index=False, na_rep='', quoting=1, escapechar='\\')
        table_end = f"TABLE_DATA_END\nTABLE_END_{table_id}\n"

        return table_info + table_csv + table_end

    except Exception as e:
        logging.warning(f"Error formatting table output: {e}")
        return f"TABLE_START_{table_id}\n{df.to_csv(index=False)}\nTABLE_END_{table_id}\n"

def camelot_extra_tables_enhanced(path: str) -> List[str]:
    """Enhanced table extraction using Camelot with better error handling"""
    try:
        import camelot
    except ImportError:
        logging.warning("Camelot not installed, skipping Camelot table parsing.")
        return []

    extra_chunks = []
    extracted_tables_fingerprints = set()

    try:
        logging.info("Starting Camelot table extraction...")

        # Stream mode extraction
        try:
            stream_tables = camelot.read_pdf(
                path,
                pages="all",
                flavor="stream",
                edge_tol=50,
                row_tol=2,
                column_tol=0
            )

            stream_count = 0
            for i, table in enumerate(stream_tables):
                if not table.df.empty and table.df.shape[0] > 0:
                    table_fingerprint = generate_table_fingerprint(table.df)

                    if table_fingerprint not in extracted_tables_fingerprints:
                        cleaned_df = clean_table_data_improved(table.df)

                        if is_valid_table_improved(cleaned_df):
                            table_txt = format_table_output_improved(cleaned_df, f"STREAM_{i}", table.parsing_report)
                            extra_chunks.append(table_txt)
                            extracted_tables_fingerprints.add(table_fingerprint)
                            stream_count += 1

            logging.info(f"Stream mode extracted {stream_count} valid tables")

        except Exception as e:
            logging.warning(f"Stream mode extraction failed: {e}")

        # Lattice mode extraction
        try:
            lattice_tables = camelot.read_pdf(
                path,
                pages="all",
                flavor="lattice",
                line_scale=15,
                line_tol=2,
                joint_tol=2
            )

            lattice_count = 0
            for i, table in enumerate(lattice_tables):
                if not table.df.empty and table.df.shape[0] > 0:
                    table_fingerprint = generate_table_fingerprint(table.df)

                    if table_fingerprint not in extracted_tables_fingerprints:
                        cleaned_df = clean_table_data_improved(table.df)

                        if is_valid_table_improved(cleaned_df):
                            table_txt = format_table_output_improved(cleaned_df, f"LATTICE_{i}", table.parsing_report)
                            extra_chunks.append(table_txt)
                            extracted_tables_fingerprints.add(table_fingerprint)
                            lattice_count += 1

            logging.info(f"Lattice mode extracted {lattice_count} additional unique tables")

        except Exception as e:
            logging.warning(f"Lattice mode extraction failed: {e}")

        total_extracted = len(extra_chunks)
        logging.info(f"Camelot extraction completed: {total_extracted} total unique tables extracted")
        return extra_chunks

    except Exception as e:
        logging.error(f"Camelot table extraction failed: {e}")
        return []

In [10]:
# ============ Text Chunking ============
def split_into_chunks(full_text: str, max_tokens: int) -> List[str]:
    """Split text into chunks based on token limit"""
    paragraphs = [p for p in full_text.split("\n") if p.strip()]
    chunks, current = [], []
    current_tokens = 0

    for paragraph in paragraphs:
        para_tokens = len(enc.encode(paragraph))

        if current_tokens + para_tokens > max_tokens and current:
            chunks.append("\n".join(current))
            current = [paragraph]
            current_tokens = para_tokens
        else:
            current.append(paragraph)
            current_tokens += para_tokens

    if current:
        chunks.append("\n".join(current))

    return chunks

In [11]:
# ============ System prompt words ============
UNIVERSAL_SYSTEM_PROMPT = textwrap.dedent("""
    You are a professional ESG data analyst specializing in extracting Key Performance Indicators (KPIs) from sustainability reports.

    ## CRITICAL: What is a KPI?
    A KPI MUST contain SPECIFIC NUMBERS, PERCENTAGES, or MEASURABLE QUANTITIES that demonstrate actual performance or concrete targets.

    ## IMPORTANT: Table Data Processing Rules
    When processing table data:
    1. Pay close attention to column headers to identify the correct time periods
    2. Match data values with their corresponding year columns
    3. If you see table format like "Metric, 2021, 2022" - the first number after metric belongs to 2021, second to 2022
    4. Look for table headers that indicate year columns (e.g., "2020", "2021", "2022")
    5. Extract each year's data as separate KPIs
    6. Avoid extracting the same KPI multiple times - consolidate similar metrics

    ## ENHANCED: Advanced Table Processing
    7. **EXTRACT ALL DATA POINTS**: For each table cell containing a number, create a separate KPI
    8. **REGIONAL/LOCATION DATA**: Pay special attention to location-specific data (countries, regions, cities)
    9. **WORKFORCE DATA**: Extract all employee numbers, headcount data, and demographic information
    10. **INCOMPLETE DATA**: Extract available data even if some cells are empty or missing
    11. **TOTALS AND SUBTOTALS**: Always extract total values and aggregated numbers

    ## ✅ VALID KPI EXAMPLES:
    - "Achieved 89.4% reuse and recycle rate for cloud hardware in 2023"
    - "Diverted over 18,537 metric tons of waste from landfills in 2023"
    - "Reduced single-use plastics in product packaging to 2.7%"
    - "Contracted 19 GW of new renewable energy across 16 countries in 2024"
    - "Provided clean water access to over 1.5 million people in 2023"
    - "Protected 15,849 acres of land—exceeding target by more than 30%"
    - "Allocated 761 million toward innovative climate technologies"
    - "Achieved 80% renewable energy operations by 2024"
    - "Water replenishment projects estimated to provide over 25 million cubic meters"
    - "Exceeded annual target to divert 75% of construction waste by reaching 85%"
    - "Board independence: 78% of directors"
    - "Women in senior leadership increased to 35% in 2023"
    - "Employee engagement score: 87% in annual survey"
    - "Reduced greenhouse gas emissions by 50% compared to 2019 baseline"
    - "Zero workplace fatalities achieved for third consecutive year"
    - "Training completion rate: 98% for mandatory compliance courses"
    - "Supplier ESG assessments completed for 95% of tier-1 suppliers"
    - "Customer satisfaction rating: 4.6 out of 5.0"
    - "Data breach incidents: 0 material breaches in 2023"

    ## ❌ NOT KPIs (DO NOT EXTRACT):
    - "Microsoft will require select suppliers to use carbon-free electricity by 2030"
    - "The company plans to expand Sustainability Manager capabilities"
    - "We are launching two new Circular Centers in 2023"
    - "The organization established a new climate innovation fund"
    - "Microsoft introduced enhanced data governance solutions"
    - "Updated guidebook to include guidance on corporate responsibility"
    - "Plans to publish new ESG strategy"
    - "Implemented a new recycling program"
    - "Conducted sustainability training sessions"
    - "Launched employee wellness programs"
    - "Committed to reducing emissions"
    - "Focusing on environmental performance"
    - "Established sustainability committee"
    - "The company operates facilities in multiple regions"
    - "Our supply chain includes thousands of vendors globally"
    - Any text without specific numbers, percentages, or quantifiable metrics
    - Duplicate or repeated metrics (extract only once per time period)
    - Any statement that describes business operations rather than performance outcomes

    ## KPI Categories:
    ### Environmental:
    - **Carbon_Climate**: GHG emissions, carbon footprint, emission reductions, climate targets, scope 1/2/3 emissions, carbon intensity, carbon offsets, TCFD alignment
    - **Energy**: Energy consumption, renewable energy percentage, energy efficiency, energy intensity, MWh, GWh, energy savings, fossil fuel usage
    - **Water**: Water withdrawal, water consumption, water intensity, water recycling, water reuse, water stress, water discharge quality
    - **Waste**: Waste generation, recycling rates, diversion percentages, hazardous waste, non-hazardous waste, zero waste to landfill, e-waste, incineration
    - **Biodiversity**: Protected areas, species conservation, habitat restoration, biodiversity impact assessments, land use, ecosystem restoration
    - **Circular_Economy**: Recycling rates, material recovery, circular design, raw materials usage, renewable materials, packaging waste
    - **Materials**: Raw materials consumption, recycled content, sustainable materials, material intensity, sustainable sourcing

    ### Social:
    - **Workforce_Diversity**: Employee demographics, gender diversity, age diversity, ethnic diversity, disability inclusion, LGBTQ+ inclusion, workforce composition
    - **Gender_Equality**: Women in leadership, gender pay ratio, parental leave return rates, gender representation, female employees percentage
    - **Disability_Inclusion**: Employees with disabilities, accessibility compliance, inclusive workplace design, disability support programs
    - **Health_Safety**: Lost Time Injury Frequency Rate (LTIFR), Total Recordable Incident Rate (TRIR), fatalities, workplace illness, safety training hours, PPE compliance, emergency drills
    - **Employee_Wellbeing**: Employee satisfaction, retention rates, turnover rates, training hours, wellness programs, mental health services, work-life balance
    - **Community_Engagement**: Corporate volunteering, social investment, community impact assessments, local hiring, stakeholder engagement activities
    - **Human_Rights**: Child labor incidents, forced labor, human rights due diligence, freedom of association, grievance mechanisms, labor audits
    - **Labor_Rights**: Collective bargaining coverage, labor complaints resolution, supplier labor audits, working conditions, fair wages
    - **Customer_Safety**: Product safety incidents, customer satisfaction, accessibility features, safety recalls, quality metrics
    - **Supply_Chain_Social**: Supplier assessments, sustainable sourcing, supplier code compliance, supply chain audits

    ### Governance:
    - **Board_Governance**: Board independence, board diversity, CEO-chair separation, board ESG expertise, board composition, director tenure
    - **Executive_Compensation**: ESG-linked compensation, executive pay ratios, compensation disclosure, incentive structures
    - **Ethics_Compliance**: Code of conduct training, corruption incidents, bribery cases, fines and penalties, whistleblower reports, anti-corruption assessments
    - **Transparency_Disclosure**: ESG reporting coverage, third-party assurance, political contributions disclosure, GRI/SASB/TCFD compliance
    - **Risk_Management**: Risk assessments, mitigation measures, climate risk disclosure, operational risk management
    - **Cybersecurity_Data**: Cybersecurity breaches, data privacy policies, cybersecurity training, GDPR compliance, data protection measures
    - **Supply_Chain_Governance**: Supplier ESG screening, supplier audits, procurement ESG clauses, vendor compliance rates

    ## MANDATORY Requirements:
    1. MUST contain specific numbers (e.g., 25%, 15,000, 2.5M, 8.5%, 0.3 per million hours)
    2. MUST relate to measurable sustainability outcomes
    3. MUST have time reference (year, period, or deadline)
    4. MUST be performance-focused (results, not activities or descriptions)
    5. MUST NOT be future plans or operational descriptions

    ## Output Format:
    Return a JSON array. Each KPI must contain:
    ```json
    {
        "kpi_text": "Complete original sentence with the quantifiable metric",
        "kpi_theme": "Environmental/Social/Governance",
        "kpi_category": "Specific category from above list",
        "quantitative_value": "The specific number/percentage extracted",
        "unit": "Unit of measurement (%, tonnes, employees, etc.)",
        "time_period": "Time reference (2023, annual, by 2030, etc.)",
        "target_or_actual": "Target/Actual/Both"
    }
    ```

    ## Additional Instructions:
    - If a sentence includes a comparison value, such as a baseline, previous year, or other historical/target data (e.g., "Compared to 32,395 MWh in 2020"), extract it as a **separate KPI**.
    - Do NOT store the comparison in any other field — just create another valid KPI from it.
    - Avoid merging multiple numerical values into one KPI unless they are clearly part of the same metric (e.g., male: X, female: Y).

    ## STRICT FILTERING:
    - Return empty array [] if no quantifiable KPIs found
    - Only extract text that contains specific measurable values
    - Ignore all qualitative statements, plans, and descriptions
    - Focus only on numerical performance data

    Now analyze the following text for sustainability KPIs:
""").strip()

# 🔥 新增：增强的图像分析Prompt
ENHANCED_IMAGE_KPI_SYSTEM_PROMPT = textwrap.dedent("""
    You are an expert data analyst specializing in extracting quantifiable KPI data from charts, graphs, and data visualizations in sustainability reports.

    ## CRITICAL INSTRUCTION: ALWAYS EXTRACT NUMERICAL VALUES

    **Your primary task is to extract the ACTUAL NUMBERS and PERCENTAGES visible in charts, not just descriptions.**

    ## MISSION:
    Extract ALL quantifiable data points from charts and graphs, including:
    - Bar charts (vertical/horizontal)
    - Pie charts and donut charts
    - Line charts and trend graphs
    - Stacked charts and combo charts
    - Tables with numerical data
    - Infographics with statistics
    - Gauge charts and dashboards

    ## DETAILED ANALYSIS INSTRUCTIONS:

    ### For PIE CHARTS:
    1. Read percentage labels on each slice
    2. If no labels visible, estimate based on slice size
    3. Identify what each slice represents (categories)
    4. Extract each slice as separate KPI
    5. **MUST read the percentage labels on each slice** - Look for numbers like 64%, 33%, 68%, 30%, etc.
    6. **If percentages are visible on the chart, extract them exactly**
    7. **If no labels visible, estimate based on slice size using these guidelines:**
       - 90° slice = 25%
       - 180° slice = 50%
       - 270° slice = 75%
       - Full circle = 100%
    8. **Each slice MUST have a specific percentage value in the final output**

    ### For BAR CHARTS:
    1. Read Y-axis scale carefully (units, increments)
    2. Estimate bar heights using grid lines and scale
    3. Read X-axis labels (years, categories, regions)
    4. Extract each bar as separate KPI
    5. Pay attention to grouped/stacked bars

    ### For LINE CHARTS:
    1. Read data points at intersection of grid lines
    2. Follow trend lines to extract values for each time period
    3. Use Y-axis scale for value estimation
    4. Extract each data point as separate KPI

    ### For TABLES:
    1. Read all numerical values in cells
    2. Match values with row and column headers
    3. Extract each cell with numerical data as KPI

    ## MANDATORY VALUE EXTRACTION RULES:

    **RULE 1**: Every KPI MUST contain a specific numerical value (percentage, amount, count, etc.)
    **RULE 2**: For charts with categories, you MUST find and extract the quantitative values for each category
    **RULE 3**: Never create KPIs without specific numbers - descriptions alone are incomplete
    **RULE 4**: Include complete context: what + how much + when/where if available


    ## VALUE ESTIMATION GUIDELINES:
    - Use proportional analysis: if a bar reaches 80% of scale maximum, calculate 80% of max value
    - For pie charts: estimate slice angles (90° = 25%, 180° = 50%, etc.)
    - Cross-reference with any visible data labels or legends
    - Be conservative but reasonably accurate in estimates

    ## CHART IDENTIFICATION:
    First identify the chart type, then apply appropriate extraction method.
    Look for:
    - Axes and scales
    - Data labels and legends
    - Grid lines for reference
    - Color coding and patterns
    - Title and subtitle information

    ## OUTPUT FORMAT:
    Return a JSON array. For each data point found:
    ```json
    {
        "kpi_text": "Complete description with the ACTUAL NUMERICAL VALUE included",
        "kpi_theme": "Environmental/Social/Governance",
        "kpi_category": "Specific category based on content",
        "quantitative_value": "The exact number/percentage (e.g., '64', '33.5', '68')",
        "unit": "% / tonnes / employees / MWh / USD / etc.",
        "time_period": "2021/2020/2022/Year/period/etc if identifiable",
        "target_or_actual": "Actual",
        "chart_type": "pie_chart/bar_chart/line_chart/table/etc",
        "estimation_confidence": "High/Medium/Low",
        "chart_title": "Chart title if visible",
        "data_source": "Legend or source if visible"
    }

    ```
    ## EXAMPLES of CORRECT vs INCORRECT extraction:

    ### ❌ INCORRECT (incomplete - missing numerical values):
    ```json
    {
        "kpi_text": "Energy consumption by facility type",
        "quantitative_value": "",
        "unit": "%"
    }
    ```

    ### ✅ CORRECT (complete with specific values):
    ```json
    {
        "kpi_text": "Office buildings account for 45% of total energy consumption",
        "quantitative_value": "45",
        "unit": "%"
    }
    ```

    ### ❌ INCORRECT (category without value):
    ```json
    {
        "kpi_text": "Renewable energy percentage by region",
        "quantitative_value": "",
        "unit": "%"
    }
    ```

    ### ✅ CORRECT (specific regional data):
    ```json
    {
        "kpi_text": "North America achieved 78% renewable energy usage",
        "quantitative_value": "78",
        "unit": "%"
    }
    ```
    ## QUALITY ASSURANCE CHECKLIST:
    Before returning results, verify:
    - ✅ Every KPI contains a specific numerical value
    - ✅ Chart categories are paired with their quantitative data
    - ✅ KPI descriptions are complete and self-explanatory
    - ✅ Units are correctly identified and specified
    - ✅ Context (time, location, category) is preserved when available
    - Each KPI must have a specific numerical value
    - Context must be clear and self-contained
    - Avoid extracting the same data point multiple times
    - Focus on sustainability/ESG metrics when possible

    ## VALUE ESTIMATION GUIDELINES:
    - **High confidence**: Numbers clearly visible in image
    - **Medium confidence**: Numbers estimated using chart scales/grid lines
    - **Low confidence**: Values approximated from proportional analysis
    - **If no numerical data is visible, return empty array []**

    ## IMPORTANT NOTES:
    - Extract ALL visible data points, not just main highlights
    - Include context in descriptions (e.g., "According to pie chart showing emission sources")
    - If values are not clearly visible, make reasonable estimates and mark confidence as "Low"
    - Return empty array [] ONLY if image contains no charts/graphs with quantifiable data
    - For multi-year data, create separate KPIs for each year
    - Pay special attention to small text and numbers
    - Focus on extracting actual performance data, not just identifying chart elements
    - If you can see numbers in the image, you MUST extract them
    - Pie chart percentages are usually the most important data points
    - Return empty array [] ONLY if no numerical data is visible

    Now analyze the provided image and extract ALL quantifiable KPI data points:
""").strip()

In [12]:
# ============ KPI Extraction Function ============
def extract_page_from_chunk(chunk: str) -> str:
    """Extract page information from chunk"""
    # Look for PAGE_X_TEXT: format
    page_matches = re.findall(r'PAGE_(\d+)_TEXT:', chunk)
    if page_matches:
        pages = [int(p) for p in page_matches]
        if len(pages) == 1:
            return str(pages[0])
        else:
            return f"{min(pages)}-{max(pages)}"

    # Look for TABLE_START_PAGE_X_
    table_matches = re.findall(r'TABLE_START_PAGE_(\d+)_', chunk)
    if table_matches:
        pages = [int(p) for p in table_matches]
        if len(pages) == 1:
            return str(pages[0])
        else:
            return f"{min(pages)}-{max(pages)}"

    return "Unknown"

def contains_procedural_language(text: str) -> bool:
    """Check if text contains procedural language"""
    procedural_words = [
        'introduced', 'established', 'set up', 'implemented', 'created',
        'launched', 'formed', 'built', 'installed', 'deployed',
        'additionally introduced', 'procedure for', 'standardization management'
    ]
    text_lower = text.lower()
    return any(word in text_lower for word in procedural_words)

def is_data_fragment(kpi_text: str) -> bool:
    """Check if text is a meaningless data fragment"""
    text = kpi_text.strip()

    # Filter pure numbers or simple percentages without context
    if re.match(r'^\d+\.?\d*%?$', text):
        return True

    # Filter very short text (less than 4 meaningful words)
    meaningful_words = [word for word in text.split() if len(word) > 2 and not word.isdigit()]
    if len(meaningful_words) < 3:
        return True

    # Filter text with only numbers and common connecting words
    words = text.lower().split()
    non_functional_words = [word for word in words if word not in ['in', 'of', 'the', 'and', 'or', 'to', 'for', 'with', 'by']]
    if len(non_functional_words) < 3:
        return True

    return False

def standardize_kpi_universal(kpi_item: Dict) -> Dict:
    """Universal KPI data standardization"""
    standardized = kpi_item.copy()

    # Standardize numerical formats
    quantitative_value = str(standardized.get('quantitative_value', '')).strip()
    kpi_text = standardized.get('kpi_text', '').lower()

    # Smart handling of percentage formats
    if quantitative_value and quantitative_value.replace('.', '').replace('-', '').replace(',', '').isdigit():
        # Check if original text suggests this is a percentage
        percentage_indicators = ['percent', 'percentage', '%', 'rate', 'ratio', 'proportion', 'share']
        if any(indicator in kpi_text for indicator in percentage_indicators):
            if not quantitative_value.endswith('%'):
                standardized['quantitative_value'] = quantitative_value + '%'
                if not standardized.get('unit'):
                    standardized['unit'] = '%'

    # Ensure unit field consistency
    if '%' in str(standardized.get('quantitative_value', '')):
        standardized['unit'] = '%'

    # Clean and normalize KPI text
    kpi_text_original = standardized.get('kpi_text', '').strip()
    # Remove extra spaces and newlines
    kpi_text_cleaned = ' '.join(kpi_text_original.split())
    standardized['kpi_text'] = kpi_text_cleaned

    return standardized

def generate_universal_metric_key(kpi_item: Dict) -> str:
    """Generate universal metric key for deduplication"""
    try:
        # Extract core elements
        category = kpi_item.get('kpi_category', '').lower().strip()
        value = str(kpi_item.get('quantitative_value', '')).replace('%', '').replace(',', '').strip()
        time_period = kpi_item.get('time_period', '').lower().strip()
        unit = kpi_item.get('unit', '').lower().strip()

        # Extract key semantic information from KPI text
        kpi_text = kpi_item.get('kpi_text', '').lower()

        # Extract primary number (for more precise matching)
        numbers_in_text = re.findall(r'\d+\.?\d*', kpi_text)
        primary_number = numbers_in_text[0] if numbers_in_text else value

        # Generate semantic signature: extract keywords from text
        # Remove common stop words
        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should'}

        # Extract keywords (length>2 and not stop words)
        words = re.findall(r'\b\w+\b', kpi_text)
        key_words = [word for word in words if len(word) > 2 and word not in stop_words and not word.isdigit()]

        # Sort keywords to ensure consistency
        key_words = sorted(set(key_words))[:5]  # Take at most 5 keywords
        semantic_signature = '_'.join(key_words)

        # Build universal metric key
        key_components = []

        if category:
            key_components.append(f"cat:{category}")
        if primary_number:
            key_components.append(f"val:{primary_number}")
        if time_period:
            key_components.append(f"time:{time_period}")
        if unit:
            key_components.append(f"unit:{unit}")
        if semantic_signature:
            key_components.append(f"sem:{semantic_signature}")

        # Generate final key
        metric_key = "|".join(key_components)

        # If all components are empty, use text hash
        if not metric_key:
            metric_key = f"hash:{hash(kpi_text)}"

        return metric_key

    except Exception as e:
        logging.warning(f"Error generating universal metric key: {e}")
        # Fallback to text hash
        return f"fallback:{hash(kpi_item.get('kpi_text', ''))}"

def extract_kpi_from_chunk_universal(chunk: str) -> List[Dict]:
    """Universal KPI extraction function for various sustainability reports"""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": UNIVERSAL_SYSTEM_PROMPT},
                {"role": "user", "content": f"""Extract ALL KPIs from this text. Requirements:

1. Create COMPLETE, MEANINGFUL KPI descriptions with full context
2. DO NOT extract standalone numbers without explanatory text
3. Include all relevant context (time, location, metric type, etc.)
4. Use consistent formatting for similar metrics
5. Ensure each KPI is self-explanatory

Text to analyze:
{chunk}"""}
            ],
            temperature=0.0,
            max_tokens=4000,
            timeout=60
        )

        content = response.choices[0].message.content.strip()

        # Clean potential markdown formatting
        if content.startswith('```json'):
            content = content[7:]
        if content.endswith('```'):
            content = content[:-3]

        if not content.strip().startswith("["):
            logging.warning(f"API response not JSON list: {content[:100]}...")
            return []

        result = json.loads(content)

        if not isinstance(result, list):
            logging.warning("API response is not a list format")
            return []

        # Extract page information
        page_number = extract_page_from_chunk(chunk)

        # Universal validation and deduplication logic
        validated_result = []
        seen_metrics = set()

        for item in result:
            if isinstance(item, dict) and 'kpi_text' in item and 'kpi_theme' in item:
                if item['kpi_text'].strip() and item['kpi_theme'].strip():

                    # Check procedural language
                    if contains_procedural_language(item['kpi_text']):
                        logging.debug(f"Procedural statement filtered: {item['kpi_text'][:50]}...")
                        continue

                    # Filter meaningless data fragments
                    if is_data_fragment(item['kpi_text']):
                        logging.debug(f"Data fragment filtered: {item['kpi_text']}")
                        continue

                    # Standardize KPI data
                    standardized_item = standardize_kpi_universal(item)

                    # Add page information
                    standardized_item['source_page'] = page_number
                    standardized_item['source_type'] = 'text'

                    # Universal deduplication mechanism
                    metric_key = generate_universal_metric_key(standardized_item)

                    if metric_key not in seen_metrics:
                        validated_result.append(standardized_item)
                        seen_metrics.add(metric_key)
                        logging.debug(f"KPI extracted: {standardized_item['kpi_text'][:80]}...")
                    else:
                        logging.debug(f"Duplicate metric filtered: {standardized_item['kpi_text'][:50]}...")

        logging.info(f"Chunk processed: {len(validated_result)} unique KPIs extracted")
        return validated_result

    except json.JSONDecodeError as e:
        logging.warning(f"JSON parsing failed: {e}\nContent: {content[:300]}...")
        return []
    except Exception as e:
        logging.error(f"API call failed: {e}")
        return []

def post_process_kpis_universal(kpis: List[Dict]) -> List[Dict]:
    """Universal KPI post-processing for various report types"""
    if not kpis:
        return kpis

    # Step 1: Deduplication based on metric keys
    unique_kpis_dict = {}

    for kpi in kpis:
        metric_key = generate_universal_metric_key(kpi)

        if metric_key not in unique_kpis_dict:
            unique_kpis_dict[metric_key] = kpi
        else:
            # If duplicate, keep the more complete KPI description
            existing_kpi = unique_kpis_dict[metric_key]
            current_kpi = kpi

            # Compare KPI text completeness
            if len(current_kpi.get('kpi_text', '')) > len(existing_kpi.get('kpi_text', '')):
                unique_kpis_dict[metric_key] = current_kpi
                logging.debug(f"Replaced with more complete KPI: {current_kpi.get('kpi_text', '')[:50]}...")
            else:
                logging.debug(f"Kept existing KPI: {existing_kpi.get('kpi_text', '')[:50]}...")

    # Step 2: Text similarity-based secondary deduplication
    final_kpis = list(unique_kpis_dict.values())

    # Use text similarity to check remaining potential duplicates
    final_unique_kpis = []

    for current_kpi in final_kpis:
        is_duplicate = False
        current_text = current_kpi.get('kpi_text', '')

        for existing_kpi in final_unique_kpis:
            existing_text = existing_kpi.get('kpi_text', '')

            # Calculate text similarity
            similarity = calculate_text_similarity(current_text, existing_text)

            # If similarity is very high, consider it duplicate
            if similarity > 0.8:
                is_duplicate = True
                logging.debug(f"Text similarity duplicate filtered: {current_text[:50]}...")
                break

        if not is_duplicate:
            final_unique_kpis.append(current_kpi)

    logging.info(f"Universal post-processing: {len(final_unique_kpis)}/{len(kpis)} KPIs retained")
    return final_unique_kpis

def calculate_text_similarity(text1: str, text2: str) -> float:
    """Calculate similarity between two texts"""
    # Normalize texts
    norm1 = ' '.join(text1.lower().split())
    norm2 = ' '.join(text2.lower().split())

    # Word sets
    words1 = set(norm1.split())
    words2 = set(norm2.split())

    if len(words1) == 0 or len(words2) == 0:
        return 0.0

    # Calculate intersection and union
    intersection = len(words1.intersection(words2))
    union = len(words1.union(words2))

    return intersection / union if union > 0 else 0.0

def validate_kpi_quality(kpis: List[Dict]) -> List[Dict]:
    """Additional quality validation for extracted KPIs with relaxed filtering"""
    if not ENABLE_QUALITY_VALIDATION:
        return kpis

    quality_kpis = []

    for kpi in kpis:
        kpi_text = kpi.get('kpi_text', '').lower()

        # Exclude "planned tone" KPIs (not actual performance)
        is_future_statement = any(word in kpi_text for word in [
            'will', 'aim to', 'plan to', 'planning to', 'intend to',
            'is expected to', 'is scheduled to', 'expects to', 'expected to',
            'targeting', 'propose to', 'going to', 'shall', 'to be installed'
        ])
        if is_future_statement:
            logging.debug(f"KPI rejected (future plan): {kpi_text[:100]}...")
            continue

        # Filter procedural language
        if contains_procedural_language(kpi_text):
            logging.debug(f"KPI rejected (procedural language): {kpi_text[:100]}...")
            continue

        # Filter for phrases like "place name + percentage" (not ESG KPIs, but distribution descriptions)
        geo_percent_pattern = re.compile(r"^[a-z\s,:%-]+(?:\s)?\d{1,3}%$")
        if geo_percent_pattern.match(kpi_text.strip()) and len(kpi_text.strip().split()) <= 6:
            logging.debug(f"KPI rejected (geo+percent short form): {kpi_text}")
            continue

        # Verb whitelist: must include action verbs
        allowed_kpi_verbs = [
            'reduce', 'reduced', 'achieve', 'achieved', 'improve', 'improved',
            'diverted', 'trained', 'invested', 'decreased', 'increased',
            'consumed', 'emitted', 'saved', 'reached', 'attained', 'completed',
            'recorded', 'cut', 'lowered', 'targeted', 'complied', 'avoided',
            'used', 'recycled', 'sourced', 'returned', 'measured', 'maintained',
            'reported', 'accounted', 'utilized', 'were', 'was'  # Add state verbs
        ]
        if not any(verb in kpi_text for verb in allowed_kpi_verbs):
            logging.debug(f"KPI rejected (no action verb): {kpi_text[:100]}...")
            continue

        # Greylist verbs (action words but not necessarily performance words) - remove problematic words
        graylist_verbs = [
            'launched',  # Keep some potentially useful words, but remove obvious procedural words
            'formed', 'opened', 'started'
        ]

        contains_graylist = any(verb in kpi_text for verb in graylist_verbs)

        # Check for quantitative indicators
        has_numbers = any(char.isdigit() for char in kpi_text)
        has_percentage = '%' in kpi_text

        # Extended units and measurement indicators
        has_units = any(unit in kpi_text for unit in [
            'tonnes', 'tons', 'kg', 'mwh', 'kwh', 'gwh', 'litres', 'liters', 'gallons',
            'employees', 'hours', 'million', 'billion', 'thousand', 'm³', 'co2e', 'tco2e',
            'dollars', 'usd', 'eur', 'gbp', 'incidents', 'rate', 'ratio', 'intensity',
            'frequency', 'recordable', 'fatalities', 'injuries', 'directors', 'board',
            'workforce', 'leadership', 'diversity', 'inclusion', 'satisfaction', 'retention',
            'turnover', 'training', 'safety', 'ltifr', 'trir', 'compliance', 'audit',
            'assessment', 'screening', 'supplier', 'breach', 'violation', 'disclosure',
            'assurance', 'coverage', 'participation', 'completion', 'investment',
            'volunteering', 'engagement', 'grievance', 'whistleblower', 'compensation',
            'people', 'staff', 'workers', 'positions', 'roles', 'headcount', 'fte',
            'performance', 'score', 'index', 'metric', 'level', 'amount', 'value',
            'average', 'median', 'total', 'sum', 'count', 'number', 'quantity'
        ])

        # More flexible time reference detection
        has_time_ref = any(time_word in kpi_text for time_word in [
            '2019', '2020', '2021', '2022', '2023', '2024', '2025', '2026', '2027', '2028', '2029', '2030',
            '2031', '2032', '2033', '2034', '2035', '2040', '2045', '2050',
            'annual', 'yearly', 'year', 'quarter', 'month', 'by', 'target', 'baseline', 'fy',
            'per year', 'per annum', 'quarterly', 'monthly', 'daily', 'future', 'deadline',
            'period', 'reporting', 'current', 'previous', 'next', 'last', 'this'
        ])

        # Enhanced sustainability context detection
        has_sustainability_context = any(sus_word in kpi_text for sus_word in [
            # Environmental keywords
            'emission', 'carbon', 'energy', 'renewable', 'waste', 'water', 'recycl',
            'environmental', 'ghg', 'scope', 'climate', 'biodiversity', 'circular',
            'materials', 'intensity', 'consumption', 'efficiency', 'footprint',
            'sustainable', 'sustainability', 'green', 'clean', 'eco', 'offset',
            'tcfd', 'nature', 'habitat', 'ecosystem', 'pollution', 'discharge',
            'electricity', 'gas', 'fuel', 'solar', 'wind', 'hydro', 'nuclear',

            # Social keywords
            'safety', 'training', 'employee', 'diversity', 'community', 'social',
            'workforce', 'gender', 'women', 'female', 'male', 'disability', 'disabled',
            'inclusion', 'equity', 'equality', 'lgbtq', 'minorities', 'ethnic',
            'health', 'wellbeing', 'wellness', 'satisfaction', 'retention', 'turnover',
            'injury', 'incident', 'fatality', 'ltifr', 'trir', 'recordable',
            'human rights', 'labor', 'child labor', 'forced labor', 'slavery',
            'freedom', 'association', 'collective bargaining', 'grievance',
            'volunteering', 'investment', 'hiring', 'local', 'stakeholder',
            'customer', 'supplier', 'supply chain', 'accessibility', 'parental',
            'mental health', 'ppe', 'emergency', 'drill', 'compliance',
            'people', 'staff', 'workers', 'employment', 'job', 'career',
            'leadership', 'management', 'senior', 'executive', 'promotion',

            # Governance keywords
            'governance', 'board', 'director', 'independent', 'chair', 'ceo',
            'executive', 'compensation', 'pay', 'ethics', 'compliance', 'corruption',
            'bribery', 'code of conduct', 'whistleblower', 'transparency',
            'disclosure', 'reporting', 'assurance', 'audit', 'risk', 'management',
            'cybersecurity', 'data', 'privacy', 'gdpr', 'breach', 'policy',
            'screening', 'assessment', 'due diligence', 'political', 'contribution',
            'gri', 'sasb', 'oversight', 'expertise', 'separation', 'incentive',
            'fine', 'penalty', 'violation', 'resolution', 'anti-corruption',

            # General business performance that could be sustainability-related
            'performance', 'quality', 'delivery', 'customer', 'service', 'product',
            'operation', 'facility', 'site', 'location', 'region', 'business'
        ])

        # If it is a greylist verb sentence, but there is no performance content such as numbers, units, time, etc. → delete
        if contains_graylist and not (has_numbers or has_units or has_percentage or has_time_ref or has_sustainability_context):
            logging.debug(f"KPI rejected (graylist verb, no quantitative data): {kpi_text[:100]}...")
            continue

        # More lenient quality scoring - only require numbers and either units/percentage OR time reference OR sustainability context
        basic_requirements = has_numbers and (has_percentage or has_units or has_time_ref or has_sustainability_context)

        # Additional check for obvious ESG relevance
        is_esg_relevant = any(esg_word in kpi_text for esg_word in [
            'emission', 'carbon', 'energy', 'waste', 'water', 'renewable', 'employee',
            'safety', 'training', 'diversity', 'governance', 'board', 'compliance',
            'sustainability', 'environmental', 'social', 'ghg', 'co2', 'workforce',
            'gender', 'health', 'injury', 'incident', 'ethics', 'transparency'
        ])

        if basic_requirements or is_esg_relevant:
            quality_kpis.append(kpi)
            logging.debug(f"KPI accepted: {kpi_text[:100]}...")
        else:
            logging.debug(f"KPI filtered out for quality: {kpi_text[:100]}...")

    logging.info(f"Quality validation: {len(quality_kpis)}/{len(kpis)} KPIs passed")
    return quality_kpis

In [13]:
# ============ Image processing functions ============
def extract_numeric_spans(page):
    text_dict = page.get_text("dict")
    nums = []
    for block in text_dict["blocks"]:
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                s = span["text"].strip()
                if re.match(r"[\d,.]+%?$", s):          # Pure number or number + %
                    nums.append({
                        "text": s,
                        "bbox": span["bbox"],           # (x0,y0,x1,y1)
                        "font": span["size"]
                    })
    return nums

def extract_images_from_pdf_fixed(pdf_path: str) -> List[Dict]:
    """Extract images from PDF using PyMuPDF"""
    images = []

    try:
        pdf_document = fitz.open(pdf_path)

        for page_num in range(len(pdf_document)):
            page = pdf_document[page_num]
            image_list = page.get_images()

            # 🔥 New: Extract page screenshots as an alternative
            page_pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # high resolution
            page_img = Image.frombytes("RGB", [page_pix.width, page_pix.height], page_pix.samples)

            # Add full page screenshot
            images.append({
                'image': page_img,
                'page_number': page_num + 1,
                'width': page_img.width,
                'height': page_img.height,
                'image_index': 'full_page',
                'type': 'full_page'
            })


            for img_index, img in enumerate(image_list):
                try:
                    xref = img[0]
                    base_image = pdf_document.extract_image(xref)
                    image_bytes = base_image["image"]

                    image = Image.open(BytesIO(image_bytes))

                    # Convert to RGB if needed
                    if image.mode in ['RGBA', 'LA']:
                        background = Image.new('RGB', image.size, (255, 255, 255))
                        if image.mode == 'RGBA':
                            background.paste(image, mask=image.split()[-1])
                        else:
                            background.paste(image)
                        image = background
                    elif image.mode != 'RGB':
                        image = image.convert('RGB')

                    # Filter small images
                    if image.width >= 50 and image.height >= 50:
                        images.append({
                            'image': image,
                            'page_number': page_num + 1,
                            'width': image.width,
                            'height': image.height,
                            'image_index': img_index,
                            'type': 'extracted'  # 🔥 Added type identifier
                        })

                except Exception as e:
                    logging.warning(f"Error extracting image {img_index} from page {page_num + 1}: {e}")
                    continue

        pdf_document.close()
        logging.info(f"Extracted {len(images)} images from PDF")
        return images

    except Exception as e:
        logging.error(f"Error extracting images from PDF: {e}")
        return []

def image_to_base64_fixed(image: Image.Image) -> str:
    """Convert image to base64 with error handling"""
    try:
        if image.mode not in ['RGB', 'L']:
            image = image.convert('RGB')

        # Resize large images
        max_size = (1536, 1536)
        if image.width > max_size[0] or image.height > max_size[1]:
            # Calculate scaling to maintain aspect ratio
            ratio = min(max_size[0]/image.width, max_size[1]/image.height)
            new_size = (int(image.width * ratio), int(image.height * ratio))
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        buffered = BytesIO()
        image.save(buffered, format="JPEG", quality=95)
        img_str = base64.b64encode(buffered.getvalue()).decode()

        return img_str

    except Exception as e:
        logging.error(f"Error converting image to base64: {e}")
        return ""

In [14]:
# ------------------------------------------------------------
# Multi-crop / multi-resolution generator (supports crop parameter 0)
# ------------------------------------------------------------
from itertools import product

def generate_image_variants(img: Image.Image,
                            max_side_full: int = 1200,
                            crop_size: int = 768,
                            stride: int = 512) -> List[Tuple[Image.Image, str]]:
    """
    Returns [(variant_image, variant_tag), ...]
    variant_tag value: original / resized / crop_{row}_{col}
    """
    variants = []

    # 0) Original image
    variants.append((img, "original"))

    # 1) Zoom (if the original image is too large)
    w, h = img.size
    if max(w, h) > max_side_full:
        scale = max_side_full / float(max(w, h))
        resized = img.resize((int(w * scale), int(h * scale)), Image.Resampling.LANCZOS)
        variants.append((resized, "resized"))
    else:
        resized = img  # Keep the original image without scaling
        variants.append((resized, "resized"))  # Unified plus resized version

    # 2) Sliding window cropping (skipped when cropping size or step size is 0)
    if crop_size > 0 and stride > 0:
        base_img = variants[-1][0]
        bw, bh = base_img.size
        if bw > crop_size or bh > crop_size:
            xs = list(range(0, max(bw - crop_size, 1), stride)) + [bw - crop_size]
            ys = list(range(0, max(bh - crop_size, 1), stride)) + [bh - crop_size]
            for r, c in product(range(len(ys)), range(len(xs))):
                x, y = xs[c], ys[r]
                crop = base_img.crop((x, y, x + crop_size, y + crop_size))
                # Filter solid color areas
                if np.array(crop.convert('L')).std() < 5:
                    continue
                variants.append((crop, f"crop_{r}_{c}"))

    return variants

In [15]:
# ---------------------------------------------
# 📊 A chart recognition function that replaces plotclassifier (Hugging Face model)
# ---------------------------------------------
from transformers import AutoImageProcessor, AutoModelForImageClassification
import torch
# 🔧 Fix: Chart recognition with CLIP model
def setup_chart_classifier():
    """Setting up the chart classifier"""
    try:
        from transformers import CLIPProcessor, CLIPModel

        # Loading CLIP Model
        model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

        def is_chart_image_clip(image: Image.Image) -> bool:
            """Use CLIP to determine whether it is a chart"""
            try:
                # Defines text description related to the chart
                chart_labels = [
                    "a chart", "a graph", "a bar chart", "a pie chart",
                    "a line graph", "a table", "data visualization",
                    "statistics", "a diagram", "an infographic"
                ]

                # Processing Input
                inputs = processor(
                    text=chart_labels,
                    images=image,
                    return_tensors="pt",
                    padding=True
                )

                # Get prediction results
                outputs = model(**inputs)
                logits_per_image = outputs.logits_per_image
                probs = logits_per_image.softmax(dim=1)

                # If the probability of any chart label is greater than 0.25, it is considered to be a chart
                max_prob = probs.max().item()
                is_chart = max_prob > 0.25

                logging.debug(f"CLIP chart recognition: maximum probability = {max_prob:.3f}, result = {is_chart}")
                return is_chart

            except Exception as e:
                logging.warning(f"CLIP chart recognition failed: {e}")
                # Downgrade to statistical methods
                gray = image.convert('L')
                return np.array(gray).std() > 15

        logging.info("✅ Graph recognition using CLIP model")
        return is_chart_image_clip

    except ImportError:
        logging.warning("CLIP model is not available, use statistical methods")
        def is_chart_image_stats(image: Image.Image) -> bool:
            """Statistical method to determine whether it is a chart"""
            try:
                gray = image.convert('L')
                std_dev = np.array(gray).std()
                return std_dev > 15
            except:
                return True

        return is_chart_image_stats
    except Exception as e:
        logging.error(f"Failed to set chart classifier: {e}")
        def is_chart_image_fallback(image: Image.Image) -> bool:
            return True  # Conservative Strategy: When in Doubt, Analyze
        return is_chart_image_fallback

# Initialize the graph classifier
is_chart_image = setup_chart_classifier()


def extract_kpi_from_image_fixed(image: Image.Image, page_number: int, image_type: str = 'extracted') -> List[Dict]:
    """Extract KPIs from image with improved error handling"""
    try:
        # 🔥 New: Pre-filter: Check if it might be a chart
        if not is_chart_image(image):
            logging.debug(f"Image on page {page_number} filtered out (not likely a chart)")
            return []

        base64_image = image_to_base64_fixed(image)
        if not base64_image:
            return []

        # 🔥 Change: Use enhanced prompt
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {
                    "role": "system",
                    "content": ENHANCED_IMAGE_KPI_SYSTEM_PROMPT  # 🔥 Using the new prompt
                },
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            # 🔥 New: Detailed user instructions
                            "text": """Analyze this image carefully for quantifiable performance data.

IMPORTANT ANALYSIS PRINCIPLES:

1. **Chart Type Recognition**:
   - Stacked charts: Multiple colors/patterns layered in same position
   - Grouped charts: Multiple elements side by side at same position
   - Simple charts: One data point per position

2. **Value Extraction Rules**:
   - For STACKED charts: Read each layer separately, NOT the total height
   - For GROUPED charts: Read each element individually
   - For SIMPLE charts: Read data point values directly

3. **Data Relevance Filter**:
   ✅ EXTRACT: Performance outcomes, efficiency metrics, reduction rates, satisfaction scores, compliance rates
   ❌ SKIP: Certification counts, project timelines, implementation schedules, organizational charts, process flows

4. **Quality Standards**:
   - Only extract clear, quantifiable performance indicators
   - Each data point must have complete context
   - If uncertain about values, don't estimate
   - If chart shows mainly operational/administrative data, return empty array

Please analyze this chart step by step:
- First identify the chart type
- Then determine if it contains performance KPIs
- Finally extract all relevant performance data points

Focus on measurable outcomes and achievements, not counts or processes."""
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": "high"
                            }
                        }
                    ]
                }
            ],
            temperature=0.1,
            max_tokens=4000,
            timeout=60
        )

        content = response.choices[0].message.content.strip()

        if not content:
            return []

        # Clean formatting
        if content.startswith('```json'):
            content = content[7:]
        if content.endswith('```'):
            content = content[:-3]

        content = content.strip()

        if not content.startswith("["):
            logging.warning(f"Image analysis response not JSON list: {content[:100]}...")
            return []

        try:
            result = json.loads(content)
        except json.JSONDecodeError as e:
            logging.warning(f"JSON parsing failed for image analysis: {e}")
            return []

        if not isinstance(result, list):
            return []

        # Process results
        processed_result = []
        for item in result:
            if isinstance(item, dict) and 'kpi_text' in item:
                if not item.get('kpi_text', '').strip():
                    continue

                item['source_page'] = page_number
                item['source_type'] = 'image'
                item['image_type'] = image_type  # 🔥 新增字段

                # 🔥 更改：确保有chart标识
                kpi_text = item['kpi_text']
                if not any(marker in kpi_text.lower() for marker in ['chart', 'graph', 'table', 'figure']):
                    chart_type = item.get('chart_type', 'chart')
                    item['kpi_text'] = f"[{chart_type.title()}] {kpi_text}"

                processed_result.append(item)

        if processed_result:
            logging.info(f"✅ Extracted {len(processed_result)} KPIs from {image_type} on page {page_number}")
        else:
            logging.debug(f"❌ No KPIs found in {image_type} on page {page_number}")

        return processed_result

    except Exception as e:
        logging.error(f"Error extracting KPIs from image: {e}")
        return []


def process_pdf_images_for_kpis_fixed(pdf_path: str) -> List[Dict]:
    """
    Traverse each page of the PDF:
        • Perform multiple cropping + Vision on all ‘extracted’ images on the page
        • If the page has not captured the KPI, perform Vision on the entire page screenshot
    """
    logging.info("Starting page-by-page image KPI extraction …")

    images = extract_images_from_pdf_fixed(pdf_path)
    if not images:
        return []

    # Aggregate images by page
    page_dict = {}
    for info in images:
        pg = info["page_number"]
        page_dict.setdefault(pg, {"extracted": [], "full": None})
        if info["type"] == "extracted":
            page_dict[pg]["extracted"].append(info["image"])
        else:                    # full_page
            page_dict[pg]["full"] = info["image"]

    all_image_kpis: List[Dict] = []

    # —— Page by page processing ——
    for pg in sorted(page_dict.keys()):
        logging.info(f"\n=== Page {pg} ===")
        page_kpis: List[Dict] = []

        # ① Individually extracted images
        for idx, img in enumerate(page_dict[pg]["extracted"]):
            for var_img, var_tag in generate_image_variants(img, 1200, 768, 512):
                kpis = extract_kpi_from_image_fixed(
                    var_img, pg, f"extracted_{var_tag}"
                )
                for k in kpis:
                    key = generate_universal_metric_key(k)
                    if key not in {generate_universal_metric_key(x) for x in page_kpis}:
                        page_kpis.append(k)
                time.sleep(0.8)

        # ② If it is still empty, analyze the entire page again
        if not page_kpis and page_dict[pg]["full"] is not None:
            for var_img, var_tag in generate_image_variants(
                    page_dict[pg]["full"], 1200, 0, 0):   # 只做 original/resized
                kpis = extract_kpi_from_image_fixed(
                    var_img, pg, f"full_{var_tag}"
                )
                for k in kpis:
                    key = generate_universal_metric_key(k)
                    if key not in {generate_universal_metric_key(x) for x in page_kpis}:
                        page_kpis.append(k)
                time.sleep(1.0)

        logging.info(f"  → Page {pg} KPI count: {len(page_kpis)}")
        all_image_kpis.extend(page_kpis)

    logging.info(f"Image KPI extraction finished: {len(all_image_kpis)} KPIs from {len(page_dict)} pages")
    return all_image_kpis

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [16]:
# 1. Conservative image filter
def conservative_image_filter(image: Image.Image) -> Tuple[bool, str]:
    """Conservative image filtering - only filters obviously useless images"""
    try:
        # Only filter very small images (maybe logos, icons)
        if image.width < 40 or image.height < 40:
            return False, "too_small_icon"

        # Filter only images of almost pure colors (decorative elements)
        gray = np.array(image.convert('L'))
        std_dev = gray.std()

        # Very conservative threshold - only images with completely pure colors are filtered
        if std_dev < 3:
            return False, "pure_color"

        # Check if it is a pure white background (blank area)
        mean_val = gray.mean()
        if mean_val > 250 and std_dev < 8:
            return False, "blank_white"

        # Default: Process all other images to ensure integrity
        return True, "keep_for_analysis"
    except Exception:
        return True, "filter_error_keep"

# 2. Cache mechanism
class FastKPICache:
    def __init__(self, cache_dir: str = "fast_kpi_cache"):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)
        self.hit_count = 0
        self.miss_count = 0

    def get_image_hash(self, image: Image.Image) -> str:
        """Fast image fingerprint generation"""
        width, height = image.size
        if width > 100 and height > 100:
            center_crop = image.crop((
                width//4, height//4,
                3*width//4, 3*height//4
            )).resize((32, 32))
            img_bytes = BytesIO()
            center_crop.save(img_bytes, format='JPEG', quality=50)
            sample_hash = hashlib.md5(img_bytes.getvalue()).hexdigest()[:16]
        else:
            sample_hash = hashlib.md5(str(width * height).encode()).hexdigest()[:16]

        return f"{width}x{height}_{sample_hash}"

    def get_cached_kpis(self, image_hash: str) -> Optional[List[Dict]]:
        cache_file = os.path.join(self.cache_dir, f"{image_hash}.pkl")
        if os.path.exists(cache_file):
            try:
                with open(cache_file, 'rb') as f:
                    self.hit_count += 1
                    return pickle.load(f)
            except:
                pass
        self.miss_count += 1
        return None

    def cache_kpis(self, image_hash: str, kpis: List[Dict]):
        cache_file = os.path.join(self.cache_dir, f"{image_hash}.pkl")
        try:
            with open(cache_file, 'wb') as f:
                pickle.dump(kpis, f)
        except:
            pass

    def get_stats(self):
        total = self.hit_count + self.miss_count
        hit_rate = self.hit_count / total if total > 0 else 0
        return f"Cache: {self.hit_count} hits, {self.miss_count} misses (hit rate: {hit_rate:.1%})"

# Initialize the cache
fast_cache = FastKPICache()

# 3. Optimized API calls
COMPREHENSIVE_EXTRACTION_PROMPT = """
You are an expert data analyst. Extract ALL quantifiable performance indicators from this image.

CRITICAL REQUIREMENTS:
1. Extract EVERY visible number, percentage, and metric
2. Include ALL data points from charts, graphs, and tables
3. Do not skip any quantifiable information

Return complete JSON array:
[
  {
    "kpi_text": "Complete contextual description with the specific number",
    "quantitative_value": "exact number only",
    "unit": "unit of measurement",
    "kpi_theme": "Environmental/Social/Governance",
    "kpi_category": "specific category",
    "time_period": "year/period if visible"
  }
]

COMPLETENESS IS CRITICAL - Extract everything quantifiable.
"""

def extract_kpi_optimized(image: Image.Image, page_number: int) -> List[Dict]:
    """Optimized KPI extraction"""
    try:
        # Check the cache
        image_hash = fast_cache.get_image_hash(image)
        cached_kpis = fast_cache.get_cached_kpis(image_hash)
        if cached_kpis is not None:
            for kpi in cached_kpis:
                kpi['source_page'] = page_number
            return cached_kpis

        # Optimizing image encoding
        base64_image = image_to_base64_optimized(image)
        if not base64_image:
            return []

        # API Calls
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "text", "text": COMPREHENSIVE_EXTRACTION_PROMPT},
                    {"type": "image_url", "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "high"
                    }}
                ]
            }],
            temperature=0.0,
            max_tokens=2000,
            timeout=60
        )

        # Parsing results
        kpis = parse_optimized_response(response, page_number)

        # Caching results
        fast_cache.cache_kpis(image_hash, kpis)

        return kpis

    except Exception as e:
        logging.warning(f"Optimized KPI extraction failed for page {page_number}: {e}")
        return []

def image_to_base64_optimized(image: Image.Image) -> str:
    """Optimized image encoding"""
    try:
        max_dimension = 1400  # Maintain high quality
        width, height = image.size

        if max(width, height) > max_dimension:
            scale = max_dimension / max(width, height)
            new_size = (int(width * scale), int(height * scale))
            image = image.resize(new_size, Image.Resampling.LANCZOS)

        if image.mode != 'RGB':
            if image.mode in ['RGBA', 'LA']:
                background = Image.new('RGB', image.size, (255, 255, 255))
                if image.mode == 'RGBA':
                    background.paste(image, mask=image.split()[-1])
                else:
                    background.paste(image)
                image = background
            else:
                image = image.convert('RGB')

        buffered = BytesIO()
        image.save(buffered, format="JPEG", quality=92, optimize=True)
        return base64.b64encode(buffered.getvalue()).decode()

    except Exception as e:
        logging.error(f"Optimized image encoding failed: {e}")
        return ""

def parse_optimized_response(response, page_number: int) -> List[Dict]:
    """Optimized response parsing"""
    try:
        content = response.choices[0].message.content.strip()

        if content.startswith('```json'):
            content = content[7:]
        if content.endswith('```'):
            content = content[:-3]
        content = content.strip()

        if not content.startswith('['):
            return []

        result = json.loads(content)
        if not isinstance(result, list):
            return []

        validated_kpis = []
        for item in result:
            if (isinstance(item, dict) and
                item.get('kpi_text', '').strip() and
                item.get('quantitative_value', '').strip()):

                kpi = {
                    'kpi_text': item.get('kpi_text', '').strip(),
                    'quantitative_value': str(item.get('quantitative_value', '')).strip(),
                    'unit': item.get('unit', '').strip(),
                    'kpi_theme': item.get('kpi_theme', 'Environmental').strip(),
                    'kpi_category': item.get('kpi_category', '').strip(),
                    'time_period': item.get('time_period', '').strip(),
                    'source_page': page_number,
                    'source_type': 'image'
                }
                validated_kpis.append(kpi)

        return validated_kpis

    except Exception as e:
        logging.warning(f"Optimized response parsing failed: {e}")
        return []

# 4. Parallel image processing
def process_images_in_parallel(image_data: List[Dict], max_workers: int = 3) -> List[Dict]:
    """Parallel image processing"""
    if not image_data:
        return []

    print(f"🔄 Processing {len(image_data)} images in parallel...")

    all_kpis = []
    completed_count = 0

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_info = {}
        for img_info in image_data:
            future = executor.submit(
                extract_kpi_optimized,
                img_info['image'],
                img_info['page_number']
            )
            future_to_info[future] = img_info

        for future in concurrent.futures.as_completed(future_to_info):
            img_info = future_to_info[future]
            try:
                kpis = future.result(timeout=90)
                all_kpis.extend(kpis)
                completed_count += 1

                if completed_count % 5 == 0:
                    progress = completed_count / len(image_data) * 100
                    print(f"   📈 Progress: {completed_count}/{len(image_data)} ({progress:.1f}%)")

            except Exception as e:
                logging.warning(f"Image processing failed for page {img_info['page_number']}: {e}")
                continue

    print(f"📊 Parallel processing completed: {len(all_kpis)} KPIs extracted")
    print(f"📋 {fast_cache.get_stats()}")

    return all_kpis

In [17]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [18]:
# ============ Main processing function ============
def process_sustainability_report_with_enhanced_images(pdf_path: str) -> pd.DataFrame:
    """Main processing function with image analysis"""
    logging.info("Starting enhanced PDF processing with image analysis...")

    # Step 1: Text and table extraction
    logging.info("Step 1/5: Reading PDF text and tables...")
    full_text = pdf_to_text_and_tables(pdf_path)

    camelot_tables = camelot_extra_tables_enhanced(pdf_path)
    if camelot_tables:
        full_text += "\n\n" + "\n\n".join(camelot_tables)

    logging.info("Step 2/5: Chunking text...")
    chunks = split_into_chunks(full_text, MAX_TOKENS_CHUNK)

    logging.info("Step 3/5: Extracting KPIs from text...")
    text_kpis = []
    for idx, chunk in enumerate(chunks, 1):
        logging.info(f"Processing text chunk {idx}/{len(chunks)}")
        if chunk.strip():
            chunk_kpis = extract_kpi_from_chunk_universal(chunk)
            text_kpis.extend(chunk_kpis)
            if idx < len(chunks):
                time.sleep(SLEEP_SEC)

    # Step 4: Image KPI extraction
    logging.info("Step 4/5: Extracting KPIs from images...")
    image_kpis = process_pdf_images_for_kpis_fixed(pdf_path)

    # Step 5: Combine and process
    logging.info("Step 5/5: Combining and processing all KPIs...")

    for kpi in text_kpis:
        if 'source_type' not in kpi:
            kpi['source_type'] = 'text'

    all_kpis = text_kpis + image_kpis
    all_kpis = post_process_kpis_universal(all_kpis)

    df_auto = pd.DataFrame(all_kpis)

    if not df_auto.empty:
        if 'source_type' not in df_auto.columns:
            df_auto['source_type'] = 'text'

        initial_count = len(df_auto)
        df_auto = df_auto.drop_duplicates(subset=['kpi_text'], keep='first')
        final_count = len(df_auto)

        logging.info(f"Removed {initial_count - final_count} duplicate KPIs")

        try:
            df_auto = df_auto.sort_values(['source_type', 'kpi_theme', 'kpi_category'], na_position='last')
        except KeyError:
            pass

        text_kpi_count = len([kpi for kpi in all_kpis if kpi.get('source_type', 'text') != 'image'])
        image_kpi_count = len([kpi for kpi in all_kpis if kpi.get('source_type') == 'image'])

        logging.info(f"KPI Summary: {text_kpi_count} from text/tables, {image_kpi_count} from images")

    return df_auto


In [19]:
# ============ Optimize moderator processing function ============
def process_sustainability_report_OPTIMIZED(pdf_path: str) -> pd.DataFrame:
    """Optimize moderator processing functions - improve performance while ensuring integrity"""

    start_time = time.time()
    print("⚡ Starting OPTIMIZED processing with completeness guarantee...")

    try:
        # Parallel text and image preprocessing
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            print("🔄 Starting parallel text and image preprocessing...")

            # Text processing (using your existing logic)
            def extract_text_kpis():
                full_text = pdf_to_text_and_tables(pdf_path)
                camelot_tables = camelot_extra_tables_enhanced(pdf_path)
                if camelot_tables:
                    full_text += "\n\n" + "\n\n".join(camelot_tables)

                chunks = split_into_chunks(full_text, MAX_TOKENS_CHUNK)
                text_kpis = []
                for idx, chunk in enumerate(chunks, 1):
                    if chunk.strip():
                        chunk_kpis = extract_kpi_from_chunk_universal(chunk)
                        text_kpis.extend(chunk_kpis)
                        if idx < len(chunks):
                            time.sleep(SLEEP_SEC)
                return text_kpis

            #Image preprocessing (using optimized filtering)
            def extract_and_filter_images():
                all_images = extract_images_from_pdf_fixed(pdf_path)
                filtered_images = []

                for img_info in all_images:
                    should_process, reason = conservative_image_filter(img_info['image'])
                    if should_process:
                        filtered_images.append(img_info)

                print(f"📊 Conservative filtering: Kept {len(filtered_images)}/{len(all_images)} images")
                return filtered_images

            text_future = executor.submit(extract_text_kpis)
            image_future = executor.submit(extract_and_filter_images)

            text_kpis = text_future.result()
            image_data = image_future.result()

        preprocessing_time = time.time() - start_time
        print(f"⏱️  Preprocessing completed in {preprocessing_time:.1f}s")

        # Parallel Image KPI Extraction
        image_start = time.time()
        image_kpis = process_images_in_parallel(image_data, max_workers=3)
        image_time = time.time() - image_start
        print(f"⏱️  Image processing completed in {image_time:.1f}s")

        # Post-processing
        all_kpis = text_kpis + image_kpis
        all_kpis = post_process_kpis_universal(all_kpis)

        # Convert to DataFrame
        df_auto = pd.DataFrame(all_kpis)

        if not df_auto.empty:
            initial_count = len(df_auto)
            df_auto = df_auto.drop_duplicates(subset=['kpi_text'], keep='first')
            final_count = len(df_auto)

            if 'source_type' not in df_auto.columns:
                df_auto['source_type'] = 'text'

            print(f"🔄 Removed {initial_count - final_count} exact duplicates")

        # Performance Statistics
        total_time = time.time() - start_time
        text_count = len([k for k in all_kpis if k.get('source_type') != 'image'])
        image_count = len([k for k in all_kpis if k.get('source_type') == 'image'])

        print(f"\n⚡ OPTIMIZED processing completed!")
        print(f"⏱️  Total time: {total_time:.1f}s ({total_time/60:.1f}min)")
        print(f"📊 Results:")
        print(f"   - Text/Tables: {text_count} KPIs")
        print(f"   - Images/Charts: {image_count} KPIs")
        print(f"   - Total unique: {len(df_auto)} KPIs")
        print(f"⚡ Performance: {len(df_auto)/total_time:.1f} KPIs/second")

        return df_auto

    except Exception as e:
        total_time = time.time() - start_time
        print(f"❌ Optimized processing failed after {total_time:.1f}s: {e}")
        import traceback
        traceback.print_exc()
        return pd.DataFrame()

In [20]:
# ============ Result saving and comparison functions ============
def infer_stakeholder(row) -> str:
    """Infer affected stakeholders based on KPI theme and category"""
    theme = row.get('kpi_theme', '').lower()
    category = row.get('kpi_category', '').lower()
    kpi_text = row.get('kpi_text', '').lower()

    if theme == 'environmental':
        return "Environment, Community, Future Generations"
    elif theme == 'social':
        if 'employee' in category or 'workforce' in category or 'gender' in category:
            return "Employees"
        elif 'customer' in category or 'safety' in category:
            return "Customers, Community"
        elif 'community' in category:
            return "Local Communities"
        elif 'supply' in category or 'supplier' in kpi_text:
            return "Suppliers, Business Partners"
        else:
            return "Employees, Community"
    elif theme == 'governance':
        if 'board' in category:
            return "Shareholders, Investors"
        elif 'cyber' in category or 'data' in category:
            return "Customers, Employees, Business Partners"
        else:
            return "Shareholders, Investors, Stakeholders"
    else:
        return "All Stakeholders"

def save_results(df_auto: pd.DataFrame, output_path: str, pdf_path: str = "") -> None:
    """Save results to Excel file with proper formatting"""
    try:
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)

        if not df_auto.empty:
            # Add metadata columns
            pdf_filename = os.path.basename(pdf_path) if pdf_path else "Unknown"
            df_auto['PDF file name'] = pdf_filename
            df_auto['Title of the report'] = ""

            if 'source_page' in df_auto.columns:
                df_auto['Absolute Page Number'] = df_auto['source_page']
                df_auto = df_auto.drop('source_page', axis=1)
            else:
                df_auto['Absolute Page Number'] = "Unknown"

            df_auto['Impacted Stakeholder'] = df_auto.apply(infer_stakeholder, axis=1)

            # Reorder columns
            original_columns = [col for col in df_auto.columns if col not in
                              ['PDF file name', 'Title of the report', 'Absolute Page Number', 'Impacted Stakeholder']]
            new_column_order = ['PDF file name', 'Title of the report', 'Absolute Page Number', 'Impacted Stakeholder'] + original_columns
            df_auto = df_auto[new_column_order]

        with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
            df_auto.to_excel(writer, sheet_name='Auto_KPIs', index=False)

            if not df_auto.empty:
                # Theme summary
                theme_summary = df_auto.groupby('kpi_theme').size().reset_index(name='count')
                theme_summary.to_excel(writer, sheet_name='Theme_Summary', index=False)

                # Category summary
                category_summary = df_auto.groupby(['kpi_theme', 'kpi_category']).size().reset_index(name='count')
                category_summary.to_excel(writer, sheet_name='Category_Summary', index=False)

        logging.info(f"Results saved to {output_path}")

    except Exception as e:
        logging.error(f"Error saving results: {e}")

def compare_with_manual_kpis(df_auto: pd.DataFrame, manual_xlsx_path: str) -> None:
    """Compare automatically extracted KPIs with manually annotated ones"""
    if not os.path.exists(manual_xlsx_path):
        logging.info("Manual KPI file not found, skipping comparison.")
        return

    logging.info("Comparing with manual KPIs...")

    try:
        df_manual = pd.read_excel(manual_xlsx_path)

        if 'kpi_text' not in df_manual.columns:
            logging.warning("Manual KPI file missing 'kpi_text' column")
            return

        manual_kpis = set(df_manual['kpi_text'].astype(str).str.strip())
        auto_kpis = set(df_auto['kpi_text'].astype(str).str.strip())

        only_auto = auto_kpis - manual_kpis
        only_manual = manual_kpis - auto_kpis
        common = auto_kpis & manual_kpis

        print(f"\n=== KPI Comparison Results ===")
        print(f"Common KPIs: {len(common)}")
        print(f"Only in automatic extraction: {len(only_auto)}")
        print(f"Only in manual annotation: {len(only_manual)}")

        if only_auto:
            print(f"\nKPIs found by model but not in manual annotation ({len(only_auto)}):")
            for kpi in sorted(only_auto):
                if kpi.strip():
                    print(f"  - {kpi}")

        if only_manual:
            print(f"\nKPIs in manual annotation but missed by model ({len(only_manual)}):")
            for kpi in sorted(only_manual):
                if kpi.strip():
                    print(f"  - {kpi}")

        # Calculate metrics
        if len(auto_kpis) > 0 and len(manual_kpis) > 0:
            precision = len(common) / len(auto_kpis)
            recall = len(common) / len(manual_kpis)
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

            print(f"\n=== Performance Metrics ===")
            print(f"Precision: {precision:.3f}")
            print(f"Recall: {recall:.3f}")
            print(f"F1 Score: {f1_score:.3f}")

    except Exception as e:
        logging.error(f"Error comparing with manual KPIs: {e}")

In [21]:
# ============ Main execution function ============
def main():
    """Enhanced main execution function with validation"""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S"
    )

    try:
        if not os.path.exists(PDF_PATH):
            logging.error(f"PDF file not found: {PDF_PATH}")
            return

        # Process the PDF
        df_auto = process_sustainability_report_with_enhanced_images(PDF_PATH)

        # Save results
        save_results(df_auto, EXPORT_AUTO_XLSX, PDF_PATH)
        logging.info(f"KPI extraction completed: {len(df_auto)} KPIs extracted")

        # Enhanced validation with comprehensive analysis
        if MANUAL_XLSX and Path(MANUAL_XLSX).exists():
            print("\n🔍 Running comprehensive validation...")
            validation_results = enhanced_compare_with_manual_kpis(
                df_auto, MANUAL_XLSX, "comprehensive_validation"
            )

            if validation_results:
                print("✅ Validation completed with detailed analysis!")
                print(f"📁 Detailed results saved to: comprehensive_validation/")
            else:
                print("⚠️ Validation encountered issues")
        else:
            logging.info("Manual KPI file not found, skipping validation.")

        # Display summary
        if not df_auto.empty:
            print(f"\n=== Extraction Summary ===")
            print(f"Total KPIs extracted: {len(df_auto)}")

            # Source statistics
            if 'source_type' in df_auto.columns:
                source_counts = df_auto['source_type'].value_counts()
                print(f"From text/tables: {source_counts.get('text', 0)}")
                print(f"From images/charts: {source_counts.get('image', 0)}")

            # Theme statistics
            if 'kpi_theme' in df_auto.columns:
                theme_counts = df_auto['kpi_theme'].value_counts()
                print(f"\nKPI Distribution by Theme:")
                for theme, count in theme_counts.items():
                    print(f"  {theme}: {count}")
        else:
            print("\nNo KPIs were extracted from the document.")

    except Exception as e:
        logging.error(f"Error in main execution: {e}")
        import traceback
        traceback.print_exc()
    # logging.basicConfig(
    #     level=logging.INFO,
    #     format="%(asctime)s - %(levelname)s: %(message)s",
    #     datefmt="%Y-%m-%d %H:%M:%S"
    # )

    # try:
    #     if not os.path.exists(PDF_PATH):
    #         logging.error(f"PDF file not found: {PDF_PATH}")
    #         return

    #     # Process the PDF
    #     df_auto = process_sustainability_report_with_enhanced_images(PDF_PATH)

    #     # Save results
    #     save_results(df_auto, EXPORT_AUTO_XLSX, PDF_PATH)

    #     logging.info(f"KPI extraction completed: {len(df_auto)} KPIs extracted")

    #     # Compare with manual annotations if available
    #     if MANUAL_XLSX:
    #         compare_with_manual_kpis(df_auto, MANUAL_XLSX)

    #     # Display summary
    #     if not df_auto.empty:
    #         print(f"\n=== Extraction Summary ===")
    #         print(f"Total KPIs extracted: {len(df_auto)}")

    #         # Source statistics
    #         if 'source_type' in df_auto.columns:
    #             source_counts = df_auto['source_type'].value_counts()
    #             print(f"From text/tables: {source_counts.get('text', 0)}")
    #             print(f"From images/charts: {source_counts.get('image', 0)}")

    #         # Theme statistics
    #         if 'kpi_theme' in df_auto.columns:
    #             theme_counts = df_auto['kpi_theme'].value_counts()
    #             print(f"\nKPI Distribution by Theme:")
    #             for theme, count in theme_counts.items():
    #                 print(f"  {theme}: {count}")
    #     else:
    #         print("\nNo KPIs were extracted from the document.")

    # except Exception as e:
    #     logging.error(f"Error in main execution: {e}")
    #     import traceback
    #     traceback.print_exc()

In [22]:
# ============ Auxiliary functions ============
def install_dependencies():
    """Install required dependencies"""
    try:
        import subprocess
        import sys

        dependencies = [
            "openai",
            "python-dotenv",
            "pdfplumber",
            "tiktoken",
            "pandas",
            "PyMuPDF",
            "Pillow",
            "openpyxl"
        ]

        for dep in dependencies:
            try:
                __import__(dep.replace('-', '_'))
                print(f"✅ {dep} is already installed")
            except ImportError:
                print(f"Installing {dep}...")
                subprocess.check_call([sys.executable, "-m", "pip", "install", dep])
                print(f"✅ Installed {dep}")

        # Optional Camelot installation
        try:
            import camelot
            print("✅ Camelot is already installed")
        except ImportError:
            print("Installing Camelot (optional)...")
            try:
                subprocess.check_call([sys.executable, "-m", "pip", "install", "camelot-py[cv]"])
                print("✅ Installed Camelot")
            except:
                print("⚠️ Camelot installation failed (optional dependency)")

        print("🎉 All dependencies checked/installed successfully!")

    except Exception as e:
        print(f"❌ Error with dependencies: {e}")

def validate_environment():
    """Validate environment setup"""
    issues = []

    # Check API key
    if not os.getenv("OPENAI_API_KEY"):
        issues.append("OPENAI_API_KEY not found in environment variables")

    # Check PDF file
    if not os.path.exists(PDF_PATH):
        issues.append(f"PDF file not found: {PDF_PATH}")

    # Check required imports
    required_modules = ['openai', 'pdfplumber', 'pandas', 'tiktoken', 'PIL', 'fitz']
    for module in required_modules:
        try:
            __import__(module)
        except ImportError:
            issues.append(f"Required module '{module}' not installed")

    if issues:
        print("❌ Environment validation failed:")
        for issue in issues:
            print(f"  - {issue}")
        return False
    else:
        print("✅ Environment validation passed")
        return True


In [23]:
# ============ Simplified execution interface ============
def run_kpi_extraction():
    """Simplified interface to run KPI extraction"""
    print("🚀 Starting KPI extraction process...")

    # Validate environment
    if not validate_environment():
        print("Please fix the environment issues before running.")
        return

    # Run main function
    main()

In [24]:
# ============ Optimized execution interface ============
def run_optimized_kpi_extraction():
    """Run optimized KPI extraction"""
    print("⚡ Starting OPTIMIZED KPI extraction...")
    print("🎯 Goal: Extract ALL KPIs with 60-70% better performance")

    # Verify the environment
    if not validate_environment():
        print("Please fix the environment issues before running.")
        return None

    try:
        # Run optimization process
        df_results = process_sustainability_report_OPTIMIZED(PDF_PATH)

        # Save the results
        output_file = "OPTIMIZED_" + EXPORT_AUTO_XLSX
        save_results(df_results, output_file, PDF_PATH)
        print(f"💾 Results saved to: {output_file}")

        # Show Statistics
        if not df_results.empty and 'source_type' in df_results.columns:
            source_counts = df_results['source_type'].value_counts()
            print(f"\n📈 Final Statistics:")
            for source, count in source_counts.items():
                print(f"   - {source}: {count} KPIs")

        return df_results

    except Exception as e:
        print(f"❌ Optimized extraction failed: {e}")
        return None

def compare_original_vs_optimized():
    """Compare the performance of the original version and the optimized version"""
    print("🔬 Performance Comparison Test")
    print("=" * 50)

    # Test the original version
    print("\n📊 Testing Original Version...")
    original_start = time.time()
    try:
        original_df = process_sustainability_report_with_enhanced_images(PDF_PATH)
        original_time = time.time() - original_start
        print(f"⏱️  Original version: {original_time:.1f}s, {len(original_df)} KPIs")
    except Exception as e:
        print(f"❌ Original version failed: {e}")
        original_time = 999
        original_df = pd.DataFrame()

    # Test optimized version
    print("\n⚡ Testing Optimized Version...")
    optimized_start = time.time()
    try:
        optimized_df = process_sustainability_report_OPTIMIZED(PDF_PATH)
        optimized_time = time.time() - optimized_start
        print(f"⏱️  Optimized version: {optimized_time:.1f}s, {len(optimized_df)} KPIs")
    except Exception as e:
        print(f"❌ Optimized version failed: {e}")
        optimized_time = 999
        optimized_df = pd.DataFrame()

    # Performance comparison
    if original_time < 999 and optimized_time < 999:
        speedup = original_time / optimized_time
        time_saved = original_time - optimized_time
        kpi_diff = abs(len(optimized_df) - len(original_df))

        print(f"\n🚀 Performance Results:")
        print(f"   - Speed improvement: {speedup:.1f}x faster")
        print(f"   - Time saved: {time_saved:.1f}s ({time_saved/60:.1f}min)")
        print(f"   - KPI difference: {kpi_diff} KPIs")
        print(f"   - Completeness: {len(optimized_df)/len(original_df)*100:.1f}% of original" if len(original_df) > 0 else "")

        return {"original": original_df, "optimized": optimized_df, "speedup": speedup}

    return None

In [25]:
# ============ Debug and test functions ============
def test_text_extraction_only():
    """Test only text extraction without images"""
    logging.basicConfig(level=logging.INFO)

    try:
        # Extract text and tables
        full_text = pdf_to_text_and_tables(PDF_PATH)
        camelot_tables = camelot_extra_tables_enhanced(PDF_PATH)

        if camelot_tables:
            full_text += "\n\n" + "\n\n".join(camelot_tables)

        # Chunk text
        chunks = split_into_chunks(full_text, MAX_TOKENS_CHUNK)

        # Extract KPIs from first few chunks
        test_kpis = []
        for idx, chunk in enumerate(chunks[:3]):  # Test first 3 chunks
            chunk_kpis = extract_kpi_from_chunk_universal(chunk)
            test_kpis.extend(chunk_kpis)
            time.sleep(SLEEP_SEC)

        print(f"Test extraction completed: {len(test_kpis)} KPIs found in first 3 chunks")

        for i, kpi in enumerate(test_kpis[:5]):  # Show first 5
            print(f"{i+1}. {kpi.get('kpi_text', 'No text')}")

    except Exception as e:
        print(f"Test failed: {e}")

def debug_single_image_analysis(image_path: str):
    """Test single image analysis functionality"""
    try:
        from PIL import Image
        image = Image.open(image_path)

        print(f"Analyzing image: {image_path}")
        print(f"Image size: {image.width}x{image.height}")

        kpis = extract_kpi_from_image_fixed(image, 1)

        print(f"\n=== Analysis Results ===")
        print(f"Found {len(kpis)} KPIs:")

        for i, kpi in enumerate(kpis, 1):
            print(f"\n{i}. {kpi.get('kpi_text', 'No text')}")
            print(f"   Value: {kpi.get('quantitative_value', 'No value')}")
            print(f"   Confidence: {kpi.get('estimation_confidence', 'Not specified')}")

    except Exception as e:
        print(f"Error in debug analysis: {e}")

def process_text_only():
    """Process only text and tables, skip images"""
    logging.basicConfig(level=logging.INFO)

    try:
        logging.info("Starting text-only processing...")

        # Step 1: Text and table extraction
        full_text = pdf_to_text_and_tables(PDF_PATH)
        camelot_tables = camelot_extra_tables_enhanced(PDF_PATH)

        if camelot_tables:
            full_text += "\n\n" + "\n\n".join(camelot_tables)

        # Step 2: Chunking
        chunks = split_into_chunks(full_text, MAX_TOKENS_CHUNK)

        # Step 3: Extract KPIs
        all_kpis = []
        for idx, chunk in enumerate(chunks, 1):
            logging.info(f"Processing chunk {idx}/{len(chunks)}")
            if chunk.strip():
                chunk_kpis = extract_kpi_from_chunk_universal(chunk)
                all_kpis.extend(chunk_kpis)
                if idx < len(chunks):
                    time.sleep(SLEEP_SEC)

        # Post-processing
        all_kpis = post_process_kpis_universal(all_kpis)

        # Convert to DataFrame
        df_auto = pd.DataFrame(all_kpis)

        if not df_auto.empty:
            df_auto = df_auto.drop_duplicates(subset=['kpi_text'], keep='first')

        # Save results
        text_only_output = "text_only_" + EXPORT_AUTO_XLSX
        save_results(df_auto, text_only_output, PDF_PATH)

        print(f"Text-only processing completed: {len(df_auto)} KPIs extracted")

        return df_auto

    except Exception as e:
        logging.error(f"Text-only processing failed: {e}")
        return pd.DataFrame()

In [26]:
# ============ 兼容性函数 ============
def extract_kpi_from_chunk(chunk: str) -> List[Dict]:
    """Backward compatibility function"""
    return extract_kpi_from_chunk_universal(chunk)

def process_sustainability_report(pdf_path: str) -> pd.DataFrame:
    """Backward compatibility function for text-only processing"""
    return process_text_only()

def process_sustainability_report_with_images(pdf_path: str) -> pd.DataFrame:
    """Backward compatibility function for full processing"""
    return process_sustainability_report_with_enhanced_images(pdf_path)


In [27]:
# ============ 使用示例 ============
def example_usage():
    """Usage examples"""
    print("=== KPI Extraction Tool Usage Examples ===\n")

    print("1. Full extraction (text + images):")
    print("   df_results = process_sustainability_report_with_enhanced_images(PDF_PATH)")
    print("   save_results(df_results, EXPORT_AUTO_XLSX, PDF_PATH)\n")

    print("2. Text-only extraction:")
    print("   df_results = process_text_only()")
    print("   # Results automatically saved\n")

    print("3. Simple run:")
    print("   run_kpi_extraction()  # Complete pipeline with validation\n")

    print("4. Debug single component:")
    print("   test_text_extraction_only()  # Test first 3 chunks")
    print("   debug_single_image_analysis('path/to/image.jpg')\n")

    print("5. Install dependencies:")
    print("   install_dependencies()  # Install all required packages\n")


In [28]:
# ============================================================================
# Debug code - Copy and paste directly to the end of your code
# ============================================================================

# Method 1: Check all image extraction and recognition status
def debug_method_1_check_image_detection():
   """Check if all images in PDF are correctly extracted and if chart classifier is working properly"""
   print("=== Method 1: Check Image Extraction and Chart Recognition ===")

   # Create debug folder
   import os
   debug_folder = "debug_images_method1"
   os.makedirs(debug_folder, exist_ok=True)

   try:
       # Extract all images
       images = extract_images_from_pdf_fixed(PDF_PATH)
       print(f"Extracted {len(images)} images from PDF")

       chart_count = 0
       non_chart_count = 0

       for i, img_info in enumerate(images):
           page_num = img_info['page_number']
           img_type = img_info['type']
           image = img_info['image']

           # Check if recognized as chart
           is_chart = is_chart_image(image)

           # Save image with recognition result in filename
           chart_status = "CHART" if is_chart else "NOT_CHART"
           filename = f"{debug_folder}/page_{page_num}_{img_type}_{chart_status}_{i}.jpg"
           image.save(filename)

           print(f"Image {i+1}: Page {page_num}, Type {img_type}, Size {image.width}x{image.height}, Chart Recognition: {is_chart}")

           if is_chart:
               chart_count += 1
           else:
               non_chart_count += 1

       print(f"\nSummary:")
       print(f"- Images recognized as charts: {chart_count}")
       print(f"- Images not recognized as charts: {non_chart_count}")
       print(f"- All images saved to {debug_folder} folder")
       print(f"- Please manually check NOT_CHART images to see if they contain your missing pie charts")

       return images

   except Exception as e:
       print(f"Method 1 execution error: {e}")
       import traceback
       traceback.print_exc()
       return []

# Method 2: Temporarily disable chart classifier
def debug_method_2_bypass_chart_filter():
   """Completely disable chart classifier, force processing all images"""
   print("=== Method 2: Disable Chart Classifier ===")

   # Save original chart classifier function
   global is_chart_image
   original_chart_classifier = is_chart_image

   # Create new classifier (always returns True)
   def bypass_chart_classifier(image):
       print(f"  🔓 Force processing image (size: {image.width}x{image.height})")
       return True

   # Temporarily replace classifier
   is_chart_image = bypass_chart_classifier

   try:
       print("Starting to re-extract image KPIs (chart filtering disabled)...")

       # Re-run image processing
       image_kpis = process_pdf_images_for_kpis_fixed(PDF_PATH)

       print(f"Extracted {len(image_kpis)} image KPIs after disabling filter")

       # Display results
       for i, kpi in enumerate(image_kpis):
           print(f"{i+1}. Page {kpi.get('source_page', 'Unknown')}: {kpi.get('kpi_text', 'No text')[:100]}")

       return image_kpis

   except Exception as e:
       print(f"Method 2 execution error: {e}")
       import traceback
       traceback.print_exc()
       return []

   finally:
       # Restore original classifier
       is_chart_image = original_chart_classifier
       print("Restored original chart classifier")

# Method 3: Manual test specific images
def debug_method_3_manual_test():
   """Manually select images for testing"""
   print("=== Method 3: Manual Test Specific Images ===")

   try:
       images = extract_images_from_pdf_fixed(PDF_PATH)
       print(f"Found {len(images)} images")

       # Display all image information
       for i, img_info in enumerate(images):
           page_num = img_info['page_number']
           img_type = img_info['type']
           image = img_info['image']
           is_chart = is_chart_image(image)

           print(f"{i+1}. Page {page_num}, Type {img_type}, Size {image.width}x{image.height}, Chart: {is_chart}")

       # Let user select image to test
       while True:
           try:
               choice = input(f"\nPlease select image number to test (1-{len(images)}, enter 0 to exit): ")
               if choice == '0':
                   break

               img_index = int(choice) - 1
               if 0 <= img_index < len(images):
                   img_info = images[img_index]
                   page_num = img_info['page_number']
                   image = img_info['image']

                   print(f"\nTesting image {choice} (page {page_num})")

                   # Save this image for inspection
                   test_filename = f"test_image_{choice}_page_{page_num}.jpg"
                   image.save(test_filename)
                   print(f"Image saved as: {test_filename}")

                   # Test extraction
                   kpis = extract_kpi_from_image_fixed(image, page_num, "manual_test")

                   print(f"Extraction result: {len(kpis)} KPIs")
                   for j, kpi in enumerate(kpis):
                       print(f"  KPI {j+1}: {kpi.get('kpi_text', 'No text')}")

               else:
                   print("Invalid selection")

           except ValueError:
               print("Please enter a valid number")
           except KeyboardInterrupt:
               break
           except Exception as e:
               print(f"Test error: {e}")

   except Exception as e:
       print(f"Method 3 execution error: {e}")
       import traceback
       traceback.print_exc()

# Method 4: Simplified extraction test
def debug_method_4_simple_test():
   """Use simplified method to test image extraction"""
   print("=== Method 4: Simplified Extraction Test ===")

   simple_prompt = """请分析这个图表，提取所有的数字数据。

Return JSON format, each data point contains:
{
 "description": "Data description",
 "value": "Numerical value",
 "unit": "Unit"
}

If it's a pie chart, please extract the percentage of each slice.
If it's a bar chart, please extract the value of each bar.
If it's a table, please extract each number."""

   try:
       images = extract_images_from_pdf_fixed(PDF_PATH)

       for i, img_info in enumerate(images[:5]):  # Only test first 5 images
           page_num = img_info['page_number']
           image = img_info['image']

           print(f"\n🔍 Simplified test image {i+1} (page {page_num})")

           try:
               base64_image = image_to_base64_fixed(image)
               if not base64_image:
                   continue

               response = client.chat.completions.create(
                   model="gpt-4o",
                   messages=[{
                       "role": "user",
                       "content": [
                           {"type": "text", "text": simple_prompt},
                           {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
                       ]
                   }],
                   temperature=0.0,
                   max_tokens=2000,
                   timeout=30
               )

               content = response.choices[0].message.content.strip()
               print(f"API response: {content[:300]}...")

               # Check if contains your target data
               if "property type" in content.lower() or "service type" in content.lower():
                   print("🎉 Possibly found missing pie chart data!")
                   print(f"Complete response: {content}")

           except Exception as e:
               print(f"❌ Simplified test failed: {e}")

           time.sleep(1)

   except Exception as e:
       print(f"Method 4 execution error: {e}")
       import traceback
       traceback.print_exc()

# Debug main control function
def run_debugging_session():
   """Debug session main control function"""
   print("🔧 KPI Extraction Debug Session")
   print("=" * 50)

   while True:
       try:
           choice = input("""\nSelect debug method:
1 - Check all image extraction and recognition status
2 - Disable chart classifier, force processing all images
3 - Manually select images for testing
4 - Simplified API test (test first 5 images)
0 - Exit debug

Please enter selection (0-4): """)

           if choice == "1":
               debug_method_1_check_image_detection()
           elif choice == "2":
               debug_method_2_bypass_chart_filter()
           elif choice == "3":
               debug_method_3_manual_test()
           elif choice == "4":
               debug_method_4_simple_test()
           elif choice == "0":
               print("Exit debug session")
               break
           else:
               print("Invalid selection, please try again")

       except KeyboardInterrupt:
           print("\nUser interrupted, exit debug")
           break
       except Exception as e:
           print(f"Debug session error: {e}")
           import traceback
           traceback.print_exc()

# ============================================================================
# Standalone quick test function (if you don't want to use interactive interface)
# ============================================================================

def quick_debug():
   """Quick debug - directly run method 1"""
   print("🚀 Quick debug mode")
   debug_method_1_check_image_detection()

# ============================================================================
# Usage
# ============================================================================

# At the end of your code, you can now run any of the following:

# Option 1: Interactive debug (recommended)
# run_debugging_session()

# Option 2: Quick debug, directly check images
# quick_debug()

# Option 3: Directly run specific method
# debug_method_1_check_image_detection()

In [29]:
# Test function: Verify if the new prompt is effective
def test_improved_prompt():
   """Test if the improved prompt can correctly extract percentages"""
   print("=== Testing Improved Prompt ===")

   try:
       # Extract full page image from page 2
       images = extract_images_from_pdf_fixed(PDF_PATH)
       page2_image = None

       for img_info in images:
           if img_info['page_number'] == 2 and img_info['type'] == 'full_page':
               page2_image = img_info['image']
               break

       if page2_image is None:
           print("❌ Could not find page 2 image")
           return

       print(f"✅ Found page 2 image, size: {page2_image.width}x{page2_image.height}")

       # Test using improved prompt
       base64_image = image_to_base64_fixed(page2_image)

       print("🔄 Calling API with improved prompt...")
       response = client.chat.completions.create(
           model="gpt-4o",
           messages=[
               {"role": "system", "content": ENHANCED_IMAGE_KPI_SYSTEM_PROMPT},
               {"role": "user", "content": [
                   {"type": "text", "text": """分析这个页面，重点关注两个饼图：

1. 上方饼图："Energy Use by Property Type 2021"
2. 下方饼图："Energy Use by Service Type 2021"

请提取每个饼图中每个扇形的具体百分比数值。确保包含实际的数字，不只是描述。"""},
                   {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"}}
               ]}
           ],
           temperature=0.0,
           max_tokens=4000,
           timeout=90
       )

       content = response.choices[0].message.content.strip()
       print("\n📋 API Response:")
       print(content[:500] + "..." if len(content) > 500 else content)

       # Try to parse JSON
       try:
           if content.startswith('```json'):
               content = content[7:]
           if content.endswith('```'):
               content = content[:-3]
           content = content.strip()

           if content.startswith('['):
               result = json.loads(content)
               print(f"\n✅ Successfully parsed JSON, found {len(result)} KPIs")

               # Check if specific percentages were extracted
               found_percentages = []
               for kpi in result:
                   if isinstance(kpi, dict):
                       kpi_text = kpi.get('kpi_text', '')
                       quantitative_value = kpi.get('quantitative_value', '')

                       print(f"KPI: {kpi_text}")
                       print(f"  Value: {quantitative_value}")

                       # Check if contains expected percentages
                       if any(target in kpi_text.lower() for target in ['64%', '33%', '68%', '30%', 'healthcare center', 'medical office', 'electricity', 'fuel']):
                           found_percentages.append(kpi)
                           print(f"  🎯 Found target data!")
                       print()

               if found_percentages:
                   print(f"🎉 Successfully extracted {len(found_percentages)} KPIs with specific percentages!")
                   return True
               else:
                   print("❌ Still could not extract specific percentage values")
                   return False
           else:
               print("❌ API response is not in JSON format")
               return False

       except json.JSONDecodeError as e:
           print(f"❌ JSON parsing failed: {e}")
           return False

   except Exception as e:
       print(f"❌ Test failed: {e}")
       import traceback
       traceback.print_exc()
       return False

# Quick fix function: Replace prompt and re-run extraction
def quick_fix_and_rerun():
   """Apply fix and re-run complete extraction process"""
   print("🔧 Applying fix and re-running...")

   # First test new prompt
   if test_improved_prompt():
       print("\n✅ New prompt test successful!")

       # Re-run complete extraction process
       print("\n🔄 Re-running complete KPI extraction...")
       try:
           df_auto = process_sustainability_report_with_enhanced_images(PDF_PATH)

           # Save results
           save_results(df_auto, "fixed_" + EXPORT_AUTO_XLSX, PDF_PATH)

           print(f"\n🎉 Fix completed! Total extracted: {len(df_auto)} KPIs")
           print("Results saved to fixed_" + EXPORT_AUTO_XLSX)

           # Show pie chart related KPIs
           pie_chart_kpis = df_auto[df_auto['kpi_text'].str.contains('pie|Pie', case=False, na=False)]
           print(f"\n📊 Pie chart related KPIs ({len(pie_chart_kpis)}):")
           for idx, row in pie_chart_kpis.iterrows():
               print(f"- {row['kpi_text']}")

           return df_auto

       except Exception as e:
           print(f"❌ Re-run failed: {e}")
           return None
   else:
       print("\n❌ New prompt test failed, need further debugging")
       return None

In [30]:
# Universal testing function
def test_universal_prompt():
    """Test the effectiveness of universal prompt"""
    print("=== Testing Universal Image Analysis Prompt ===")

    try:
        # Extract full page image from page 2 for testing
        images = extract_images_from_pdf_fixed(PDF_PATH)
        page2_image = None

        for img_info in images:
            if img_info['page_number'] == 2 and img_info['type'] == 'full_page':
                page2_image = img_info['image']
                break

        if page2_image is None:
            print("❌ Could not find page 2 image")
            return False

        print(f"✅ Found page 2 image, size: {page2_image.width}x{page2_image.height}")

        # Test using universal prompt
        base64_image = image_to_base64_fixed(page2_image)

        print("🔄 Calling API with universal prompt...")
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": ENHANCED_IMAGE_KPI_SYSTEM_PROMPT},
                {"role": "user", "content": [
                    {"type": "text", "text": """Please analyze all charts and tables on this page, extracting all quantifiable data points.

Focus on:
- Specific percentages for each pie chart slice
- All numerical data from tables
- Ensure each extracted KPI contains specific numbers, not just descriptions

Please extract complete contextual information."""},
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}", "detail": "high"}}
                ]}
            ],
            temperature=0.0,
            max_tokens=4000,
            timeout=90
        )

        content = response.choices[0].message.content.strip()
        print(f"\n📋 API response length: {len(content)} characters")

        # Parse and validate results
        try:
            if content.startswith('```json'):
                content = content[7:]
            if content.endswith('```'):
                content = content[:-3]
            content = content.strip()

            if content.startswith('['):
                result = json.loads(content)
                print(f"✅ Successfully parsed JSON, found {len(result)} KPIs")

                # Analyze extraction quality
                complete_kpis = 0
                pie_chart_kpis = 0
                table_kpis = 0

                print("\n📊 Extracted KPI list:")
                for i, kpi in enumerate(result, 1):
                    if isinstance(kpi, dict):
                        kpi_text = kpi.get('kpi_text', '')
                        quantitative_value = kpi.get('quantitative_value', '')
                        chart_type = kpi.get('chart_type', '')

                        print(f"{i:2d}. {kpi_text}")
                        print(f"    Value: {quantitative_value} {kpi.get('unit', '')}")
                        print(f"    Type: {chart_type}")

                        # Statistical analysis
                        if quantitative_value and str(quantitative_value).strip():
                            complete_kpis += 1

                        if 'pie' in chart_type.lower():
                            pie_chart_kpis += 1
                        elif 'table' in chart_type.lower():
                            table_kpis += 1

                        print()

                print(f"📈 Quality analysis:")
                print(f"  - KPIs with values: {complete_kpis}/{len(result)} ({complete_kpis/len(result)*100:.1f}%)")
                print(f"  - Pie chart KPIs: {pie_chart_kpis}")
                print(f"  - Table KPIs: {table_kpis}")

                # Check if target data was extracted
                success_indicators = [
                    any('64' in str(kpi.get('quantitative_value', '')) for kpi in result),
                    any('33' in str(kpi.get('quantitative_value', '')) for kpi in result),
                    any('68' in str(kpi.get('quantitative_value', '')) for kpi in result),
                    any('30' in str(kpi.get('quantitative_value', '')) for kpi in result)
                ]

                if any(success_indicators):
                    print("🎉 Successfully extracted target pie chart data!")
                    return True
                else:
                    print("⚠️ May not have extracted expected pie chart percentages")
                    return False
            else:
                print("❌ API response is not in JSON format")
                print(f"Response content: {content[:300]}...")
                return False

        except json.JSONDecodeError as e:
            print(f"❌ JSON parsing failed: {e}")
            print(f"Response content: {content[:300]}...")
            return False

    except Exception as e:
        print(f"❌ Test failed: {e}")
        import traceback
        traceback.print_exc()
        return False

# Apply universal fix
def apply_universal_fix():
    """Apply universal prompt fix and re-run"""
    print("🔧 Applying universal prompt fix...")

    # First test the new prompt
    print("Step 1: Testing new universal prompt...")
    if test_universal_prompt():
        print("\n✅ Universal prompt test successful!")

        # Ask whether to continue with full extraction
        try:
            proceed = input("\nContinue with full KPI extraction? (y/n): ").lower()
            if proceed == 'y':
                print("\n🔄 Re-running full KPI extraction...")
                df_auto = process_sustainability_report_with_enhanced_images(PDF_PATH)

                # Save results
                output_file = "universal_fixed_" + EXPORT_AUTO_XLSX
                save_results(df_auto, output_file, PDF_PATH)

                print(f"\n🎉 Fix completed! Total extracted: {len(df_auto)} KPIs")
                print(f"Results saved to {output_file}")

                # Show image-sourced KPI statistics
                if 'source_type' in df_auto.columns:
                    image_kpis = df_auto[df_auto['source_type'] == 'image']
                    print(f"\n📊 KPIs extracted from images: {len(image_kpis)}")

                return df_auto
            else:
                print("Cancelled full extraction")
                return None

        except KeyboardInterrupt:
            print("\nUser cancelled operation")
            return None

    else:
        print("\n❌ Universal prompt test failed")
        print("Recommend checking API response or further adjusting prompt")
        return None

In [31]:
class KPIValidationPipeline:
    """Comprehensive KPI validation and evaluation system"""

    def __init__(self, manual_excel_path: str, auto_excel_path: str,
                 output_dir: str = "validation_results"):
        """
        Initialize validation pipeline

        Args:
            manual_excel_path: Path to manual KPI annotations
            auto_excel_path: Path to automatically extracted KPIs
            output_dir: Directory to save validation results
        """
        self.manual_path = manual_excel_path
        self.auto_path = auto_excel_path
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)

        # Load data
        self.manual_df = self._load_excel_safe(manual_excel_path, "manual")
        self.auto_df = self._load_excel_safe(auto_excel_path, "auto")

        # Validation results
        self.validation_results = {}
        self.detailed_analysis = {}

        # Similarity thresholds
        self.similarity_thresholds = {
            'exact': 1.0,
            'high': 0.9,
            'medium': 0.7,
            'low': 0.5
        }

        logging.info(f"Validation pipeline initialized:")
        logging.info(f"  Manual KPIs: {len(self.manual_df)}")
        logging.info(f"  Auto KPIs: {len(self.auto_df)}")

    def _load_excel_safe(self, filepath: str, source_type: str) -> pd.DataFrame:
        """Safely load Excel file with error handling"""
        try:
            if not Path(filepath).exists():
                logging.warning(f"{source_type.title()} file not found: {filepath}")
                return pd.DataFrame()

            df = pd.read_excel(filepath)
            logging.info(f"Loaded {source_type} file: {len(df)} rows")

            # Standardize column names
            df.columns = df.columns.str.strip().str.lower()

            # Ensure required columns exist
            required_cols = ['kpi_text']
            for col in required_cols:
                if col not in df.columns:
                    # Try to find similar column names
                    similar_cols = [c for c in df.columns if 'kpi' in c.lower() or 'text' in c.lower()]
                    if similar_cols:
                        df['kpi_text'] = df[similar_cols[0]]
                        logging.info(f"Using column '{similar_cols[0]}' as kpi_text")
                    else:
                        logging.warning(f"Required column '{col}' not found in {source_type} file")
                        df['kpi_text'] = ""

            # Clean text data
            df['kpi_text'] = df['kpi_text'].astype(str).str.strip()
            df = df[df['kpi_text'] != ''].reset_index(drop=True)

            return df

        except Exception as e:
            logging.error(f"Error loading {source_type} file: {e}")
            return pd.DataFrame()

    def normalize_text(self, text: str) -> str:
        """Normalize text for comparison"""
        if pd.isna(text) or text == '':
            return ""

        # Convert to string and lowercase
        text = str(text).lower().strip()

        # Remove extra whitespace
        text = ' '.join(text.split())

        # Remove common punctuation but keep percentages and numbers
        text = re.sub(r'[^\w\s\%\.\,\-]', ' ', text)

        # Normalize number formats
        text = re.sub(r'\b(\d+),(\d+)\b', r'\1\2', text)  # Remove commas in numbers
        text = re.sub(r'\s+', ' ', text)  # Normalize spaces

        return text.strip()

    def calculate_text_similarity(self, text1: str, text2: str) -> Dict[str, float]:
        """Calculate multiple similarity metrics between two texts"""
        norm1 = self.normalize_text(text1)
        norm2 = self.normalize_text(text2)

        if not norm1 or not norm2:
            return {'sequence': 0.0, 'cosine': 0.0, 'jaccard': 0.0, 'combined': 0.0}

        # 1. Sequence similarity (exact match)
        sequence_sim = SequenceMatcher(None, norm1, norm2).ratio()

        # 2. Cosine similarity (semantic)
        try:
            vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
            tfidf_matrix = vectorizer.fit_transform([norm1, norm2])
            cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        except:
            cosine_sim = 0.0

        # 3. Jaccard similarity (token overlap)
        tokens1 = set(norm1.split())
        tokens2 = set(norm2.split())
        if tokens1 or tokens2:
            jaccard_sim = len(tokens1.intersection(tokens2)) / len(tokens1.union(tokens2))
        else:
            jaccard_sim = 0.0

        # 4. Combined similarity
        combined_sim = (sequence_sim * 0.4 + cosine_sim * 0.4 + jaccard_sim * 0.2)

        return {
            'sequence': sequence_sim,
            'cosine': cosine_sim,
            'jaccard': jaccard_sim,
            'combined': combined_sim
        }

    def find_matches(self, threshold: float = 0.7, similarity_type: str = 'combined') -> pd.DataFrame:
        """Find matches between manual and auto KPIs"""
        matches = []
        auto_matched = set()

        for manual_idx, manual_row in self.manual_df.iterrows():
            manual_text = manual_row['kpi_text']
            best_match = None
            best_similarity = 0.0

            for auto_idx, auto_row in self.auto_df.iterrows():
                if auto_idx in auto_matched:
                    continue

                auto_text = auto_row['kpi_text']
                similarities = self.calculate_text_similarity(manual_text, auto_text)
                similarity = similarities[similarity_type]

                if similarity > best_similarity and similarity >= threshold:
                    best_similarity = similarity
                    best_match = {
                        'manual_idx': manual_idx,
                        'auto_idx': auto_idx,
                        'manual_text': manual_text,
                        'auto_text': auto_text,
                        'similarity': similarity,
                        'all_similarities': similarities
                    }

            if best_match:
                matches.append(best_match)
                auto_matched.add(best_match['auto_idx'])

        return pd.DataFrame(matches)

    def calculate_metrics_at_threshold(self, threshold: float = 0.7,
                                     similarity_type: str = 'combined') -> Dict[str, float]:
        """Calculate precision, recall, F1 at specific threshold"""
        matches_df = self.find_matches(threshold, similarity_type)

        true_positives = len(matches_df)
        false_positives = len(self.auto_df) - true_positives
        false_negatives = len(self.manual_df) - true_positives

        # Calculate metrics
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0.0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0.0
        f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

        return {
            'threshold': threshold,
            'similarity_type': similarity_type,
            'true_positives': true_positives,
            'false_positives': false_positives,
            'false_negatives': false_negatives,
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'total_manual': len(self.manual_df),
            'total_auto': len(self.auto_df),
            'match_rate': true_positives / len(self.manual_df) if len(self.manual_df) > 0 else 0.0
        }

    def run_comprehensive_evaluation(self) -> Dict[str, any]:
        """Run comprehensive evaluation across multiple thresholds and similarity types"""
        logging.info("Running comprehensive evaluation...")

        results = {
            'threshold_analysis': [],
            'similarity_type_analysis': [],
            'category_analysis': {},
            'detailed_matches': {},
            'false_positives': [],
            'false_negatives': []
        }

        # 1. Threshold analysis
        thresholds = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        similarity_types = ['combined', 'sequence', 'cosine', 'jaccard']

        for threshold in thresholds:
            for sim_type in similarity_types:
                metrics = self.calculate_metrics_at_threshold(threshold, sim_type)
                results['threshold_analysis'].append(metrics)

        # 2. Find optimal threshold
        best_f1 = 0.0
        best_config = None
        for metrics in results['threshold_analysis']:
            if metrics['f1_score'] > best_f1:
                best_f1 = metrics['f1_score']
                best_config = (metrics['threshold'], metrics['similarity_type'])

        # 3. Detailed analysis at optimal threshold
        if best_config:
            optimal_threshold, optimal_sim_type = best_config
            logging.info(f"Optimal configuration: threshold={optimal_threshold}, similarity={optimal_sim_type}")

            matches_df = self.find_matches(optimal_threshold, optimal_sim_type)
            results['detailed_matches'] = matches_df.to_dict('records')

            # Find false positives and false negatives
            matched_auto_indices = set(matches_df['auto_idx'].tolist()) if not matches_df.empty else set()
            matched_manual_indices = set(matches_df['manual_idx'].tolist()) if not matches_df.empty else set()

            # False positives (auto KPIs not matched to manual)
            fp_indices = set(range(len(self.auto_df))) - matched_auto_indices
            results['false_positives'] = [
                {
                    'index': idx,
                    'kpi_text': self.auto_df.iloc[idx]['kpi_text'],
                    'category': self.auto_df.iloc[idx].get('kpi_category', 'Unknown'),
                    'theme': self.auto_df.iloc[idx].get('kpi_theme', 'Unknown'),
                    'source': self.auto_df.iloc[idx].get('source_type', 'Unknown')
                }
                for idx in fp_indices
            ]

            # False negatives (manual KPIs not matched by auto)
            fn_indices = set(range(len(self.manual_df))) - matched_manual_indices
            results['false_negatives'] = [
                {
                    'index': idx,
                    'kpi_text': self.manual_df.iloc[idx]['kpi_text'],
                    'category': self.manual_df.iloc[idx].get('kpi_category', 'Unknown'),
                    'theme': self.manual_df.iloc[idx].get('kpi_theme', 'Unknown')
                }
                for idx in fn_indices
            ]

        # 4. Category-level analysis
        if 'kpi_category' in self.manual_df.columns and 'kpi_category' in self.auto_df.columns:
            results['category_analysis'] = self._analyze_by_category()

        # 5. Theme-level analysis
        if 'kpi_theme' in self.manual_df.columns and 'kpi_theme' in self.auto_df.columns:
            results['theme_analysis'] = self._analyze_by_theme()

        self.validation_results = results
        return results

    def _analyze_by_category(self) -> Dict[str, Dict]:
        """Analyze performance by KPI category"""
        category_results = {}

        manual_categories = self.manual_df['kpi_category'].value_counts()
        auto_categories = self.auto_df['kpi_category'].value_counts()

        all_categories = set(manual_categories.index) | set(auto_categories.index)

        for category in all_categories:
            manual_count = manual_categories.get(category, 0)
            auto_count = auto_categories.get(category, 0)

            # Find matches within this category
            manual_cat_df = self.manual_df[self.manual_df['kpi_category'] == category]
            auto_cat_df = self.auto_df[self.auto_df['kpi_category'] == category]

            category_matches = 0
            if not manual_cat_df.empty and not auto_cat_df.empty:
                for _, manual_row in manual_cat_df.iterrows():
                    best_sim = 0.0
                    for _, auto_row in auto_cat_df.iterrows():
                        sim = self.calculate_text_similarity(
                            manual_row['kpi_text'],
                            auto_row['kpi_text']
                        )['combined']
                        best_sim = max(best_sim, sim)
                    if best_sim >= 0.7:
                        category_matches += 1

            category_precision = category_matches / auto_count if auto_count > 0 else 0.0
            category_recall = category_matches / manual_count if manual_count > 0 else 0.0
            category_f1 = 2 * (category_precision * category_recall) / (category_precision + category_recall) if (category_precision + category_recall) > 0 else 0.0

            category_results[category] = {
                'manual_count': manual_count,
                'auto_count': auto_count,
                'matches': category_matches,
                'precision': category_precision,
                'recall': category_recall,
                'f1_score': category_f1
            }

        return category_results

    def _analyze_by_theme(self) -> Dict[str, Dict]:
        """Analyze performance by KPI theme"""
        theme_results = {}

        manual_themes = self.manual_df['kpi_theme'].value_counts()
        auto_themes = self.auto_df['kpi_theme'].value_counts()

        all_themes = set(manual_themes.index) | set(auto_themes.index)

        for theme in all_themes:
            manual_count = manual_themes.get(theme, 0)
            auto_count = auto_themes.get(theme, 0)

            theme_results[theme] = {
                'manual_count': manual_count,
                'auto_count': auto_count,
                'coverage': auto_count / manual_count if manual_count > 0 else 0.0
            }

        return theme_results

    def generate_visualizations(self):
        """Generate comprehensive visualizations"""
        if not self.validation_results:
            logging.warning("No validation results found. Run evaluation first.")
            return

        # Set style
        try:
            plt.style.use('seaborn-v0_8')
        except:
            plt.style.use('seaborn')  # 备用样式
        fig = plt.figure(figsize=(20, 16))

        # 1. Threshold analysis
        threshold_df = pd.DataFrame(self.validation_results['threshold_analysis'])

        plt.subplot(3, 3, 1)
        for sim_type in threshold_df['similarity_type'].unique():
            data = threshold_df[threshold_df['similarity_type'] == sim_type]
            plt.plot(data['threshold'], data['f1_score'], marker='o', label=sim_type)
        plt.xlabel('Similarity Threshold')
        plt.ylabel('F1 Score')
        plt.title('F1 Score vs Threshold by Similarity Type')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # 2. Precision-Recall curve
        plt.subplot(3, 3, 2)
        for sim_type in threshold_df['similarity_type'].unique():
            data = threshold_df[threshold_df['similarity_type'] == sim_type]
            plt.plot(data['recall'], data['precision'], marker='o', label=sim_type)
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curves')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # 3. Category analysis
        if 'category_analysis' in self.validation_results:
            plt.subplot(3, 3, 3)
            cat_analysis = self.validation_results['category_analysis']
            categories = list(cat_analysis.keys())[:10]  # Top 10 categories
            f1_scores = [cat_analysis[cat]['f1_score'] for cat in categories]

            plt.barh(categories, f1_scores)
            plt.xlabel('F1 Score')
            plt.title('F1 Score by Category (Top 10)')
            plt.tight_layout()

        # 4. Theme distribution comparison
        plt.subplot(3, 3, 4)
        if 'kpi_theme' in self.manual_df.columns:
            manual_themes = self.manual_df['kpi_theme'].value_counts()
            auto_themes = self.auto_df['kpi_theme'].value_counts()

            x = np.arange(len(manual_themes))
            width = 0.35

            plt.bar(x - width/2, manual_themes.values, width, label='Manual', alpha=0.8)
            plt.bar(x + width/2, auto_themes.reindex(manual_themes.index, fill_value=0).values,
                   width, label='Auto', alpha=0.8)

            plt.xlabel('Theme')
            plt.ylabel('Count')
            plt.title('KPI Count by Theme')
            plt.xticks(x, manual_themes.index, rotation=45)
            plt.legend()

        # 5. Similarity distribution
        plt.subplot(3, 3, 5)
        if self.validation_results['detailed_matches']:
            similarities = [match['similarity'] for match in self.validation_results['detailed_matches']]
            plt.hist(similarities, bins=20, edgecolor='black', alpha=0.7)
            plt.xlabel('Similarity Score')
            plt.ylabel('Frequency')
            plt.title('Distribution of Similarity Scores (Matches)')
            plt.grid(True, alpha=0.3)

        # 6. Error analysis
        plt.subplot(3, 3, 6)
        fp_count = len(self.validation_results['false_positives'])
        fn_count = len(self.validation_results['false_negatives'])
        tp_count = len(self.validation_results['detailed_matches'])

        labels = ['True Positives', 'False Positives', 'False Negatives']
        counts = [tp_count, fp_count, fn_count]
        colors = ['green', 'red', 'orange']

        plt.pie(counts, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
        plt.title('Classification Results')

        # 7. Coverage by source type
        plt.subplot(3, 3, 7)
        if 'source_type' in self.auto_df.columns:
            source_counts = self.auto_df['source_type'].value_counts()
            plt.pie(source_counts.values, labels=source_counts.index, autopct='%1.1f%%')
            plt.title('Auto KPIs by Source Type')

        # 8. Performance metrics summary
        plt.subplot(3, 3, 8)
        best_metrics = max(self.validation_results['threshold_analysis'],
                          key=lambda x: x['f1_score'])

        metrics = ['Precision', 'Recall', 'F1 Score']
        values = [best_metrics['precision'], best_metrics['recall'], best_metrics['f1_score']]

        bars = plt.bar(metrics, values, color=['skyblue', 'lightcoral', 'lightgreen'])
        plt.ylabel('Score')
        plt.title(f'Best Performance Metrics\n(Threshold: {best_metrics["threshold"]})')
        plt.ylim(0, 1)

        # Add value labels on bars
        for bar, value in zip(bars, values):
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                    f'{value:.3f}', ha='center', va='bottom')

        # 9. Match quality distribution
        plt.subplot(3, 3, 9)
        if self.validation_results['detailed_matches']:
            match_similarities = [match['similarity'] for match in self.validation_results['detailed_matches']]
            quality_bins = [0.5, 0.7, 0.8, 0.9, 1.0]
            quality_labels = ['Medium', 'Good', 'Very Good', 'Excellent']

            quality_counts = []
            for i in range(len(quality_bins)-1):
                count = sum(1 for sim in match_similarities
                          if quality_bins[i] <= sim < quality_bins[i+1])
                quality_counts.append(count)

            plt.bar(quality_labels, quality_counts, color='lightblue', edgecolor='black')
            plt.ylabel('Number of Matches')
            plt.title('Match Quality Distribution')
            plt.xticks(rotation=45)

        plt.tight_layout()

        # Save visualization
        viz_path = self.output_dir / "validation_visualizations.png"
        plt.savefig(viz_path, dpi=300, bbox_inches='tight')
        plt.close()

        logging.info(f"Visualizations saved to {viz_path}")

    def generate_detailed_report(self) -> str:
        """Generate comprehensive validation report"""
        if not self.validation_results:
            logging.warning("No validation results found. Run evaluation first.")
            return ""

        # Find best configuration
        best_metrics = max(self.validation_results['threshold_analysis'],
                          key=lambda x: x['f1_score'])

        report = f"""
# KPI Extraction Validation Report
Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

## Dataset Overview
- **Manual KPIs**: {len(self.manual_df)} annotations
- **Auto KPIs**: {len(self.auto_df)} extractions
- **Manual file**: {self.manual_path}
- **Auto file**: {self.auto_path}

## Best Performance Configuration
- **Similarity Type**: {best_metrics['similarity_type']}
- **Threshold**: {best_metrics['threshold']}
- **Precision**: {best_metrics['precision']:.3f}
- **Recall**: {best_metrics['recall']:.3f}
- **F1 Score**: {best_metrics['f1_score']:.3f}

## Detailed Metrics
- **True Positives**: {best_metrics['true_positives']}
- **False Positives**: {best_metrics['false_positives']}
- **False Negatives**: {best_metrics['false_negatives']}
- **Match Rate**: {best_metrics['match_rate']:.3f}

## Error Analysis

### False Positives ({len(self.validation_results['false_positives'])})
KPIs extracted automatically but not in manual annotations:
"""

        # Add false positives
        for i, fp in enumerate(self.validation_results['false_positives'][:10], 1):
            report += f"\n{i}. **{fp['category']}** | {fp['kpi_text']}\n"

        if len(self.validation_results['false_positives']) > 10:
            report += f"\n... and {len(self.validation_results['false_positives']) - 10} more\n"

        report += f"""
### False Negatives ({len(self.validation_results['false_negatives'])})
KPIs in manual annotations but missed by extraction:
"""

        # Add false negatives
        for i, fn in enumerate(self.validation_results['false_negatives'][:10], 1):
            report += f"\n{i}. **{fn['category']}** | {fn['kpi_text']}\n"

        if len(self.validation_results['false_negatives']) > 10:
            report += f"\n... and {len(self.validation_results['false_negatives']) - 10} more\n"

        # Category analysis
        if 'category_analysis' in self.validation_results:
            report += "\n## Category-wise Performance\n\n"
            report += "| Category | Manual | Auto | Matches | Precision | Recall | F1 |\n"
            report += "|----------|--------|------|---------|-----------|--------|----|\\n"

            for category, metrics in self.validation_results['category_analysis'].items():
                report += f"| {category[:20]} | {metrics['manual_count']} | {metrics['auto_count']} | {metrics['matches']} | {metrics['precision']:.3f} | {metrics['recall']:.3f} | {metrics['f1_score']:.3f} |\n"

        # Theme analysis
        if 'theme_analysis' in self.validation_results:
            report += "\n## Theme-wise Coverage\n\n"
            report += "| Theme | Manual Count | Auto Count | Coverage |\n"
            report += "|-------|--------------|------------|----------|\n"

            for theme, metrics in self.validation_results['theme_analysis'].items():
                report += f"| {theme} | {metrics['manual_count']} | {metrics['auto_count']} | {metrics['coverage']:.3f} |\n"

        # Recommendations
        report += f"""
## Recommendations

### Strengths
- Overall F1 Score: {best_metrics['f1_score']:.3f}
- Precision: {best_metrics['precision']:.3f} (low false positive rate)
- Recall: {best_metrics['recall']:.3f} (good coverage)

### Areas for Improvement
"""

        if best_metrics['precision'] < 0.8:
            report += "- **Precision**: Consider stricter filtering to reduce false positives\n"

        if best_metrics['recall'] < 0.8:
            report += "- **Recall**: Improve extraction to catch more manual KPIs\n"

        if best_metrics['f1_score'] < 0.7:
            report += "- **Overall Performance**: Significant room for improvement in both precision and recall\n"

        # Source-specific recommendations
        if 'source_type' in self.auto_df.columns:
            text_kpis = len(self.auto_df[self.auto_df['source_type'] == 'text'])
            image_kpis = len(self.auto_df[self.auto_df['source_type'] == 'image'])

            report += f"""
### Source Type Analysis
- **Text/Table KPIs**: {text_kpis}
- **Image/Chart KPIs**: {image_kpis}
- **Image Coverage**: {image_kpis / (text_kpis + image_kpis) * 100:.1f}%
"""

        return report

    def save_results(self):
        """Save all validation results to files"""
        # Save detailed results as JSON
        results_path = self.output_dir / "validation_results.json"
        with open(results_path, 'w', encoding='utf-8') as f:
            json.dump(self.validation_results, f, indent=2, ensure_ascii=False, default=str)

        # Save matches as Excel
        if self.validation_results['detailed_matches']:
            matches_df = pd.DataFrame(self.validation_results['detailed_matches'])
            matches_path = self.output_dir / "detailed_matches.xlsx"
            matches_df.to_excel(matches_path, index=False)

        # Save false positives and negatives
        fp_df = pd.DataFrame(self.validation_results['false_positives'])
        fn_df = pd.DataFrame(self.validation_results['false_negatives'])

        with pd.ExcelWriter(self.output_dir / "error_analysis.xlsx") as writer:
            fp_df.to_excel(writer, sheet_name='False_Positives', index=False)
            fn_df.to_excel(writer, sheet_name='False_Negatives', index=False)

        # Save report
        report = self.generate_detailed_report()
        report_path = self.output_dir / "validation_report.md"
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)

        # Save metrics summary
        threshold_df = pd.DataFrame(self.validation_results['threshold_analysis'])
        threshold_df.to_excel(self.output_dir / "threshold_analysis.xlsx", index=False)

        logging.info(f"All validation results saved to {self.output_dir}")

        return {
            'results_json': results_path,
            'matches_excel': self.output_dir / "detailed_matches.xlsx",
            'error_analysis': self.output_dir / "error_analysis.xlsx",
            'report_markdown': report_path,
            'threshold_analysis': self.output_dir / "threshold_analysis.xlsx",
            'visualizations': self.output_dir / "validation_visualizations.png"
        }

    def run_full_validation(self) -> Dict[str, any]:
        """Run complete validation pipeline"""
        logging.info("Starting full validation pipeline...")

        # Step 1: Run comprehensive evaluation
        self.run_comprehensive_evaluation()

        # Step 2: Generate visualizations
        self.generate_visualizations()

        # Step 3: Save all results
        saved_files = self.save_results()

        # Step 4: Print summary
        best_metrics = max(self.validation_results['threshold_analysis'],
                          key=lambda x: x['f1_score'])

        print("\n" + "="*60)
        print("KPI EXTRACTION VALIDATION SUMMARY")
        print("="*60)
        print(f"📊 Dataset: {len(self.manual_df)} manual vs {len(self.auto_df)} auto KPIs")
        print(f"🎯 Best F1 Score: {best_metrics['f1_score']:.3f}")
        print(f"📈 Precision: {best_metrics['precision']:.3f}")
        print(f"📉 Recall: {best_metrics['recall']:.3f}")
        print(f"✅ True Positives: {best_metrics['true_positives']}")
        print(f"❌ False Positives: {best_metrics['false_positives']}")
        print(f"⚠️  False Negatives: {best_metrics['false_negatives']}")
        print("="*60)
        print(f"📁 Results saved to: {self.output_dir}")
        print("="*60)

        return {
            'validation_results': self.validation_results,
            'saved_files': saved_files,
            'best_metrics': best_metrics
        }


In [32]:
class BatchKPIProcessor:
    """批量KPI处理器 - 支持多个PDF和Manual文件"""

    def __init__(self, base_output_dir: str = "batch_kpi_results"):
        """
        初始化批量处理器

        Args:
            base_output_dir: 批量处理结果的基础目录
        """
        self.base_output_dir = Path(base_output_dir)
        self.base_output_dir.mkdir(exist_ok=True)

        # 存储所有文件配对
        self.file_pairs = []
        self.batch_results = []

        # 创建时间戳用于本次批量处理
        self.batch_timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        self.current_batch_dir = self.base_output_dir / f"batch_{self.batch_timestamp}"
        self.current_batch_dir.mkdir(exist_ok=True)

        logging.info(f"批量处理器初始化完成，结果保存到: {self.current_batch_dir}")

    def add_file_pair(self, pdf_path: str, manual_path: str, document_name: str = None):
        """
        添加一对PDF和Manual文件

        Args:
            pdf_path: PDF文件路径
            manual_path: Manual标注文件路径
            document_name: 文档名称（可选，默认使用PDF文件名）
        """
        pdf_path = Path(pdf_path)
        manual_path = Path(manual_path)

        # 验证文件存在
        if not pdf_path.exists():
            logging.error(f"PDF文件不存在: {pdf_path}")
            return False

        if not manual_path.exists():
            logging.error(f"Manual文件不存在: {manual_path}")
            return False

        # 自动生成文档名称
        if document_name is None:
            document_name = pdf_path.stem

        file_pair = {
            'pdf_path': str(pdf_path),
            'manual_path': str(manual_path),
            'document_name': document_name,
            'doc_id': len(self.file_pairs) + 1
        }

        self.file_pairs.append(file_pair)
        logging.info(f"添加文件对 {len(self.file_pairs)}: {document_name}")
        return True

    def add_multiple_pairs_from_directory(self, pdf_dir: str, manual_dir: str,
                                         pdf_pattern: str = "*.pdf",
                                         manual_pattern: str = "*.xlsx"):
        """
        从目录批量添加文件对（按文件名匹配）

        Args:
            pdf_dir: PDF文件目录
            manual_dir: Manual文件目录
            pdf_pattern: PDF文件匹配模式
            manual_pattern: Manual文件匹配模式
        """
        pdf_dir = Path(pdf_dir)
        manual_dir = Path(manual_dir)

        if not pdf_dir.exists() or not manual_dir.exists():
            logging.error(f"目录不存在: {pdf_dir} 或 {manual_dir}")
            return 0

        # 获取所有PDF文件
        pdf_files = list(pdf_dir.glob(pdf_pattern))
        added_count = 0

        for pdf_file in pdf_files:
            # 尝试找到对应的Manual文件
            base_name = pdf_file.stem

            # 尝试多种匹配模式
            possible_manual_names = [
                f"{base_name}.xlsx",
                f"{base_name}_manual.xlsx",
                f"manual_{base_name}.xlsx",
                f"{base_name}.xls"
            ]

            manual_file = None
            for manual_name in possible_manual_names:
                potential_manual = manual_dir / manual_name
                if potential_manual.exists():
                    manual_file = potential_manual
                    break

            if manual_file:
                if self.add_file_pair(str(pdf_file), str(manual_file), base_name):
                    added_count += 1
            else:
                logging.warning(f"未找到 {base_name} 对应的Manual文件")

        logging.info(f"从目录批量添加了 {added_count} 个文件对")
        return added_count

    def list_file_pairs(self):
        """显示所有已添加的文件对"""
        if not self.file_pairs:
            print("❌ 没有添加任何文件对")
            return

        print(f"\n📋 已添加的文件对 (共 {len(self.file_pairs)} 对):")
        print("-" * 80)
        for pair in self.file_pairs:
            print(f"{pair['doc_id']:2d}. 文档: {pair['document_name']}")
            print(f"    PDF:    {pair['pdf_path']}")
            print(f"    Manual: {pair['manual_path']}")
            print()

    def process_single_document(self, file_pair: Dict) -> Dict:
        """
        处理单个文档（PDF + Manual）

        Args:
            file_pair: 文件对信息

        Returns:
            处理结果字典
        """
        doc_name = file_pair['document_name']
        pdf_path = file_pair['pdf_path']
        manual_path = file_pair['manual_path']
        doc_id = file_pair['doc_id']

        print(f"\n{'='*60}")
        print(f"📄 处理文档 {doc_id}/{len(self.file_pairs)}: {doc_name}")
        print(f"{'='*60}")

        # 为每个文档创建独立目录
        doc_output_dir = self.current_batch_dir / f"doc_{doc_id}_{doc_name}"
        doc_output_dir.mkdir(exist_ok=True)

        start_time = time.time()
        result = {
            'doc_id': doc_id,
            'document_name': doc_name,
            'pdf_path': pdf_path,
            'manual_path': manual_path,
            'output_dir': str(doc_output_dir),
            'start_time': datetime.now().isoformat(),
            'status': 'processing'
        }

        try:
            # Step 1: 临时修改全局PDF路径
            global PDF_PATH
            original_pdf_path = PDF_PATH
            PDF_PATH = pdf_path

            print(f"📊 Step 1: 提取KPI from {Path(pdf_path).name}...")

            # Step 2: 运行KPI提取
            df_auto = process_sustainability_report_with_enhanced_images(PDF_PATH)

            # Step 3: 保存自动提取结果
            auto_excel_path = doc_output_dir / f"{doc_name}_auto_kpis.xlsx"
            save_results(df_auto, str(auto_excel_path), PDF_PATH)

            print(f"✅ 提取完成: {len(df_auto)} KPIs")

            # Step 4: 运行验证
            print(f"🔍 Step 2: 运行验证 against {Path(manual_path).name}...")
            validation_output_dir = doc_output_dir / "validation"
            validation_results = enhanced_compare_with_manual_kpis(
                df_auto, manual_path, str(validation_output_dir)
            )

            # Step 5: 收集结果
            processing_time = time.time() - start_time

            result.update({
                'status': 'completed',
                'processing_time_seconds': processing_time,
                'extracted_kpis_count': len(df_auto),
                'auto_excel_path': str(auto_excel_path),
                'validation_output_dir': str(validation_output_dir),
                'end_time': datetime.now().isoformat()
            })

            # 添加验证指标
            if validation_results and 'best_metrics' in validation_results:
                metrics = validation_results['best_metrics']
                result.update({
                    'validation_f1_score': metrics.get('f1_score', 0),
                    'validation_precision': metrics.get('precision', 0),
                    'validation_recall': metrics.get('recall', 0),
                    'true_positives': metrics.get('true_positives', 0),
                    'false_positives': metrics.get('false_positives', 0),
                    'false_negatives': metrics.get('false_negatives', 0)
                })

                print(f"🎯 验证完成:")
                print(f"   F1 Score: {metrics.get('f1_score', 0):.3f}")
                print(f"   Precision: {metrics.get('precision', 0):.3f}")
                print(f"   Recall: {metrics.get('recall', 0):.3f}")

            print(f"⏱️  处理耗时: {processing_time:.1f}秒")
            print(f"📁 结果保存到: {doc_output_dir}")

            # 恢复原始PDF路径
            PDF_PATH = original_pdf_path

        except Exception as e:
            processing_time = time.time() - start_time
            error_msg = str(e)

            result.update({
                'status': 'failed',
                'processing_time_seconds': processing_time,
                'error_message': error_msg,
                'end_time': datetime.now().isoformat()
            })

            print(f"❌ 处理失败: {error_msg}")

            # 恢复原始PDF路径
            PDF_PATH = original_pdf_path

            # 保存错误日志
            error_log_path = doc_output_dir / "error_log.txt"
            with open(error_log_path, 'w', encoding='utf-8') as f:
                f.write(f"文档: {doc_name}\n")
                f.write(f"错误时间: {datetime.now()}\n")
                f.write(f"错误信息: {error_msg}\n")
                f.write(f"PDF路径: {pdf_path}\n")
                f.write(f"Manual路径: {manual_path}\n")

        return result

    def run_batch_processing(self, max_workers: int = 1):
        """
        运行批量处理

        Args:
            max_workers: 最大并发处理数（建议保持为1，避免API限制）
        """
        if not self.file_pairs:
            print("❌ 没有要处理的文件对")
            return

        print(f"\n🚀 开始批量处理 {len(self.file_pairs)} 个文档...")
        print(f"📁 结果将保存到: {self.current_batch_dir}")

        batch_start_time = time.time()

        # 处理每个文档
        for file_pair in self.file_pairs:
            result = self.process_single_document(file_pair)
            self.batch_results.append(result)

            # 实时保存进度（防止中断丢失结果）
            self.save_batch_progress()

        # 生成最终报告
        batch_total_time = time.time() - batch_start_time
        self.generate_batch_summary(batch_total_time)

        print(f"\n🎉 批量处理完成!")
        print(f"⏱️  总耗时: {batch_total_time:.1f}秒 ({batch_total_time/60:.1f}分钟)")
        print(f"📊 处理统计: {self.get_batch_statistics()}")
        print(f"📁 完整结果查看: {self.current_batch_dir}")

    def save_batch_progress(self):
        """保存批量处理进度"""
        progress_file = self.current_batch_dir / "batch_progress.json"
        with open(progress_file, 'w', encoding='utf-8') as f:
            json.dump({
                'batch_timestamp': self.batch_timestamp,
                'file_pairs': self.file_pairs,
                'results': self.batch_results,
                'last_updated': datetime.now().isoformat()
            }, f, indent=2, ensure_ascii=False)

    def get_batch_statistics(self) -> str:
        """获取批量处理统计信息"""
        if not self.batch_results:
            return "无结果"

        total = len(self.batch_results)
        completed = len([r for r in self.batch_results if r['status'] == 'completed'])
        failed = len([r for r in self.batch_results if r['status'] == 'failed'])

        # 计算平均验证指标
        completed_results = [r for r in self.batch_results if r['status'] == 'completed']
        if completed_results:
            avg_f1 = sum(r.get('validation_f1_score', 0) for r in completed_results) / len(completed_results)
            avg_precision = sum(r.get('validation_precision', 0) for r in completed_results) / len(completed_results)
            avg_recall = sum(r.get('validation_recall', 0) for r in completed_results) / len(completed_results)
            total_kpis = sum(r.get('extracted_kpis_count', 0) for r in completed_results)
        else:
            avg_f1 = avg_precision = avg_recall = total_kpis = 0

        return f"""
        成功: {completed}/{total} ({completed/total*100:.1f}%)
        失败: {failed}/{total} ({failed/total*100:.1f}%)
        总KPI数: {total_kpis}
        平均F1: {avg_f1:.3f}
        平均精确率: {avg_precision:.3f}
        平均召回率: {avg_recall:.3f}
        """

    def generate_batch_summary(self, total_time: float):
        """生成批量处理汇总报告"""
        # 1. 保存详细结果到Excel
        results_df = pd.DataFrame(self.batch_results)
        excel_path = self.current_batch_dir / "batch_summary.xlsx"

        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
            # 主要结果
            results_df.to_excel(writer, sheet_name='处理结果', index=False)

            # 验证指标汇总
            if not results_df.empty:
                completed_df = results_df[results_df['status'] == 'completed']
                if not completed_df.empty:
                    validation_columns = ['document_name', 'extracted_kpis_count',
                                        'validation_f1_score', 'validation_precision',
                                        'validation_recall', 'true_positives',
                                        'false_positives', 'false_negatives']

                    validation_df = completed_df[validation_columns].copy()
                    validation_df.to_excel(writer, sheet_name='验证指标', index=False)

        # 2. 生成Markdown报告
        report_path = self.current_batch_dir / "batch_report.md"
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(f"""# 批量KPI提取与验证报告

## 处理概览
- **处理时间**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
- **批次ID**: {self.batch_timestamp}
- **总文档数**: {len(self.file_pairs)}
- **总耗时**: {total_time:.1f}秒 ({total_time/60:.1f}分钟)

## 处理统计
{self.get_batch_statistics()}

## 详细结果

| 文档ID | 文档名称 | 状态 | KPI数量 | F1分数 | 精确率 | 召回率 | 处理时间(秒) |
|--------|----------|------|---------|--------|--------|--------|-------------|
""")

            for result in self.batch_results:
                f.write(f"| {result['doc_id']} | {result['document_name']} | {result['status']} | "
                       f"{result.get('extracted_kpis_count', 'N/A')} | "
                       f"{result.get('validation_f1_score', 0):.3f} | "
                       f"{result.get('validation_precision', 0):.3f} | "
                       f"{result.get('validation_recall', 0):.3f} | "
                       f"{result.get('processing_time_seconds', 0):.1f} |\n")

            if any(r['status'] == 'failed' for r in self.batch_results):
                f.write(f"\n## 失败的文档\n")
                for result in self.batch_results:
                    if result['status'] == 'failed':
                        f.write(f"- **{result['document_name']}**: {result.get('error_message', '未知错误')}\n")

        print(f"📋 批量处理报告生成: {report_path}")
        print(f"📊 详细结果Excel: {excel_path}")


In [33]:
class FileUploadManager:
    """文件上传和管理器"""

    def __init__(self):
        self.uploaded_files = []
        self.pdf_files = []
        self.manual_files = []
        self.file_pairs = []

        # 创建工作目录
        self.work_dir = Path("/content/kpi_files")
        self.work_dir.mkdir(exist_ok=True)

        print(f"📁 工作目录: {self.work_dir}")

    def upload_files_directly(self):
        """直接上传文件到Colab"""
        print("📤 请选择要上传的文件...")
        print("可以同时选择多个PDF和Excel文件")

        uploaded = files.upload()

        for filename, content in uploaded.items():
            file_path = self.work_dir / filename
            with open(file_path, 'wb') as f:
                f.write(content)

            self.uploaded_files.append(str(file_path))
            print(f"✅ 已上传: {filename}")

        self._categorize_files()
        return len(uploaded)

    def mount_google_drive(self):
        """挂载Google Drive"""
        try:
            drive.mount('/content/drive')
            print("✅ Google Drive已挂载")
            print("📁 你的文件在: /content/drive/MyDrive/")
            return True
        except Exception as e:
            print(f"❌ Drive挂载失败: {e}")
            return False

    def scan_drive_directory(self, drive_path: str):
        """扫描Drive目录中的文件"""
        drive_path = Path(drive_path)

        if not drive_path.exists():
            print(f"❌ 目录不存在: {drive_path}")
            return 0

        # 扫描PDF文件
        pdf_files = list(drive_path.glob("*.pdf"))
        excel_files = list(drive_path.glob("*.xlsx")) + list(drive_path.glob("*.xls"))

        print(f"📊 发现文件:")
        print(f"   PDF文件: {len(pdf_files)}个")
        print(f"   Excel文件: {len(excel_files)}个")

        # 复制到工作目录
        for pdf_file in pdf_files:
            dest = self.work_dir / pdf_file.name
            shutil.copy2(pdf_file, dest)
            self.uploaded_files.append(str(dest))
            print(f"📄 复制PDF: {pdf_file.name}")

        for excel_file in excel_files:
            dest = self.work_dir / excel_file.name
            shutil.copy2(excel_file, dest)
            self.uploaded_files.append(str(dest))
            print(f"📊 复制Excel: {excel_file.name}")

        self._categorize_files()
        return len(pdf_files) + len(excel_files)

    def _categorize_files(self):
        """分类文件"""
        self.pdf_files = []
        self.manual_files = []

        for file_path in self.uploaded_files:
            path = Path(file_path)
            if path.suffix.lower() == '.pdf':
                self.pdf_files.append(file_path)
            elif path.suffix.lower() in ['.xlsx', '.xls']:
                self.manual_files.append(file_path)

        print(f"\n📋 文件分类完成:")
        print(f"   PDF文件: {len(self.pdf_files)}个")
        print(f"   Manual文件: {len(self.manual_files)}个")

    def auto_match_files(self) -> List[Tuple[str, str, str]]:
        """自动匹配PDF和Manual文件"""
        matches = []

        for pdf_path in self.pdf_files:
            pdf_name = Path(pdf_path).stem

            # 尝试多种匹配模式
            potential_matches = []

            for manual_path in self.manual_files:
                manual_name = Path(manual_path).stem

                # 匹配模式1: 完全相同
                if pdf_name.lower() == manual_name.lower():
                    potential_matches.append((manual_path, 1.0, "完全匹配"))

                # 匹配模式2: PDF名称_manual
                elif manual_name.lower() == f"{pdf_name.lower()}_manual":
                    potential_matches.append((manual_path, 0.9, "后缀匹配"))

                # 匹配模式3: manual_PDF名称
                elif manual_name.lower() == f"manual_{pdf_name.lower()}":
                    potential_matches.append((manual_path, 0.9, "前缀匹配"))

                # 匹配模式4: 包含关系
                elif pdf_name.lower() in manual_name.lower() or manual_name.lower() in pdf_name.lower():
                    potential_matches.append((manual_path, 0.7, "部分匹配"))

            # 选择最佳匹配
            if potential_matches:
                best_match = max(potential_matches, key=lambda x: x[1])
                matches.append((pdf_path, best_match[0], pdf_name))
                print(f"✅ 匹配: {pdf_name} → {Path(best_match[0]).name} ({best_match[2]})")
            else:
                print(f"❌ 未找到匹配: {pdf_name}")

        self.file_pairs = matches
        return matches

    def manual_pair_files(self):
        """手动配对文件"""
        print("\n🔧 手动文件配对")
        print("=" * 40)

        self.file_pairs = []

        print("可用的PDF文件:")
        for i, pdf_path in enumerate(self.pdf_files, 1):
            print(f"  {i}. {Path(pdf_path).name}")

        print("\n可用的Manual文件:")
        for i, manual_path in enumerate(self.manual_files, 1):
            print(f"  {i}. {Path(manual_path).name}")

        for pdf_path in self.pdf_files:
            pdf_name = Path(pdf_path).name
            print(f"\n为PDF文件 '{pdf_name}' 选择Manual文件:")

            for i, manual_path in enumerate(self.manual_files, 1):
                print(f"  {i}. {Path(manual_path).name}")

            try:
                choice = int(input("请输入Manual文件编号 (0跳过): "))
                if choice > 0 and choice <= len(self.manual_files):
                    manual_path = self.manual_files[choice - 1]
                    doc_name = Path(pdf_path).stem
                    self.file_pairs.append((pdf_path, manual_path, doc_name))
                    print(f"✅ 配对成功: {pdf_name} → {Path(manual_path).name}")
                else:
                    print(f"⏭️ 跳过: {pdf_name}")
            except ValueError:
                print(f"⏭️ 输入无效，跳过: {pdf_name}")

    def validate_manual_files(self) -> Dict[str, bool]:
        """验证Manual文件格式"""
        validation_results = {}

        print("\n🔍 验证Manual文件格式...")

        for manual_path in self.manual_files:
            file_name = Path(manual_path).name
            try:
                df = pd.read_excel(manual_path)

                # 检查必需列
                required_columns = ['kpi_text']
                missing_columns = [col for col in required_columns if col not in df.columns]

                if missing_columns:
                    print(f"❌ {file_name}: 缺少列 {missing_columns}")
                    validation_results[manual_path] = False
                else:
                    # 检查数据
                    non_empty_rows = df['kpi_text'].notna().sum()
                    print(f"✅ {file_name}: {len(df)}行数据, {non_empty_rows}个有效KPI")
                    validation_results[manual_path] = True

            except Exception as e:
                print(f"❌ {file_name}: 读取失败 - {e}")
                validation_results[manual_path] = False

        return validation_results

    def show_file_summary(self):
        """显示文件汇总"""
        print(f"\n📊 文件上传汇总")
        print("=" * 40)
        print(f"总文件数: {len(self.uploaded_files)}")
        print(f"PDF文件: {len(self.pdf_files)}")
        print(f"Manual文件: {len(self.manual_files)}")
        print(f"配对文件: {len(self.file_pairs)}")

        if self.file_pairs:
            print(f"\n📋 配对结果:")
            for i, (pdf_path, manual_path, doc_name) in enumerate(self.file_pairs, 1):
                print(f"  {i}. {doc_name}")
                print(f"     PDF: {Path(pdf_path).name}")
                print(f"     Manual: {Path(manual_path).name}")

    def get_file_pairs_for_batch_processing(self) -> List[Tuple[str, str, str]]:
        """获取用于批量处理的文件对"""
        return self.file_pairs

    def create_batch_processor_from_uploads(self):
        """从上传的文件创建批量处理器"""
        if not self.file_pairs:
            print("❌ 没有可用的文件对")
            return None

        # 导入批量处理器
        from your_main_script import create_batch_processor  # 需要替换为实际的导入

        processor = create_batch_processor()

        for pdf_path, manual_path, doc_name in self.file_pairs:
            processor.add_file_pair(pdf_path, manual_path, doc_name)

        return processor

In [34]:
# ============ 便捷使用函数 ============

def create_batch_processor():
    """创建批量处理器的便捷函数"""
    return BatchKPIProcessor()

def quick_batch_from_directories(pdf_dir: str, manual_dir: str):
    """快速从目录创建批量处理"""
    processor = BatchKPIProcessor()

    # 添加文件对
    added_count = processor.add_multiple_pairs_from_directory(pdf_dir, manual_dir)

    if added_count == 0:
        print("❌ 没有找到匹配的PDF和Manual文件对")
        return None

    # 显示文件列表
    processor.list_file_pairs()

    # 询问是否继续
    response = input(f"\n是否开始处理这 {added_count} 个文档? (y/n): ")
    if response.lower() == 'y':
        processor.run_batch_processing()
        return processor
    else:
        print("取消批量处理")
        return processor

def manual_batch_setup():
    """手动设置批量处理的交互式函数"""
    processor = BatchKPIProcessor()

    print("📋 手动批量处理设置")
    print("=" * 40)

    while True:
        print(f"\n当前已添加 {len(processor.file_pairs)} 个文件对")
        print("1. 添加单个文件对")
        print("2. 从目录批量添加")
        print("3. 查看已添加的文件")
        print("4. 开始批量处理")
        print("5. 退出")

        choice = input("请选择操作 (1-5): ")

        if choice == '1':
            pdf_path = input("PDF文件路径: ")
            manual_path = input("Manual文件路径: ")
            doc_name = input("文档名称 (回车使用默认): ").strip()

            if not doc_name:
                doc_name = None

            processor.add_file_pair(pdf_path, manual_path, doc_name)

        elif choice == '2':
            pdf_dir = input("PDF文件目录: ")
            manual_dir = input("Manual文件目录: ")
            added = processor.add_multiple_pairs_from_directory(pdf_dir, manual_dir)
            print(f"添加了 {added} 个文件对")

        elif choice == '3':
            processor.list_file_pairs()

        elif choice == '4':
            if processor.file_pairs:
                processor.run_batch_processing()
                break
            else:
                print("❌ 没有添加任何文件对")

        elif choice == '5':
            print("退出批量处理设置")
            break
        else:
            print("无效选择，请重试")

    return processor

In [35]:
# Enhanced comparison function for the main code
def enhanced_compare_with_manual_kpis(df_auto: pd.DataFrame, manual_xlsx_path: str,
                                     output_dir: str = "validation_results") -> Dict[str, any]:
    """
    Enhanced comparison with manual KPIs using the validation pipeline

    Args:
        df_auto: DataFrame with automatically extracted KPIs
        manual_xlsx_path: Path to manual KPI annotations
        output_dir: Directory to save validation results

    Returns:
        Comprehensive validation results
    """
    if not Path(manual_xlsx_path).exists():
        logging.warning(f"Manual KPI file not found: {manual_xlsx_path}")
        return {}

    # Save auto KPIs to temporary file for validation pipeline
    temp_auto_path = Path(output_dir) / "temp_auto_kpis.xlsx"
    temp_auto_path.parent.mkdir(exist_ok=True)
    df_auto.to_excel(temp_auto_path, index=False)

    try:
        # Initialize and run validation pipeline
        validator = KPIValidationPipeline(
            manual_excel_path=manual_xlsx_path,
            auto_excel_path=str(temp_auto_path),
            output_dir=output_dir
        )

        # Run full validation
        results = validator.run_full_validation()

        # Clean up temporary file
        if temp_auto_path.exists():
            temp_auto_path.unlink()

        return results

    except Exception as e:
        logging.error(f"Enhanced validation failed: {e}")
        # Clean up temporary file
        if temp_auto_path.exists():
            temp_auto_path.unlink()
        return {}


# Integration function for the main pipeline
def run_kpi_extraction_with_validation():
    """Run KPI extraction with comprehensive validation"""
    print("🚀 Starting KPI extraction with automated validation...")

    # Validate environment
    if not validate_environment():
        print("Please fix the environment issues before running.")
        return None

    try:
        # Step 1: Run KPI extraction
        print("\n📊 Step 1: Extracting KPIs...")
        df_auto = process_sustainability_report_with_enhanced_images(PDF_PATH)

        # Save auto results
        save_results(df_auto, EXPORT_AUTO_XLSX, PDF_PATH)
        print(f"✅ Extracted {len(df_auto)} KPIs and saved to {EXPORT_AUTO_XLSX}")

        # Step 2: Run validation if manual file exists
        if MANUAL_XLSX and Path(MANUAL_XLSX).exists():
            print(f"\n🔍 Step 2: Running validation against {MANUAL_XLSX}...")
            validation_results = enhanced_compare_with_manual_kpis(
                df_auto, MANUAL_XLSX, "validation_results"
            )

            if validation_results:
                best_metrics = validation_results['best_metrics']
                print(f"\n🎯 Validation completed!")
                print(f"   F1 Score: {best_metrics['f1_score']:.3f}")
                print(f"   Precision: {best_metrics['precision']:.3f}")
                print(f"   Recall: {best_metrics['recall']:.3f}")

                return {
                    'extracted_kpis': df_auto,
                    'validation_results': validation_results
                }
            else:
                print("⚠️ Validation failed, but extraction completed successfully")
                return {'extracted_kpis': df_auto}
        else:
            print(f"\n⚠️ Manual KPI file not found ({MANUAL_XLSX}), skipping validation")
            return {'extracted_kpis': df_auto}

    except Exception as e:
        print(f"❌ Pipeline failed: {e}")
        import traceback
        traceback.print_exc()
        return None


# Batch validation function for multiple documents
def run_batch_validation(pdf_list: List[str], manual_list: List[str],
                        output_base_dir: str = "batch_validation"):
    """
    Run validation across multiple PDF documents

    Args:
        pdf_list: List of PDF file paths
        manual_list: List of corresponding manual annotation files
        output_base_dir: Base directory for validation results
    """
    batch_results = []

    for i, (pdf_path, manual_path) in enumerate(zip(pdf_list, manual_list)):
        print(f"\n{'='*60}")
        print(f"Processing document {i+1}/{len(pdf_list)}: {Path(pdf_path).name}")
        print(f"{'='*60}")

        try:
            # Set up paths for this document
            doc_name = Path(pdf_path).stem
            doc_output_dir = Path(output_base_dir) / doc_name
            doc_output_dir.mkdir(parents=True, exist_ok=True)

            # Extract KPIs
            global PDF_PATH
            original_pdf_path = PDF_PATH
            PDF_PATH = pdf_path

            df_auto = process_sustainability_report_with_enhanced_images(PDF_PATH)
            auto_excel_path = doc_output_dir / f"{doc_name}_auto_kpis.xlsx"
            save_results(df_auto, str(auto_excel_path), PDF_PATH)

            # Run validation
            validation_results = enhanced_compare_with_manual_kpis(
                df_auto, manual_path, str(doc_output_dir / "validation")
            )

            # Store results
            doc_result = {
                'document': doc_name,
                'pdf_path': pdf_path,
                'manual_path': manual_path,
                'extracted_kpis': len(df_auto),
                'validation_results': validation_results.get('best_metrics', {}),
                'output_dir': str(doc_output_dir)
            }
            batch_results.append(doc_result)

            # Restore original PDF path
            PDF_PATH = original_pdf_path

        except Exception as e:
            print(f"❌ Failed to process {doc_name}: {e}")
            batch_results.append({
                'document': doc_name,
                'pdf_path': pdf_path,
                'manual_path': manual_path,
                'error': str(e)
            })

    # Generate batch summary
    batch_summary_path = Path(output_base_dir) / "batch_summary.xlsx"
    batch_df = pd.DataFrame(batch_results)
    batch_df.to_excel(batch_summary_path, index=False)

    print(f"\n🎉 Batch validation completed!")
    print(f"📊 Processed {len(pdf_list)} documents")
    print(f"📁 Results saved to {output_base_dir}")
    print(f"📋 Summary available at {batch_summary_path}")

    return batch_results


# Quick validation function for testing
def quick_validation_test(manual_xlsx: str = None, auto_xlsx: str = None):
    """Quick validation test with existing files"""
    manual_file = manual_xlsx or MANUAL_XLSX
    auto_file = auto_xlsx or EXPORT_AUTO_XLSX

    if not Path(manual_file).exists():
        print(f"❌ Manual file not found: {manual_file}")
        return None

    if not Path(auto_file).exists():
        print(f"❌ Auto file not found: {auto_file}")
        return None

    print(f"🔍 Quick validation test:")
    print(f"  Manual: {manual_file}")
    print(f"  Auto: {auto_file}")

    try:
        validator = KPIValidationPipeline(
            manual_excel_path=manual_file,
            auto_excel_path=auto_file,
            output_dir="quick_validation"
        )

        results = validator.run_full_validation()
        return results

    except Exception as e:
        print(f"❌ Quick validation failed: {e}")
        return None


# Performance benchmarking function
def benchmark_extraction_methods():
    """Benchmark different extraction methods with validation"""
    methods = {
        'text_only': process_text_only,
        'with_images': process_sustainability_report_with_enhanced_images,
        'optimized': process_sustainability_report_OPTIMIZED
    }

    benchmark_results = {}

    for method_name, method_func in methods.items():
        print(f"\n🧪 Benchmarking {method_name}...")

        try:
            import time
            start_time = time.time()

            # Run extraction
            df_result = method_func(PDF_PATH)
            extraction_time = time.time() - start_time

            # Save results
            method_output = f"{method_name}_{EXPORT_AUTO_XLSX}"
            save_results(df_result, method_output, PDF_PATH)

            # Run validation if manual file exists
            validation_metrics = {}
            if MANUAL_XLSX and Path(MANUAL_XLSX).exists():
                validation_results = enhanced_compare_with_manual_kpis(
                    df_result, MANUAL_XLSX, f"benchmark_{method_name}"
                )
                if validation_results:
                    validation_metrics = validation_results['best_metrics']

            benchmark_results[method_name] = {
                'extraction_time': extraction_time,
                'kpi_count': len(df_result),
                'kpis_per_second': len(df_result) / extraction_time,
                'validation_metrics': validation_metrics
            }

            print(f"✅ {method_name}: {len(df_result)} KPIs in {extraction_time:.1f}s")
            if validation_metrics:
                print(f"   F1: {validation_metrics.get('f1_score', 0):.3f}")

        except Exception as e:
            print(f"❌ {method_name} failed: {e}")
            benchmark_results[method_name] = {'error': str(e)}

    # Save benchmark results
    benchmark_df = pd.DataFrame(benchmark_results).T
    benchmark_df.to_excel("extraction_benchmark.xlsx")

    print(f"\n🏆 Benchmark completed!")
    print(f"📊 Results saved to extraction_benchmark.xlsx")

    return benchmark_results


# Usage examples and documentation
def validation_usage_examples():
    """Show usage examples for the validation pipeline"""
    print("""
# KPI Validation Pipeline Usage Examples

## 1. Basic validation with existing files
```python
validator = KPIValidationPipeline(
    manual_excel_path="manual_kpis.xlsx",
    auto_excel_path="auto_kpis.xlsx"
)
results = validator.run_full_validation()
```

## 2. Integrated extraction + validation
```python
results = run_kpi_extraction_with_validation()
```

## 3. Quick validation test
```python
results = quick_validation_test("manual.xlsx", "auto.xlsx")
```

## 4. Batch validation for multiple documents
```python
pdf_files = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
manual_files = ["manual1.xlsx", "manual2.xlsx", "manual3.xlsx"]
batch_results = run_batch_validation(pdf_files, manual_files)
```

## 5. Benchmark different extraction methods
```python
benchmark_results = benchmark_extraction_methods()
```

## 6. Custom threshold analysis
```python
validator = KPIValidationPipeline("manual.xlsx", "auto.xlsx")
validator.run_comprehensive_evaluation()

# Check performance at different thresholds
for threshold in [0.5, 0.7, 0.9]:
    metrics = validator.calculate_metrics_at_threshold(threshold)
    print(f"Threshold {threshold}: F1={metrics['f1_score']:.3f}")
```

## Output Files Generated:
- validation_results.json - Complete results in JSON format
- detailed_matches.xlsx - All matched KPIs with similarity scores
- error_analysis.xlsx - False positives and false negatives
- validation_report.md - Human-readable report
- threshold_analysis.xlsx - Performance across different thresholds
- validation_visualizations.png - Comprehensive charts and graphs

## Key Metrics Explained:
- **Precision**: % of auto KPIs that match manual annotations
- **Recall**: % of manual KPIs found by automatic extraction
- **F1 Score**: Harmonic mean of precision and recall
- **True Positives**: Correctly identified KPIs
- **False Positives**: Auto KPIs not in manual annotations
- **False Negatives**: Manual KPIs missed by extraction
""")

In [36]:
# ============ Execution entry ============
#if __name__ == "__main__":
    # Uncomment to install dependencies first
    # install_dependencies()
    # Uncomment to see usage examples
    # example_usage()
    # Run the main extraction
    #run_kpi_extraction()
    # 方式1: Run optimized version (recommended)
    #run_optimized_kpi_extraction()
    #run_debugging_session()
    #apply_universal_fix()
    # 测试1: 检查环境
    #validate_environment()
    # 方式1: 运行带验证的主函数（当前）
    #main()
    # 或者选择以下任一方式：
    # 方式2: 完整的提取+验证流程
    # results = run_kpi_extraction_with_validation()
    # 方式3: 优化版本
    # results = run_optimized_kpi_extraction()
    # 方式4: 快速验证测试
    # results = quick_validation_test()

In [37]:
if __name__ == "__main__":
    # 选择处理模式
    print("🚀 KPI提取系统")
    print("=" * 40)
    print("1. 单个PDF处理 (原有模式)")
    print("2. 批量PDF处理")
    print("3. 快速批量处理 (从目录)")
    print("4. 交互式批量设置")
    print("5. 查看批量处理示例")

    try:
        choice = input("请选择处理模式 (1-5): ")

        if choice == '1':
            # 原有的单个PDF处理
            main()

        elif choice == '2':
            # 手动批量处理
            processor = create_batch_processor()

            # 让用户手动添加文件对
            while True:
                pdf_path = input("输入PDF文件路径 (回车结束): ").strip()
                if not pdf_path:
                    break
                manual_path = input("输入对应的Manual文件路径: ").strip()
                doc_name = input("文档名称 (回车使用默认): ").strip() or None

                processor.add_file_pair(pdf_path, manual_path, doc_name)

            if processor.file_pairs:
                processor.list_file_pairs()
                processor.run_batch_processing()
            else:
                print("❌ 没有添加任何文件对")

        elif choice == '3':
            # 快速目录批量处理
            pdf_dir = input("PDF文件目录路径: ").strip()
            manual_dir = input("Manual文件目录路径: ").strip()
            quick_batch_from_directories(pdf_dir, manual_dir)

        elif choice == '4':
            # 交互式批量设置
            manual_batch_setup()

        elif choice == '5':
            # 显示使用示例
            batch_processing_examples()

        else:
            print("无效选择，运行默认单个PDF处理")
            main()

    except KeyboardInterrupt:
        print("\n用户取消操作")
    except Exception as e:
        print(f"执行出错: {e}")
        print("运行默认单个PDF处理")
        main()

🚀 KPI提取系统
1. 单个PDF处理 (原有模式)
2. 批量PDF处理
3. 快速批量处理 (从目录)
4. 交互式批量设置
5. 查看批量处理示例
请选择处理模式 (1-5): 2
输入PDF文件路径 (回车结束): /content
输入对应的Manual文件路径: /content
文档名称 (回车使用默认): 
输入PDF文件路径 (回车结束): /content/document_C.pdf
输入对应的Manual文件路径: /content/document_C.xlsx
文档名称 (回车使用默认): 
输入PDF文件路径 (回车结束): /content/document_F.pdf
输入对应的Manual文件路径: /content/document_F.xlsx
文档名称 (回车使用默认): 
输入PDF文件路径 (回车结束): /content/document_N.pdf
输入对应的Manual文件路径: /content/document_N.xlsx
文档名称 (回车使用默认): 
输入PDF文件路径 (回车结束): 


ERROR:root:Error opening PDF file: [Errno 21] Is a directory: '/content'



📋 已添加的文件对 (共 4 对):
--------------------------------------------------------------------------------
 1. 文档: content
    PDF:    /content
    Manual: /content

 2. 文档: document_C
    PDF:    /content/document_C.pdf
    Manual: /content/document_C.xlsx

 3. 文档: document_F
    PDF:    /content/document_F.pdf
    Manual: /content/document_F.xlsx

 4. 文档: document_N
    PDF:    /content/document_N.pdf
    Manual: /content/document_N.xlsx


🚀 开始批量处理 4 个文档...
📁 结果将保存到: batch_kpi_results/batch_20250728_220458

📄 处理文档 1/4: content
📊 Step 1: 提取KPI from content...
❌ 处理失败: [Errno 21] Is a directory: '/content'

📄 处理文档 2/4: document_C
📊 Step 1: 提取KPI from document_C.pdf...




✅ 提取完成: 10 KPIs
🔍 Step 2: 运行验证 against document_C.xlsx...

KPI EXTRACTION VALIDATION SUMMARY
📊 Dataset: 6 manual vs 10 auto KPIs
🎯 Best F1 Score: 0.625
📈 Precision: 0.500
📉 Recall: 0.833
✅ True Positives: 5
❌ False Positives: 5
⚠️  False Negatives: 1
📁 Results saved to: batch_kpi_results/batch_20250728_220458/doc_2_document_C/validation
🎯 验证完成:
   F1 Score: 0.625
   Precision: 0.500
   Recall: 0.833
⏱️  处理耗时: 108.1秒
📁 结果保存到: batch_kpi_results/batch_20250728_220458/doc_2_document_C

📄 处理文档 3/4: document_F
📊 Step 1: 提取KPI from document_F.pdf...




✅ 提取完成: 19 KPIs
🔍 Step 2: 运行验证 against document_F.xlsx...

KPI EXTRACTION VALIDATION SUMMARY
📊 Dataset: 11 manual vs 19 auto KPIs
🎯 Best F1 Score: 0.600
📈 Precision: 0.474
📉 Recall: 0.818
✅ True Positives: 9
❌ False Positives: 10
⚠️  False Negatives: 2
📁 Results saved to: batch_kpi_results/batch_20250728_220458/doc_3_document_F/validation
🎯 验证完成:
   F1 Score: 0.600
   Precision: 0.474
   Recall: 0.818
⏱️  处理耗时: 158.6秒
📁 结果保存到: batch_kpi_results/batch_20250728_220458/doc_3_document_F

📄 处理文档 4/4: document_N
📊 Step 1: 提取KPI from document_N.pdf...




✅ 提取完成: 60 KPIs
🔍 Step 2: 运行验证 against document_N.xlsx...

KPI EXTRACTION VALIDATION SUMMARY
📊 Dataset: 46 manual vs 60 auto KPIs
🎯 Best F1 Score: 0.868
📈 Precision: 0.767
📉 Recall: 1.000
✅ True Positives: 46
❌ False Positives: 14
⚠️  False Negatives: 0
📁 Results saved to: batch_kpi_results/batch_20250728_220458/doc_4_document_N/validation
🎯 验证完成:
   F1 Score: 0.868
   Precision: 0.767
   Recall: 1.000
⏱️  处理耗时: 461.8秒
📁 结果保存到: batch_kpi_results/batch_20250728_220458/doc_4_document_N
📋 批量处理报告生成: batch_kpi_results/batch_20250728_220458/batch_report.md
📊 详细结果Excel: batch_kpi_results/batch_20250728_220458/batch_summary.xlsx

🎉 批量处理完成!
⏱️  总耗时: 728.5秒 (12.1分钟)
📊 处理统计: 
        成功: 3/4 (75.0%)
        失败: 1/4 (25.0%)
        总KPI数: 89
        平均F1: 0.698
        平均精确率: 0.580
        平均召回率: 0.884
        
📁 完整结果查看: batch_kpi_results/batch_20250728_220458
