In [None]:
import os
import json
import base64
import tempfile
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
import pandas as pd
import shutil

import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from openai import OpenAI
import openai

# Input folder with the PDFs to process and the CSV file the pipeline writes.
PDF_DIR = Path("pdf_folder")  # Ensure this path contains your PDFs
OUTPUT_CSV = Path("output/full_pipeline_results.csv")
Path("output").mkdir(parents=True, exist_ok=True)

# Read variables from a local .env file into the process environment, then
# build the shared OpenAI client from the OPENAI_API_KEY variable.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def extract_text_with_ocr(pdf_path, front_n=5, back_n=5, dpi=400):
    """OCR the first ``front_n`` and last ``back_n`` pages of a PDF.

    Args:
        pdf_path: Path to the PDF file (converted with ``str()`` before use).
        front_n: Number of leading pages to OCR.
        back_n: Number of trailing pages to OCR.
        dpi: Resolution passed to ``convert_from_path`` for rasterisation.

    Returns:
        The recognised text of the selected pages joined with newlines, or a
        string starting with ``"OCR ERROR"`` describing the failure. This
        function reports failures through its return value, not by raising.
    """
    from pdf2image.exceptions import PDFPageCountError

    try:
        with tempfile.TemporaryDirectory() as path:
            try:
                # NOTE(review): no first_page/last_page is passed, so every
                # page is rasterised before the front/back selection below.
                images = convert_from_path(str(pdf_path), dpi=dpi, output_folder=path)
            except PDFPageCountError as e:
                return f"OCR ERROR: PDF structure invalid – {str(e)}"
            except Exception as e:
                return f"OCR ERROR: {str(e)}"

            total_pages = len(images)
            if total_pages == 0:
                return "OCR ERROR: No images extracted"

            # Pick the front and back windows without overlap: the back
            # window never starts before the end of the front window, so a
            # short document is OCR'd once per page. This also makes
            # back_n == 0 select no trailing pages (images[-0:] would have
            # selected all of them).
            front_count = min(max(front_n, 0), total_pages)
            back_start = max(front_count, total_pages - max(back_n, 0))
            selected = images[:front_count] + images[back_start:]

            # Greyscale conversion before OCR.
            texts = [
                pytesseract.image_to_string(img.convert("L"), lang="eng")
                for img in selected
            ]
            return "\n".join(texts)

    except Exception as e:
        return f"OCR ERROR (outer): {str(e)}"


def extract_front_back_text(pdf_path, front_n=5, back_n=5, dpi=400):
    """Extract embedded text from the first and last pages of a PDF.

    Falls back to ``extract_text_with_ocr`` when the PDF cannot be read or
    the extracted text is shorter than 100 non-blank characters.

    Args:
        pdf_path: Path (``str`` or ``Path``) to the PDF file.
        front_n: Number of leading pages to read.
        back_n: Number of trailing pages to read.
        dpi: Resolution forwarded to the OCR fallback only.

    Returns:
        The page texts joined with newlines, or whatever the OCR fallback
        returns (which may be an ``"OCR ERROR ..."`` string).
    """
    try:
        # The context manager closes the document even if get_text() raises.
        with fitz.open(str(pdf_path)) as doc:
            page_count = len(doc)
            # Front and back windows must not overlap; otherwise pages of a
            # short document are read twice and the duplicated text inflates
            # the length check below.
            front_count = min(max(front_n, 0), page_count)
            back_start = max(front_count, page_count - max(back_n, 0))
            page_numbers = [*range(front_count), *range(back_start, page_count)]
            texts = [doc[i].get_text() for i in page_numbers]

        full_text = "\n".join(texts)
        if len(full_text.strip()) < 100:
            raise ValueError("Too short, fallback to OCR.")
        return full_text
    except Exception as e:
        # Path(...) keeps the message working when a plain string is passed.
        print(f"⚠️ Fallback to OCR on: {Path(pdf_path).name} due to {str(e)}")
        return extract_text_with_ocr(pdf_path, front_n, back_n, dpi)

# === Task 1: Report-type classification ===
def classify_report_type(text):
    """Classify a report from its extracted text using the chat model.

    Args:
        text: Text extracted from the report; it is embedded verbatim in the
            prompt.

    Returns:
        The JSON object parsed from the model reply (the prompt asks for the
        keys ``reasoning``, ``report_type``, ``has_sustainability_section``
        and ``sustainability_section_name``). On any failure, a dict with
        ``report_type`` set to ``"ERROR"`` and the exception text in
        ``reasoning``. This function does not raise.
    """
    system_prompt = "You are an expert in classification."

    user_prompt = f"""
The following text is extracted from a corporate report (typically the first and last few pages, or a table of contents).

Your tasks:

1. Classify the report into one of the following types:
- "sustainability report": The document primarily focuses on ESG, sustainability, CSR, or GRI/SDGs-related topics. It does NOT contain full audited financial statements.
- "annual report": The document primarily contains audited financial disclosures (e.g., income statement, cash flow statement, balance sheet, auditor’s report). Any ESG content is minimal, scattered, or clearly secondary.
- "integrated report": The document contains BOTH full audited financial disclosures AND a dedicated or structured sustainability section (e.g. a chapter explicitly named “Sustainability”, “ESG”, or “CSR” in the table of contents or headings). ESG content must be substantial and integrated into the structure, not just mentioned briefly.
- "other": The document does not match any of the above categories (e.g., promotional brochures, regulatory filings, environmental statements, etc.)

2. Important judgment rules:
- If the document contains a clear section titled “Sustainability”, “ESG”, “CSR”, “GRI” etc., and this content is more than a few paragraphs, treat it as substantial.
- If BOTH financials and structured sustainability content are found, classify as "integrated report", regardless of the title.
- Do NOT rely solely on the report title (“Annual Report” / “Sustainability Report”) to decide. Always cross-check contents.

3. Also indicate whether a clear sustainability section is present, and extract the section name if available.

Return a JSON object with the following structure:
{{
  "reasoning": "...",
  "report_type": "...",
  "has_sustainability_section": true/false,
  "sustainability_section_name": "..."
}}

Content:
{text}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0
        )
        reply = (response.choices[0].message.content or "").strip()
        # Tolerate replies wrapped in a Markdown fence (``` or ```json), which
        # would otherwise fail JSON parsing and produce an ERROR row.
        if reply.startswith("```"):
            reply = reply.strip("`").strip()
            if reply[:4].lower() == "json":
                reply = reply[4:]
        parsed = json.loads(reply)
        # Callers use .get() on the result, so anything but an object is an error.
        if not isinstance(parsed, dict):
            raise ValueError(
                f"Expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed
    except Exception as e:
        return {
            "report_type": "ERROR",
            "has_sustainability_section": False,
            "sustainability_section_name": "",
            "reasoning": str(e)
        }

    

# === Task 2: Reporting-period detection with the vision model ===
def encode_image_to_base64(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    with open(image_path, mode="rb") as image_file:
        raw_bytes = image_file.read()
    return str(base64.b64encode(raw_bytes), "utf-8")

def call_gpt4_vision(image_path):
    """Ask the vision model which time period a report page image covers.

    Args:
        image_path: Path to an image file; it is sent as a base64 PNG data URL.

    Returns:
        The model's stripped reply (the prompt asks for a date range, a
        fiscal year, or "NOT FOUND"), or a string starting with ``"ERROR: "``
        when the API call fails or the reply has no text content.

    Raises:
        OSError: If the image file cannot be read (the read happens before
            the guarded API call).
    """
    b64_image = encode_image_to_base64(image_path)
    vision_prompt = """
This image is from the first few pages of a corporate report.

Please identify the time period that the report covers, based on any visible text, charts, or tables.

Return examples like:
- “April 2023 – March 2024”
- “Fiscal Year Ending 31 March 2023”
- “FY2022”

If no such date range is visible, respond with “NOT FOUND”.
Return only the most specific date range or fiscal year visible.
"""
    try:
        # Use the shared, explicitly configured client like the other model
        # calls in this file, not the module-level default client.
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are an expert in the field of pdf information extraction"},
                {"role": "user", "content": [
                    {"type": "text", "text": vision_prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}}
                ]}
            ],
            max_tokens=300
        )
        content = response.choices[0].message.content
        if content is None:
            # Report a missing reply explicitly instead of an AttributeError text.
            return "ERROR: empty response from vision model"
        return content.strip()
    except Exception as e:
        return f"ERROR: {e}"

def extract_report_year_from_pdf(pdf_path, max_pages=5):
    """Find the reporting period by showing the first pages to the vision model.

    Pages 1..``max_pages`` are rendered at 300 dpi and sent one at a time to
    ``call_gpt4_vision`` until a page yields an answer.

    Args:
        pdf_path: Path to the PDF file.
        max_pages: Highest page number to render and inspect.

    Returns:
        The first reply that is neither empty, an ``"ERROR: ..."`` string,
        nor a "not found" answer. If no page yields one, the last
        ``"ERROR: ..."`` reply seen (so the row can be identified for a
        retry), otherwise ``"NOT FOUND"``. Rendering or saving failures
        return ``"ERROR: <message>"``.
    """
    last_error = None
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            images = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=max_pages, output_folder=tmpdir)
            img_path = Path(tmpdir) / "temp.png"  # reused for every page
            for img in images:
                img.save(img_path)
                result = call_gpt4_vision(img_path)
                if not result:
                    continue
                if result.startswith("ERROR:"):
                    # A failed call on one page must not be reported as the
                    # reporting period; keep scanning the remaining pages.
                    last_error = result
                    continue
                if "not found" not in result.lower():
                    return result
        except Exception as e:
            return f"ERROR: {e}"
    return last_error or "NOT FOUND"

# === Task 3: Company name identification ===

def _strip_code_fence(reply):
    """Return ``reply`` without a surrounding Markdown code fence, if any."""
    cleaned = reply.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    return cleaned


def extract_company_or_publisher_with_gpt(text):
    """Identify the issuing company of a report, or else its publisher.

    Stage 1 asks the chat model for the company name. Only when that name is
    unknown (``"UNKNOWN"``, blank, null or missing) does stage 2 ask for the
    publishing organisation instead.

    Args:
        text: Text extracted from the report; embedded verbatim in the prompts.

    Returns:
        A dict with at least ``company_name`` and ``publisher``, plus
        ``reasoning`` when the model supplies it (error results always
        include it):
        - ``company_name`` is ``""`` when unknown, ``"PARSE_ERROR"`` when the
          stage-1 reply is not valid JSON, or ``"GPT_ERROR"`` on any other
          stage-1 failure.
        - ``publisher`` is ``None`` when a company was found or stage 1
          failed; otherwise the stage-2 answer or a ``"GPT_ERROR: ..."``
          string.
        This function does not raise.
    """
    # Stage 1: try to identify the company name.
    primary_prompt = f"""
You are a corporate reporting analyst.

You are given the first and last pages of a report. Your task:
1. Identify the official **company name** that issued the report.
2. Briefly explain your reasoning (e.g., where you found the name, any indicative phrasing).

Return your answer as a JSON object with the following format:
{{
  "company_name": "...",
  "reasoning": "..."
}}

If no company name is clearly found, return:
{{
  "company_name": "UNKNOWN",
  "reasoning": "No clear indication of the issuing company in the provided text."
}}

Text:
{text}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[
                {"role": "system", "content": "You are an expert in ESG and corporate reporting."},
                {"role": "user", "content": primary_prompt}
            ],
            temperature=0
        )
        content = (response.choices[0].message.content or "").strip()
        result = json.loads(_strip_code_fence(content))
        if not isinstance(result, dict):
            raise ValueError(
                f"Expected a JSON object, got {type(result).__name__}"
            )
    except json.JSONDecodeError:
        # Must precede the generic handler: JSONDecodeError is an Exception
        # subclass, so listing it second would make this branch unreachable.
        return {
            "company_name": "PARSE_ERROR",
            "reasoning": f"Raw response: {content}",
            "publisher": None
        }
    except Exception as e:
        return {
            "company_name": "GPT_ERROR",
            "reasoning": str(e),
            "publisher": None
        }

    # A null, missing, non-string or blank name counts as unknown, so the
    # .upper() comparison cannot raise on unexpected model output.
    company_name = result.get("company_name")
    if not isinstance(company_name, str) or not company_name.strip():
        company_name = "UNKNOWN"
    is_unknown = company_name.strip().upper() == "UNKNOWN"

    # Stage 2: when the company is unknown, look for a publisher instead.
    if is_unknown:
        try:
            secondary_prompt = f"""
You are analyzing a report that is **not issued by a company**. Instead, it may be from a government agency, international organization, academic institution, or other publisher.

From the following text, identify the **publisher** or issuing organization of this report.

Return only the publisher name as a JSON object:
{{ "publisher": "..." }}

If no organization is found, return:
{{ "publisher": null }}

Text:
{text}
"""
            response2 = client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=[
                    {"role": "system", "content": "You are an expert in academic and institutional publishing."},
                    {"role": "user", "content": secondary_prompt}
                ],
                temperature=0
            )
            content2 = (response2.choices[0].message.content or "").strip()
            pub_result = json.loads(_strip_code_fence(content2))
            result["publisher"] = pub_result.get("publisher", "UNKNOWN")
        except Exception as e:
            result["publisher"] = f"GPT_ERROR: {str(e)}"
        # Report an unknown company as an empty string.
        result["company_name"] = ""
    else:
        # A company was identified, so no publisher lookup is needed.
        result["publisher"] = None

    return result

# === Main pipeline entry point ===
def run_pipeline(pdf_dir, output_csv):
    """Run classification, period detection and company lookup on every PDF.

    Args:
        pdf_dir: Directory whose ``*.pdf`` files (non-recursive) are processed.
        output_csv: Destination CSV path; its parent directory is created if
            it does not exist.

    Side effects:
        Writes one CSV row per PDF and prints a confirmation line. A PDF that
        raises an unexpected exception gets a row with ``report_type`` set to
        ``"ERROR"`` and the message in ``reasoning`` instead of aborting the
        whole run.
    """
    columns = [
        "filename",
        "report_type",
        "has_sustainability_section",
        "sustainability_section_name",
        "report_year_vision",
        "company_name",
        "publisher",
        "reasoning",
    ]
    results = []

    # Sorted so the processing order and CSV row order are reproducible.
    pdf_paths = sorted(Path(pdf_dir).glob("*.pdf"))
    for pdf_path in tqdm(pdf_paths, desc="Processing PDFs"):
        filename = pdf_path.name
        try:
            text = extract_front_back_text(pdf_path)

            # Task 1
            classification = classify_report_type(text)

            # Task 2
            report_year = extract_report_year_from_pdf(pdf_path)

            # Task 3
            company_info = extract_company_or_publisher_with_gpt(text)
        except Exception as e:
            # Top-level boundary: one bad file must not discard the results
            # already collected during a long batch run.
            results.append({
                "filename": filename,
                "report_type": "ERROR",
                "has_sustainability_section": None,
                "sustainability_section_name": None,
                "report_year_vision": None,
                "company_name": None,
                "publisher": None,
                "reasoning": f"PIPELINE ERROR: {e}"
            })
            continue

        results.append({
            "filename": filename,
            "report_type": classification.get("report_type"),
            "has_sustainability_section": classification.get("has_sustainability_section"),
            "sustainability_section_name": classification.get("sustainability_section_name"),
            "report_year_vision": report_year,
            "company_name": company_info.get("company_name"),
            "publisher": company_info.get("publisher"),
            "reasoning": company_info.get("reasoning")
        })

    # Create the directory for the path actually passed in, and pass the
    # column list so an empty run still writes a header row.
    Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
    df = pd.DataFrame(results, columns=columns)
    df.to_csv(output_csv, index=False)
    print(f"✅ Saved full pipeline results to: {output_csv}")


# Run the full pipeline with the module-level paths when executed directly.
if __name__ == "__main__":
    run_pipeline(pdf_dir=PDF_DIR, output_csv=OUTPUT_CSV)

Processing PDFs:  20%|█▉        | 251/1277 [51:55<3:13:44, 11.33s/it] 

⚠️ Fallback to OCR on: Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf due to Too short, fallback to OCR.


Processing PDFs:  20%|██        | 256/1277 [53:01<4:00:43, 14.15s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Processing PDFs:  20%|██        | 257/1277 [53:16<4:03:37, 14.33s/it]

⚠️ Fallback to OCR on: Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf due to Too short, fallback to OCR.


Processing PDFs:  28%|██▊       | 360/1277 [1:15:02<5:33:35, 21.83s/it]

⚠️ Fallback to OCR on: Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf due to Too short, fallback to OCR.


Processing PDFs:  34%|███▎      | 430/1277 [1:31:52<3:36:47, 15.36s/it]

⚠️ Fallback to OCR on: Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf due to Too short, fallback to OCR.


Processing PDFs:  39%|███▉      | 498/1277 [1:48:04<2:51:25, 13.20s/it]

⚠️ Fallback to OCR on: Unknown_adp07-sus-fr_95qx6prh.pdf due to Too short, fallback to OCR.


Processing PDFs:  53%|█████▎    | 672/1277 [2:29:30<2:39:57, 15.86s/it]

⚠️ Fallback to OCR on: Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf due to Too short, fallback to OCR.


Processing PDFs:  57%|█████▋    | 724/1277 [2:41:20<2:20:27, 15.24s/it]

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

Processing PDFs:  60%|██████    | 771/1277 [2:52:12<1:40:31, 11.92s/it]

⚠️ Fallback to OCR on: Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf due to Too short, fallback to OCR.


Processing PDFs:  70%|███████   | 896/1277 [3:19:46<1:42:05, 16.08s/it]

⚠️ Fallback to OCR on: Hyosung_Corp_SR_2020_en_8g98j6gk.pdf due to Too short, fallback to OCR.


Processing PDFs:  76%|███████▋  | 974/1277 [3:36:25<1:13:07, 14.48s/it]

⚠️ Fallback to OCR on: Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf due to Too short, fallback to OCR.


Processing PDFs:  77%|███████▋  | 987/1277 [3:39:03<59:06, 12.23s/it]  

⚠️ Fallback to OCR on: Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf due to Too short, fallback to OCR.


Processing PDFs:  87%|████████▋ | 1106/1277 [4:05:39<41:28, 14.55s/it]  

⚠️ Fallback to OCR on: EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf due to Too short, fallback to OCR.


Processing PDFs:  88%|████████▊ | 1122/1277 [4:09:00<30:51, 11.94s/it]

⚠️ Fallback to OCR on: Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf due to Too short, fallback to OCR.


Processing PDFs:  90%|████████▉ | 1149/1277 [4:14:49<23:50, 11.17s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Processing PDFs:  91%|█████████▏| 1168/1277 [4:19:00<24:22, 13.42s/it]

MuPDF error: format error: No default Layer config



Processing PDFs:  95%|█████████▍| 1209/1277 [4:28:20<16:07, 14.23s/it]

⚠️ Fallback to OCR on: Unknown_2023042101335_kyzhtmjn.pdf due to Too short, fallback to OCR.


Processing PDFs:  97%|█████████▋| 1239/1277 [4:35:40<08:24, 13.27s/it]

⚠️ Fallback to OCR on: Titan_Company_Ltd_Annual20Report202013_p4r8w07u.pdf due to Too short, fallback to OCR.


Processing PDFs:  99%|█████████▉| 1269/1277 [4:42:21<01:49, 13.64s/it]

⚠️ Fallback to OCR on: BASF_SE_2012_BASF_Report_lmq79gwn.pdf due to Too short, fallback to OCR.


Processing PDFs: 100%|██████████| 1277/1277 [4:43:40<00:00, 13.33s/it]

✅ Saved full pipeline results to: output/full_pipeline_results.csv



