In [1]:
import os
import fitz  # PyMuPDF
import openai
import pytesseract
import tempfile
import base64
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
from pdf2image import convert_from_path
from PIL import Image
from openai import OpenAI

# ========== 1. 初始化 OpenAI ==========
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ========== 2. 提取文本：前后20页 ==========
def extract_front_back_text(pdf_path, front_n=10, back_n=10):
    try:
        doc = fitz.open(pdf_path)
        texts = [doc[i].get_text() for i in range(min(front_n, len(doc)))]
        texts += [doc[i].get_text() for i in range(max(0, len(doc) - back_n), len(doc))]
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 50:
            raise ValueError("Too little text")
        return full_text
    except Exception as e:
        print(f"⚠️ PyMuPDF failed on {pdf_path.name}, switching to OCR...")
        return extract_text_with_ocr(pdf_path)

# ========== 3. OCR 补救 ==========
def extract_text_with_ocr(pdf_path, dpi=300):
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(pdf_path, dpi=dpi, output_folder=path)
        text_parts = [pytesseract.image_to_string(img) for img in images[:3] + images[-3:]]
        return "\n".join(text_parts)

# ========== 4. 构造 Prompt ==========
def build_report_year_prompt(text):
    return f"""
You are an expert assistant helping to extract reporting years from corporate reports.

Please read the following text carefully, and complete the following tasks:

1. Identify any phrase that refers to the **reporting period**, **fiscal year**, or **financial year**.
   Examples:
   - "for the year ended 31 December 2021"
   - "reporting period: April 2021 – March 2022"
   - "2011-14" or "FY2017"

2. Normalize the expression into a date range format:
   - Always use “to” instead of “–” or “-” (e.g. "April 2021 to March 2022")
   - If only year is provided (e.g. "2011-14"), convert to: "2011 to 2014"
   - If full date is provided, remove ordinal suffixes like “1st”, “2nd”, “31st”, etc.
   - Convert to long form like: "1 January 2021 to 31 December 2021"

3. If a valid expression is found, return a JSON with:
{{
  "normalized_report_year": "...",
  "original_expression": "...",
  "source": "e.g. Page 1, main text"
}}

4. If nothing is found, return:
{{
  "normalized_report_year": null,
  "original_expression": null,
  "source": "NOT FOUND"
}}

Only output the JSON object. Do not add explanations.

Text:
{text}
"""

# ========== 5. Vision 模型辅助 ==========
def encode_image_to_base64(pil_image):
    with tempfile.NamedTemporaryFile(suffix=".png") as f:
        pil_image.save(f.name, format="PNG")
        with open(f.name, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

def extract_year_from_vision(pdf_path, client, page_limit=3):
    try:
        images = convert_from_path(pdf_path, dpi=200)
        for i, img in enumerate(images[:page_limit]):
            b64 = encode_image_to_base64(img)
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "user", "content": [
                        {"type": "text", "text": f"Please extract the fiscal year or reporting period from page {i+1}. Please return in this JSON format:\n"
                                                 "{\n  \"normalized_report_year\": \"...\",\n  \"original_expression\": \"...\",\n  \"source\": \"Page {i+1}, image-based\"\n}"},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
                    ]}
                ],
                max_tokens=300
            )
            content = response.choices[0].message.content
            parsed = json.loads(content.strip()) if content.strip().startswith("{") else eval(content.strip())
            if parsed.get("normalized_report_year"):
                return parsed
    except Exception as e:
        return {
            "normalized_report_year": None,
            "original_expression": None,
            "source": f"Vision ERROR: {e}"
        }
    return {
        "normalized_report_year": None,
        "original_expression": None,
        "source": "Vision NOT FOUND"
    }

# ========== 6. 主函数：先文本，再 Vision ==========
def extract_report_year(pdf_path, client):
    try:
        text = extract_front_back_text(pdf_path)
        prompt = build_report_year_prompt(text)
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=500
        )
        content = response.choices[0].message.content
        parsed = json.loads(content.strip()) if content.strip().startswith("{") else eval(content.strip())
        if parsed.get("normalized_report_year"):
            return parsed
    except Exception as e:
        print(f"⚠️ GPT-4.1-mini failed on {Path(pdf_path).name}, fallback to Vision...")
    return extract_year_from_vision(pdf_path, client)


from joblib import Parallel, delayed

# 多线程处理函数
def process_pdf(pdf_path):
    try:
        out = extract_report_year(str(pdf_path), client)
        out["filename"] = pdf_path.name
    except Exception as e:
        out = {
            "filename": pdf_path.name,
            "normalized_report_year": None,
            "original_expression": None,
            "source": f"ERROR: {e}"
        }
    return out

def batch_extract_years_multithread(pdf_dir, output_csv="results/extracted_report_years_formal1.csv", n_jobs=2):
    pdf_dir = Path(pdf_dir)
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    os.makedirs(Path(output_csv).parent, exist_ok=True)

    # ✅ 并行处理
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(process_pdf)(pdf) for pdf in tqdm(pdf_files, desc="📄 Processing PDFs")
    )

    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Extraction complete! Results saved to: {output_csv}")

# ========== 8. 运行 ==========
if __name__ == "__main__":
    batch_extract_years_multithread("pdf_folder")  

📄 Processing PDFs:   1%|          | 13/1277 [00:14<19:15,  1.09it/s]

⚠️ GPT-4.1-mini failed on APA_Corp_US_APA-SustainabilityReport-2022_puby5ax5.pdf, fallback to Vision...


📄 Processing PDFs:   2%|▏         | 21/1277 [00:30<34:04,  1.63s/it]

⚠️ GPT-4.1-mini failed on Ab_Inbev_Si_Limited_AB20InBev_202220ESG20Report_FINAL_ncbtav56.pdf, fallback to Vision...


📄 Processing PDFs:   3%|▎         | 32/1277 [00:58<34:42,  1.67s/it]  

⚠️ GPT-4.1-mini failed on Adani_Green_Energy_Limited_AR-2018-19_7hq7wx9k.pdf, fallback to Vision...


📄 Processing PDFs:   3%|▎         | 37/1277 [01:09<41:52,  2.03s/it]

⚠️ GPT-4.1-mini failed on Adecco_Group_AG_The-Adecco-Group-2018-Sustainability-Report_vwbxgyc4.pdf, fallback to Vision...


📄 Processing PDFs:   4%|▎         | 45/1277 [01:28<33:15,  1.62s/it]  

⚠️ GPT-4.1-mini failed on Alcoa_Inc_2017-Sustainability-Report_uu0ohm6g.pdf, fallback to Vision...


📄 Processing PDFs:   4%|▎         | 46/1277 [01:43<1:41:11,  4.93s/it]

⚠️ GPT-4.1-mini failed on Algonquin_Power__Utilities_Corp_AQN-ESG-Report-2022_f02bs8x6.pdf, fallback to Vision...


📄 Processing PDFs:   4%|▍         | 49/1277 [01:45<53:46,  2.63s/it]  

⚠️ GPT-4.1-mini failed on Alibaba_Pictures_Group_Ltd_5bf385982121c_gf6s8h4p.pdf, fallback to Vision...


📄 Processing PDFs:   4%|▍         | 50/1277 [02:01<2:00:55,  5.91s/it]

⚠️ GPT-4.1-mini failed on All_for_One_Group_SE_sustainability_report_2020_21_all_for_one_group_0e1erpz2.pdf, fallback to Vision...


📄 Processing PDFs:   5%|▌         | 68/1277 [02:24<28:16,  1.40s/it]  

⚠️ GPT-4.1-mini failed on Amorepacific_Corp_2015_AMOREPACIFIC_SR_ENG_qjf991dq.pdf, fallback to Vision...


📄 Processing PDFs:   6%|▋         | 82/1277 [02:47<42:53,  2.15s/it]

⚠️ GPT-4.1-mini failed on Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf, fallback to Vision...


📄 Processing PDFs:   7%|▋         | 88/1277 [02:52<22:23,  1.13s/it]

⚠️ GPT-4.1-mini failed on Asian_Paints_Ltd_q4_3gbbkfbt.pdf, fallback to Vision...


📄 Processing PDFs:   8%|▊         | 104/1277 [03:16<22:21,  1.14s/it]

⚠️ GPT-4.1-mini failed on Avery_Dennison_Corp_AveryDennison_SustainabilityReport_2017_f_wdemqg51.pdf, fallback to Vision...


📄 Processing PDFs:   9%|▉         | 112/1277 [03:35<47:08,  2.43s/it]

⚠️ GPT-4.1-mini failed on BANDAI_NAMCO_Holdings_Inc_en_2015_socialreport_yoox8j0o.pdf, fallback to Vision...


📄 Processing PDFs:   9%|▉         | 114/1277 [03:37<37:16,  1.92s/it]

⚠️ GPT-4.1-mini failed on BASF_SE_2012_BASF_Report_lmq79gwn.pdf, fallback to Vision...


📄 Processing PDFs:  10%|▉         | 124/1277 [03:52<31:32,  1.64s/it]

⚠️ GPT-4.1-mini failed on Bank_Of_Ireland_Group_Plc_boi-responsibility-report-2018_kebfsaxj.pdf, fallback to Vision...


📄 Processing PDFs:  10%|▉         | 126/1277 [03:56<34:08,  1.78s/it]

⚠️ GPT-4.1-mini failed on Banyan_Tree_Holdings_Ltd_BTG2022_Sustainability_Report_wg4mg9gf.pdf, fallback to Vision...


📄 Processing PDFs:  10%|█         | 128/1277 [04:04<50:13,  2.62s/it]

⚠️ GPT-4.1-mini failed on Barrick_Gold_Corp_TSX_ABX_2014_odr8mi7i.pdf, fallback to Vision...


📄 Processing PDFs:  12%|█▏        | 158/1277 [05:14<26:57,  1.45s/it]  

⚠️ GPT-4.1-mini failed on Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf, fallback to Vision...


📄 Processing PDFs:  13%|█▎        | 160/1277 [05:16<23:32,  1.26s/it]

⚠️ GPT-4.1-mini failed on Borosil_Limited_BRL_AR_2023_e19geovm.pdf, fallback to Vision...


📄 Processing PDFs:  13%|█▎        | 167/1277 [05:28<28:42,  1.55s/it]

⚠️ GPT-4.1-mini failed on Britvic_Plc_britvic-annual-report-2021_alpq7ar7.pdf, fallback to Vision...


📄 Processing PDFs:  13%|█▎        | 171/1277 [05:46<48:02,  2.61s/it]  

⚠️ GPT-4.1-mini failed on Brookfield_Asset_Management_Ltd_bam_esg_report_2021_final_2_k054r4g5.pdf, fallback to Vision...


📄 Processing PDFs:  14%|█▎        | 174/1277 [06:06<1:12:20,  3.94s/it]

⚠️ GPT-4.1-mini failed on Bunge_Ltd_Bunge_RAS11_xp0uscgm.pdf, fallback to Vision...


📄 Processing PDFs:  14%|█▍        | 180/1277 [06:16<40:23,  2.21s/it]  

⚠️ GPT-4.1-mini failed on CDW_Holdings_Ltd_2022-cdw-esg-report-final_u3zkt6td.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on CF_Industries_Holdings_Inc_ar2011_4ocuzeva.pdf, fallback to Vision...


📄 Processing PDFs:  14%|█▍        | 183/1277 [06:57<2:01:33,  6.67s/it]

⚠️ GPT-4.1-mini failed on CF_Industries_Holdings_Inc_cfi_2014_sr_ogdworyw.pdf, fallback to Vision...


📄 Processing PDFs:  15%|█▌        | 197/1277 [07:25<19:37,  1.09s/it]  

⚠️ GPT-4.1-mini failed on Cadence_Bank_2022_ESG_Report_wz968nkm.pdf, fallback to Vision...


📄 Processing PDFs:  17%|█▋        | 212/1277 [07:49<18:28,  1.04s/it]

⚠️ GPT-4.1-mini failed on Cegedim_SA_CEGEDIM_URD_2020_EN_Version_PDF_MEL_OPTI_ews3ardc.pdf, fallback to Vision...


📄 Processing PDFs:  18%|█▊        | 235/1277 [08:37<30:36,  1.76s/it]

⚠️ GPT-4.1-mini failed on Cloudflare_Inc_Cloudflare_Impact_Report_2021_q8u89qsl.pdf, fallback to Vision...


📄 Processing PDFs:  21%|██        | 263/1277 [09:13<15:11,  1.11it/s]

⚠️ GPT-4.1-mini failed on Daiichi_Sankyo_Company_Ltd_Sustainability-Report-2022_2023_9x2n90ql.pdf, fallback to Vision...


📄 Processing PDFs:  21%|██        | 265/1277 [09:17<21:28,  1.27s/it]

⚠️ GPT-4.1-mini failed on Danske_Bank_As_cr-report-2014_og33p2ai.pdf, fallback to Vision...


📄 Processing PDFs:  21%|██▏       | 273/1277 [09:45<33:10,  1.98s/it]  

⚠️ GPT-4.1-mini failed on Denso_Ten_Manufacturing_Limited_sustainability_report_2022_en_eu1v3tvc.pdf, fallback to Vision...


📄 Processing PDFs:  22%|██▏       | 280/1277 [09:56<23:38,  1.42s/it]

⚠️ GPT-4.1-mini failed on Dongwon_Systems_Corp_230728_eng_m3oi2q0z.pdf, fallback to Vision...


📄 Processing PDFs:  22%|██▏       | 287/1277 [10:10<26:36,  1.61s/it]

⚠️ GPT-4.1-mini failed on Duke_Realty_Corp_145c7001-fba3-41b1-a982-440ad0f9df8a_dl305tls.pdf, fallback to Vision...


📄 Processing PDFs:  23%|██▎       | 292/1277 [10:24<29:33,  1.80s/it]  

⚠️ GPT-4.1-mini failed on EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf, fallback to Vision...


📄 Processing PDFs:  23%|██▎       | 297/1277 [10:28<16:53,  1.03s/it]

⚠️ GPT-4.1-mini failed on Edison_Mission_Energy_2014-sce-corporate-responsibility-report_j27jgmh6.pdf, fallback to Vision...


📄 Processing PDFs:  25%|██▌       | 321/1277 [11:03<15:24,  1.03it/s]

⚠️ GPT-4.1-mini failed on Exelon_Corp_dwnld_Exelon_CSR201_eo6gewce.pdf, fallback to Vision...


📄 Processing PDFs:  26%|██▌       | 331/1277 [11:24<25:55,  1.64s/it]

⚠️ GPT-4.1-mini failed on FirstEnergy_Corp_climate-report_6r4bnjf4.pdf, fallback to Vision...


📄 Processing PDFs:  28%|██▊       | 353/1277 [11:54<13:44,  1.12it/s]

⚠️ GPT-4.1-mini failed on GEO_Group_Inc_The_2021-Human-Rights-ESG-Report_zkt6tdd8.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on GL_Ltd_2014-sustainability-report_t3e1ocv7.pdf, fallback to Vision...


📄 Processing PDFs:  30%|██▉       | 381/1277 [12:35<12:58,  1.15it/s]

⚠️ GPT-4.1-mini failed on Grasim_Industries_Ltd_grasim-sustainability-report-2018-19_f9vl6a9c.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Great_Eagle_Holdings_Ltd_2019-annual-report-e_ccbq83cz.pdf, fallback to Vision...


📄 Processing PDFs:  30%|██▉       | 383/1277 [12:49<51:32,  3.46s/it]  

⚠️ GPT-4.1-mini failed on Great_Eastern_Shipping_Co_Ltd_GESCL_68AnnualReport_2016_0xns664s.pdf, fallback to Vision...


📄 Processing PDFs:  30%|███       | 384/1277 [13:07<1:52:38,  7.57s/it]

⚠️ GPT-4.1-mini failed on Greenyard_Foods_NV_Sustainability20report202019_6p2rb2ia.pdf, fallback to Vision...


📄 Processing PDFs:  30%|███       | 388/1277 [13:15<58:55,  3.98s/it]  

⚠️ GPT-4.1-mini failed on Guan_Chao_Holdings_Ltd_2022042802280_o5obnwh9.pdf, fallback to Vision...


📄 Processing PDFs:  32%|███▏      | 406/1277 [13:55<19:08,  1.32s/it]  

⚠️ GPT-4.1-mini failed on Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf, fallback to Vision...


📄 Processing PDFs:  32%|███▏      | 408/1277 [13:55<13:00,  1.11it/s]

⚠️ GPT-4.1-mini failed on Harley-Davidson_Inc_44786834-62f7-4213-8f38-4bb5758fbf6f_5uhavaao.pdf, fallback to Vision...


📄 Processing PDFs:  33%|███▎      | 418/1277 [14:15<27:09,  1.90s/it]

⚠️ GPT-4.1-mini failed on Hertz_Global_Holdings_Inc_Hertz-Living-Journey-Sustainability-Report-2014_uztxxxy7.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Hengyi_Petrochemical_Co_Ltd_1210198337_o6z8jvzy.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Hess_Corp_hess-2021-sustainability-report_n2wei7r5.pdf, fallback to Vision...


📄 Processing PDFs:  34%|███▎      | 428/1277 [14:54<32:37,  2.31s/it]  

⚠️ GPT-4.1-mini failed on Hindustan_Petroleum_Corporation_Limited_HPCL_Sustainability_Report_2017-18_75wuyccf.pdf, fallback to Vision...


📄 Processing PDFs:  34%|███▍      | 434/1277 [15:07<35:00,  2.49s/it]

⚠️ GPT-4.1-mini failed on Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf, fallback to Vision...


📄 Processing PDFs:  34%|███▍      | 440/1277 [15:19<24:45,  1.78s/it]

⚠️ GPT-4.1-mini failed on Hyosung_Corp_SR_2020_en_8g98j6gk.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Hyundai_Merchant_Marine_Co_Ltd_Environmental_Report_2015E_igps5w00.pdf, fallback to Vision...


📄 Processing PDFs:  36%|███▌      | 455/1277 [15:39<14:58,  1.09s/it]

⚠️ GPT-4.1-mini failed on Ibstock_PLC_ibstock-plc-environmental-report-2017_hz2wfx9f.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on India_Glycols_Ltd_annual-report-2019-20_g3mx3rps.pdf, fallback to Vision...


📄 Processing PDFs:  36%|███▌      | 456/1277 [15:51<50:43,  3.71s/it]

⚠️ GPT-4.1-mini failed on India_Nippon_Electricals_annual_report_2021_22_d523ze7c.pdf, fallback to Vision...


📄 Processing PDFs:  37%|███▋      | 475/1277 [16:20<12:45,  1.05it/s]

⚠️ GPT-4.1-mini failed on International_Public_Partnerships_Ltd_0672B_-2013-3-27_upqndajc.pdf, fallback to Vision...


📄 Processing PDFs:  38%|███▊      | 484/1277 [16:36<21:30,  1.63s/it]

⚠️ GPT-4.1-mini failed on Itochu_Corp_17fulle-all_jgubss5s.pdf, fallback to Vision...


📄 Processing PDFs:  39%|███▊      | 494/1277 [16:59<21:59,  1.69s/it]

⚠️ GPT-4.1-mini failed on Jefferies_Financial_Group_Inc_JEF-2021-ESG-Report_ebyr480a.pdf, fallback to Vision...


📄 Processing PDFs:  40%|████      | 516/1277 [17:41<21:58,  1.73s/it]

⚠️ GPT-4.1-mini failed on Kennametal_Inc_KMT-ESG20Report202021_50xsbdzz.pdf, fallback to Vision...


📄 Processing PDFs:  41%|████      | 518/1277 [17:47<27:13,  2.15s/it]

⚠️ GPT-4.1-mini failed on Keurig_Dr_Pepper_Inc_GMCRSustainabilityReport_2008_ijmbwtys.pdf, fallback to Vision...


📄 Processing PDFs:  41%|████      | 520/1277 [17:57<42:02,  3.33s/it]

⚠️ GPT-4.1-mini failed on KeyCorp_2018-CR-report_052019_drvv30t9.pdf, fallback to Vision...


📄 Processing PDFs:  41%|████      | 526/1277 [18:18<35:59,  2.87s/it]

⚠️ GPT-4.1-mini failed on Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf, fallback to Vision...


📄 Processing PDFs:  42%|████▏     | 541/1277 [18:43<13:09,  1.07s/it]

⚠️ GPT-4.1-mini failed on LIXIL_Corporation_LIXIL_csr_2013e_4kg372wg.pdf, fallback to Vision...


📄 Processing PDFs:  43%|████▎     | 546/1277 [18:55<27:31,  2.26s/it]

⚠️ GPT-4.1-mini failed on Landis__Gyr_AG_LandisGyr_Sustainability-Report_2016-17_5ciomp93.pdf, fallback to Vision...


📄 Processing PDFs:  44%|████▎     | 557/1277 [19:20<23:37,  1.97s/it]

⚠️ GPT-4.1-mini failed on Lg_Corp_LG20ESG20Reporting202022-2023_ENG_tkesgsns.pdf, fallback to Vision...


📄 Processing PDFs:  44%|████▍     | 560/1277 [19:33<37:28,  3.14s/it]

⚠️ GPT-4.1-mini failed on Link_Mobility_Group_ASA_LINK-Mobility-Group-Holding-ASA-Annual-Report-2022_w9hp55i9.pdf, fallback to Vision...


📄 Processing PDFs:  44%|████▍     | 568/1277 [20:05<41:09,  3.48s/it]

⚠️ GPT-4.1-mini failed on Luxottica_Group_SPA_Milano_2006_12_31_luxottica_annual_report_eng_7noqm8ix.pdf, fallback to Vision...


📄 Processing PDFs:  45%|████▍     | 571/1277 [20:14<36:20,  3.09s/it]

⚠️ GPT-4.1-mini failed on LyondellBasell_Industries_NV_2019_sustainability_report_fvt1o08m.pdf, fallback to Vision...


📄 Processing PDFs:  47%|████▋     | 603/1277 [21:00<11:37,  1.03s/it]

⚠️ GPT-4.1-mini failed on Metro_Holdings_Ltd_OTC_MTAGF_2012_lfvzdjwm.pdf, fallback to Vision...


📄 Processing PDFs:  48%|████▊     | 613/1277 [21:24<19:54,  1.80s/it]

⚠️ GPT-4.1-mini failed on Motorola_Solutions_Inc_2021_Corporate_Responsibility_Report_r5jr0c3n.pdf, fallback to Vision...


📄 Processing PDFs:  48%|████▊     | 616/1277 [21:54<1:04:07,  5.82s/it]

⚠️ GPT-4.1-mini failed on Myer_Holdings_Ltd_Myer_National_Packaging_Covenant_Report_2008-2009-Public_Document_rl7pvlk2.pdf, fallback to Vision...


📄 Processing PDFs:  49%|████▉     | 626/1277 [22:07<17:59,  1.66s/it]  

⚠️ GPT-4.1-mini failed on National_Grid_plc_download_lf4twl8x.pdf, fallback to Vision...


📄 Processing PDFs:  52%|█████▏    | 669/1277 [23:04<10:35,  1.05s/it]

⚠️ GPT-4.1-mini failed on Oracle_Corp_oracle-corp-citizenship-report-3941904_uykn1wls.pdf, fallback to Vision...


📄 Processing PDFs:  55%|█████▌    | 708/1277 [24:04<09:44,  1.03s/it]

⚠️ GPT-4.1-mini failed on Phoenix_Group_Holdings_csr-report-2018_2uyqtuyk.pdf, fallback to Vision...


📄 Processing PDFs:  56%|█████▌    | 714/1277 [24:15<16:23,  1.75s/it]

⚠️ GPT-4.1-mini failed on Pininfarina_SPA_Torino_PF-SpA-2020-Non-Financial-Statement_yg1uu30d.pdf, fallback to Vision...


📄 Processing PDFs:  57%|█████▋    | 730/1277 [24:37<08:16,  1.10it/s]

⚠️ GPT-4.1-mini failed on Qiagen_NV_bro_sustainability_2020_34xq1si8.pdf, fallback to Vision...


📄 Processing PDFs:  57%|█████▋    | 734/1277 [24:44<12:36,  1.39s/it]

⚠️ GPT-4.1-mini failed on RBL_Bank_Ltd_sustainability-report-2019-20_15upt0u5.pdf, fallback to Vision...


📄 Processing PDFs:  59%|█████▊    | 748/1277 [25:16<18:59,  2.15s/it]

⚠️ GPT-4.1-mini failed on Redtape_Limited_Annual-Report-2022-23_vji8ftnn.pdf, fallback to Vision...


📄 Processing PDFs:  59%|█████▉    | 753/1277 [25:26<17:05,  1.96s/it]

⚠️ GPT-4.1-mini failed on Reliance_Industries_Ltd_ril_sr2010_11_zq4u6rto.pdf, fallback to Vision...


📄 Processing PDFs:  60%|█████▉    | 766/1277 [25:52<10:20,  1.21s/it]

⚠️ GPT-4.1-mini failed on Ross_Stores_Inc_ROS-010_Ross-2020-Corporate-Social-Responsibility-Report_Programmed-Interactive-PDF_05nexes1.pdf, fallback to Vision...


📄 Processing PDFs:  62%|██████▏   | 790/1277 [26:26<11:26,  1.41s/it]

⚠️ GPT-4.1-mini failed on Samsung_Heavy_Industries_Co_Ltd_Samsung_Electronics_Sustainability_Report_2022_3l5f5v52.pdf, fallback to Vision...


📄 Processing PDFs:  63%|██████▎   | 801/1277 [26:46<13:29,  1.70s/it]

⚠️ GPT-4.1-mini failed on Sarine_Technologies_Ltd_657553_Sarine20AR202020-20Final_3k054r4g.pdf, fallback to Vision...


📄 Processing PDFs:  64%|██████▍   | 818/1277 [28:05<18:15,  2.39s/it]  

⚠️ GPT-4.1-mini failed on Seiko_Holdings_Corp_SEIKO_CSR_2019_en_t3e1ocv7.pdf, fallback to Vision...


📄 Processing PDFs:  65%|██████▍   | 828/1277 [28:27<11:36,  1.55s/it]

⚠️ GPT-4.1-mini failed on Shangri-La_Asia_Ltd_LTN20150424664_phebvw27.pdf, fallback to Vision...


📄 Processing PDFs:  66%|██████▌   | 838/1277 [28:45<16:44,  2.29s/it]

⚠️ GPT-4.1-mini failed on Shriram_Transport_Finance_Company_Ltd_STFC_AR_2020-21_9tkqfk8e.pdf, fallback to Vision...


📄 Processing PDFs:  66%|██████▋   | 848/1277 [29:15<13:34,  1.90s/it]

⚠️ GPT-4.1-mini failed on Siltronic_AG_20220309_Siltronic_Non-financial_Report_2021_qxyku58i.pdf, fallback to Vision...


📄 Processing PDFs:  67%|██████▋   | 854/1277 [29:24<10:37,  1.51s/it]

⚠️ GPT-4.1-mini failed on Sky_PLC_annual-report-spreads-2015_52ipwujn.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Sky_Perfect_JSAT_Holdings_Inc_2021report_27qp2cae.pdf, fallback to Vision...


📄 Processing PDFs:  68%|██████▊   | 865/1277 [30:04<10:57,  1.60s/it]

⚠️ GPT-4.1-mini failed on Solar_Industries_India_Ltd_solar-sustainability-report-2021_23rfdtxx.pdf, fallback to Vision...


📄 Processing PDFs:  68%|██████▊   | 872/1277 [30:19<15:14,  2.26s/it]

⚠️ GPT-4.1-mini failed on Sopharma_AD_3jr_nonfinancial_decl_2017_group_en_bncbtav5.pdf, fallback to Vision...


📄 Processing PDFs:  69%|██████▉   | 884/1277 [30:37<08:04,  1.23s/it]

⚠️ GPT-4.1-mini failed on St_James_Place_SJH20Annual20Report20199920-20English_v124agxh.pdf, fallback to Vision...


📄 Processing PDFs:  69%|██████▉   | 886/1277 [30:40<08:33,  1.31s/it]

⚠️ GPT-4.1-mini failed on StarHub_Ltd_Sustainability_5cftd97e.pdf, fallback to Vision...


📄 Processing PDFs:  70%|██████▉   | 890/1277 [31:22<38:08,  5.91s/it]

⚠️ GPT-4.1-mini failed on Sterlite_Technologies_Ltd_Annual-report-2016_p4sp9tbo.pdf, fallback to Vision...


📄 Processing PDFs:  70%|███████   | 894/1277 [31:31<25:00,  3.92s/it]

⚠️ GPT-4.1-mini failed on Strabag_AG_STRABAG_Sustainability_Report_2010_91pkl4r4.pdf, fallback to Vision...


📄 Processing PDFs:  71%|███████▏  | 910/1277 [32:02<08:11,  1.34s/it]

⚠️ GPT-4.1-mini failed on Suzlon_Energy_Ltd_annual-report-2017-18_5dmhfo40.pdf, fallback to Vision...


📄 Processing PDFs:  72%|███████▏  | 914/1277 [32:14<13:14,  2.19s/it]

⚠️ GPT-4.1-mini failed on TIM_SA_TIM-2022-sustainability-report-ENG_5qx6prh7.pdf, fallback to Vision...


📄 Processing PDFs:  72%|███████▏  | 920/1277 [32:36<17:17,  2.91s/it]

⚠️ GPT-4.1-mini failed on Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Target_Corp_2007_Target_Corporate-Responsibility-Report_i6l2pkdd.pdf, fallback to Vision...


📄 Processing PDFs:  73%|███████▎  | 931/1277 [32:58<07:28,  1.30s/it]

⚠️ GPT-4.1-mini failed on Tech_Mahindra_Ltd_Integrated-Report-19_vfu2tql9.pdf, fallback to Vision...


📄 Processing PDFs:  74%|███████▎  | 941/1277 [33:17<09:08,  1.63s/it]

⚠️ GPT-4.1-mini failed on Telenor_ASA_Telenor20Annual20Report202021_hnxlbva9.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Telstra_Corporation_Ltd_2008-corporate-responsibility-report_kg372wg1.pdf, fallback to Vision...


📄 Processing PDFs:  74%|███████▍  | 944/1277 [33:38<20:32,  3.70s/it]

⚠️ GPT-4.1-mini failed on Thales_Thales_CRAR_2010_Accessible_makaexgz.pdf, fallback to Vision...


📄 Processing PDFs:  74%|███████▍  | 948/1277 [33:46<15:35,  2.84s/it]

⚠️ GPT-4.1-mini failed on Tianma_Microelectronics_Co_Ltd_caef0ba4-3439-4089-89b6-7ab33b249b3c_xyku58in.pdf, fallback to Vision...


📄 Processing PDFs:  75%|███████▍  | 952/1277 [34:17<28:07,  5.19s/it]

⚠️ GPT-4.1-mini failed on Titan_Company_Ltd_Annual20Report202013_p4r8w07u.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Titan_Company_Ltd_Annual_Report_2019_20_ufd92ukb.pdf, fallback to Vision...


📄 Processing PDFs:  76%|███████▋  | 976/1277 [35:09<05:05,  1.01s/it]

⚠️ GPT-4.1-mini failed on Tsubakimoto_Kogyo_Co_Ltd_2022-tsubaki-annual-report_en_nn66jmmt.pdf, fallback to Vision...


📄 Processing PDFs:  77%|███████▋  | 988/1277 [35:41<14:28,  3.01s/it]

⚠️ GPT-4.1-mini failed on Unknown_0101_311220_hkr0kz6t.pdf, fallback to Vision...


📄 Processing PDFs:  78%|███████▊  | 1001/1277 [36:04<04:47,  1.04s/it]

⚠️ GPT-4.1-mini failed on Unknown_1216245224_wuow755m.pdf, fallback to Vision...


📄 Processing PDFs:  79%|███████▊  | 1003/1277 [36:11<08:56,  1.96s/it]

⚠️ GPT-4.1-mini failed on Unknown_125973890_eq6xdle8.pdf, fallback to Vision...


📄 Processing PDFs:  79%|███████▉  | 1015/1277 [36:49<08:38,  1.98s/it]

⚠️ GPT-4.1-mini failed on Unknown_1996-Annual-Report_smz0trf7.pdf, fallback to Vision...


📄 Processing PDFs:  80%|████████  | 1022/1277 [37:19<11:04,  2.61s/it]

⚠️ GPT-4.1-mini failed on Unknown_2010_annual_report_low_res_cumrzxaz.pdf, fallback to Vision...


📄 Processing PDFs:  81%|████████▏ | 1040/1277 [37:46<04:40,  1.18s/it]

⚠️ GPT-4.1-mini failed on Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf, fallback to Vision...


📄 Processing PDFs:  83%|████████▎ | 1056/1277 [38:09<05:03,  1.37s/it]

⚠️ GPT-4.1-mini failed on Unknown_2017_ul51uqii.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2018-company-profile_cwz968nk.pdf, fallback to Vision...


📄 Processing PDFs:  83%|████████▎ | 1058/1277 [38:24<13:10,  3.61s/it]

⚠️ GPT-4.1-mini failed on Unknown_20181206_RS_Aquafil_ENG_2017_def_qcghobt1.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2018_AnnualReport_x4spl5z6.pdf, fallback to Vision...


📄 Processing PDFs:  83%|████████▎ | 1062/1277 [39:00<21:21,  5.96s/it]

⚠️ GPT-4.1-mini failed on Unknown_2019SustainabilityReportEN_all_230w2upi.pdf, fallback to Vision...


📄 Processing PDFs:  84%|████████▍ | 1071/1277 [39:34<08:39,  2.52s/it]

⚠️ GPT-4.1-mini failed on Unknown_2020042300516_z8w33430.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2020042401497_78f0azi2.pdf, fallback to Vision...


📄 Processing PDFs:  88%|████████▊ | 1119/1277 [40:54<02:34,  1.03it/s]

⚠️ GPT-4.1-mini failed on Unknown_2021poscochemical_en_wr4au7x0.pdf, fallback to Vision...


📄 Processing PDFs:  88%|████████▊ | 1126/1277 [41:11<05:56,  2.36s/it]

⚠️ GPT-4.1-mini failed on Unknown_2022-JBS-SUSTAINABILITY-REPORT_3ef6dv5e.pdf, fallback to Vision...


📄 Processing PDFs:  89%|████████▊ | 1131/1277 [41:24<05:41,  2.34s/it]

⚠️ GPT-4.1-mini failed on Unknown_2022-assurant-esg_0vxs5cs9.pdf, fallback to Vision...


📄 Processing PDFs:  89%|████████▊ | 1132/1277 [41:42<14:23,  5.96s/it]

⚠️ GPT-4.1-mini failed on Unknown_2022-esg-report-update_grll1p8t.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2022-esg-report_ofuikwie.pdf, fallback to Vision...


📄 Processing PDFs:  91%|█████████ | 1159/1277 [42:31<01:45,  1.12it/s]

⚠️ GPT-4.1-mini failed on Unknown_2022AL-IWQMAR_226t8esw.pdf, fallback to Vision...


📄 Processing PDFs:  91%|█████████ | 1160/1277 [43:39<36:38, 18.79s/it]

⚠️ GPT-4.1-mini failed on Unknown_2022_HD20Hyundai20Electric_Integrated20ReportIssued20in202023_zhtmjnry.pdf, fallback to Vision...


📄 Processing PDFs:  91%|█████████ | 1163/1277 [43:56<20:39, 10.87s/it]

⚠️ GPT-4.1-mini failed on Unknown_2022_LGChem_Sustainability_Report_ENG_dy4esyz0.pdf, fallback to Vision...


📄 Processing PDFs:  91%|█████████ | 1165/1277 [44:33<24:17, 13.01s/it]

⚠️ GPT-4.1-mini failed on Unknown_2022_tx_konzern_annual-report_en_9t97h9br.pdf, fallback to Vision...


📄 Processing PDFs:  91%|█████████▏| 1167/1277 [44:58<21:16, 11.61s/it]

⚠️ GPT-4.1-mini failed on Unknown_2023-ALS-Sustainability-Report_ocuzeva0.pdf, fallback to Vision...


📄 Processing PDFs:  92%|█████████▏| 1174/1277 [45:34<07:38,  4.46s/it]

⚠️ GPT-4.1-mini failed on Unknown_2023042101335_kyzhtmjn.pdf, fallback to Vision...


📄 Processing PDFs:  92%|█████████▏| 1175/1277 [45:35<05:42,  3.36s/it]

⚠️ GPT-4.1-mini failed on Unknown_2023042704942_wzv2kg92.pdf, fallback to Vision...


📄 Processing PDFs:  93%|█████████▎| 1192/1277 [46:31<01:33,  1.10s/it]

⚠️ GPT-4.1-mini failed on Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_230523_STO_Sustainability_Report_2022_EnglishUK_p00082dp.pdf, fallback to Vision...


📄 Processing PDFs:  94%|█████████▎| 1197/1277 [46:39<01:47,  1.35s/it]

⚠️ GPT-4.1-mini failed on Unknown_26709850_AR_SMN_2021___LK_pug3sh79.pdf, fallback to Vision...


📄 Processing PDFs:  94%|█████████▍| 1204/1277 [47:08<03:40,  3.02s/it]

⚠️ GPT-4.1-mini failed on Unknown_3585_8q5qs8ry.pdf, fallback to Vision...


📄 Processing PDFs:  95%|█████████▍| 1213/1277 [47:37<02:35,  2.43s/it]

⚠️ GPT-4.1-mini failed on Unknown_41913-MGIRON_Sustainability-Report_2021_WEB-singles_ojs9mnr0.pdf, fallback to Vision...


📄 Processing PDFs:  97%|█████████▋| 1238/1277 [48:37<00:47,  1.21s/it]

⚠️ GPT-4.1-mini failed on Unknown_650713a78100f398f325c42f90432a0b_01o0g1nh.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_6a458c26bcb145eeb926cc39d7581091_1upfx44e.pdf, fallback to Vision...


📄 Processing PDFs: 100%|█████████▉| 1273/1277 [49:37<00:04,  1.13s/it]

⚠️ GPT-4.1-mini failed on Unknown_about-us-sustainability-report-and-policy-sustainability-report-2015-en_l4r0j0ra.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_acs04-csr-esp_2wgz8riy.pdf, fallback to Vision...


📄 Processing PDFs: 100%|█████████▉| 1274/1277 [49:52<00:13,  4.45s/it]

⚠️ GPT-4.1-mini failed on Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_adp07-sus-fr_95qx6prh.pdf, fallback to Vision...


📄 Processing PDFs: 100%|██████████| 1277/1277 [49:54<00:00,  2.34s/it]



✅ Extraction complete! Results saved to: results/extracted_report_years_formal1.csv


In [3]:
import pandas as pd

# 读取文件
anno_df = pd.read_excel("check/rfyear_annotation.xlsx")
result_df = pd.read_csv("output/report_years_extracted_new.csv")

# 清洗文件名列
anno_df["pdf_name_clean"] = anno_df["pdf_name"].str.strip().str.lower()
result_df["filename_clean"] = result_df["filename"].str.strip().str.lower()

# 合并两个表
merged_df = pd.merge(
    anno_df,
    result_df,
    left_on="pdf_name_clean",
    right_on="filename_clean",
    how="left"
)

# 年份对比函数
def match_year(human, gpt):
    if pd.isna(human) or pd.isna(gpt):
        return "missing"
    return "match" if str(human).strip().lower() == str(gpt).strip().lower() else "mismatch"

# 添加匹配结果
merged_df["match_result"] = merged_df.apply(
    lambda row: match_year(row["chosen_rfyear"], row["report_year_vision"]),
    axis=1
)

# 仅保留有用字段
final_df = merged_df[[
    "pdf_name", "chosen_rfyear", "report_year_vision", "match_result"
]]

# 保存结果
final_df.to_csv("rfyear_comparison_results_minimal.csv", index=False)

# 输出统计
print("✅ 已保存简洁对比结果为 rfyear_comparison_results_minimal.csv")
print("📊 匹配情况统计：")
print(final_df["match_result"].value_counts())

✅ 已保存简洁对比结果为 rfyear_comparison_results_minimal.csv
📊 匹配情况统计：
match_result
missing     386
mismatch    215
Name: count, dtype: int64


In [4]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# 1. 读取两个文件
gpt_df = pd.read_csv("results/extracted_report_years_formal.csv")  # 提取结果
label_df = pd.read_excel("check/rfyear_annotation.xlsx")         # 人工标注文件

# 2. 重命名列以便对齐合并
gpt_df.rename(columns={"filename": "pdf_name", "report_year": "report_year_pred"}, inplace=True)

# 3. 合并两个表（inner join，只保留两个都有的 pdf）
merged = pd.merge(label_df, gpt_df, on="pdf_name", how="inner")

# 4. 标准化字符串格式（去除空格大小写等）
merged["report_year_pred"] = merged["report_year_pred"].astype(str).str.strip().str.lower()
merged["chosen_rfyear"] = merged["chosen_rfyear"].astype(str).str.strip().str.lower()

# 5. 定义匹配逻辑（完全匹配即可）
merged["match"] = merged["report_year_pred"] == merged["chosen_rfyear"]

# 6. 统计指标
total = len(merged)
correct = merged["match"].sum()
accuracy = correct / total

print(f"✅ 合并样本数：{total}")
print(f"✅ 匹配正确数：{correct}")
print(f"✅ 准确率（accuracy）：{accuracy:.2%}")

# 7. 可选：保存对比结果
# merged[["pdf_name", "chosen_rfyear", "report_year_pred", "match"]].to_csv("eval/year_comparison.csv", index=False)

✅ 合并样本数：224
✅ 匹配正确数：5
✅ 准确率（accuracy）：2.23%


In [6]:
import pandas as pd
from sklearn.metrics import classification_report

# 1. 读取两个文件
gpt_df = pd.read_csv("results/extracted_report_years_formal.csv")
human_df = pd.read_excel("check/rfyear_annotation.xlsx")  # 修改为你的路径

# 2. 对齐列名并合并（按 PDF 文件名匹配）
gpt_df["pdf_name"] = gpt_df["filename"].str.replace(".pdf", "", regex=False)
merged_df = pd.merge(human_df, gpt_df, on="pdf_name", how="inner")

# 3. 标准化年份字符串（两侧空格、小写等）
merged_df["chosen_rfyear_std"] = merged_df["chosen_rfyear"].astype(str).str.strip().str.lower()
merged_df["report_year_std"] = merged_df["report_year"].astype(str).str.strip().str.lower()

# 4. 比对是否完全一致
merged_df["is_match"] = merged_df["chosen_rfyear_std"] == merged_df["report_year_std"]

# 5. 评估准确率
total = len(merged_df)
correct = merged_df["is_match"].sum()
accuracy = correct / total

# 6. 输出评估结果
print(f"✅ Total matched records: {total}")
print(f"✅ Correct matches: {correct}")
print(f"✅ Accuracy: {accuracy:.2%}")

# 7. 可视化错误案例
errors = merged_df[~merged_df["is_match"]][["pdf_name", "chosen_rfyear", "report_year", "source"]]
errors.to_csv("eval/year_mismatch_cases.csv", index=False)
print("🚨 Mismatched cases saved to eval/year_mismatch_cases.csv")

✅ Total matched records: 0
✅ Correct matches: 0
✅ Accuracy: nan%
🚨 Mismatched cases saved to eval/year_mismatch_cases.csv


  accuracy = correct / total
