In [None]:
import os
import fitz  # PyMuPDF
import openai
import pytesseract
import tempfile
import base64
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
from pdf2image import convert_from_path
from PIL import Image
from openai import OpenAI

# ========== 1. 初始化 OpenAI ==========
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ========== 2. 提取文本：前后20页 ==========
def extract_front_back_text(pdf_path, front_n=10, back_n=10):
    try:
        doc = fitz.open(pdf_path)
        texts = [doc[i].get_text() for i in range(min(front_n, len(doc)))]
        texts += [doc[i].get_text() for i in range(max(0, len(doc) - back_n), len(doc))]
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 50:
            raise ValueError("Too little text")
        return full_text
    except Exception as e:
        print(f"⚠️ PyMuPDF failed on {pdf_path.name}, switching to OCR...")
        return extract_text_with_ocr(pdf_path)

# ========== 3. OCR 补救 ==========
def extract_text_with_ocr(pdf_path, dpi=300):
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(pdf_path, dpi=dpi, output_folder=path)
        text_parts = [pytesseract.image_to_string(img) for img in images[:3] + images[-3:]]
        return "\n".join(text_parts)

# ========== 4. 构造 Prompt ==========
def build_report_year_prompt(text):
    return f"""
You are an expert assistant helping to extract **reporting years** or **periods** from corporate reports.

Please follow the instructions carefully and return only a **JSON object** as described below.

---

### 📘 Task 1: Identify valid expressions for **reporting period**, **fiscal year**, or **financial year** from the input **text**, including:

- Main text body
- Tables or figures
- Footnotes

You should consider **all parts of the document**, not just plain paragraphs.  
If a table covers a specific date range (even if only with month/year or day/month/year), treat it as a valid reporting period.

---

### 📗 Task 2: Normalize each valid expression into a standard format:

1. If it's a **single year**, like `"2013"` or `"FY2020"`, return just `"2013"` or `"2020"`.
2. If it's a **range of years**, like `"2014–15"`, return `"2014 to 2015"`. Do **not** add months or days.
3. If it includes **exact dates** like `"1st April 2020 – 31st March 2021"`:
   - Remove ordinal suffixes (`1st` → `1`)
   - Normalize as `"1 April 2020 to 31 March 2021"`
4. If it uses **months only**, like `"April 2020 – March 2021"`, return `"April 2020 to March 2021"`.
5. Convert dashes (– or -) or slashes (/) to `"to"`.

🛑 Do **not** invent or guess missing information.  
🛑 Do **not** add any commentary outside the JSON object.

---

### 📕 Task 3: Return your result in the following JSON format:

If a valid reporting period is found:

{{
  "normalized_report_year": "your final normalized version",
  "original_expression": "the exact expression from the text",
  "source": "e.g. Page 1, table; main text; footnote"
}}
If no valid expression is found:

{{
  "normalized_report_year": null,
  "original_expression": null,
  "source": "NOT FOUND"
}}

examples of valid expressions:
input text:This report contains an overview of OPC Energy’s sustainability projects and activities during FY 2020 and FY2021, whose period is from November 1, 2019 to October 31, 2021.
output json: {{
  "normalized_report_year": "1 November 2019 to 31 October 2021",
  "original_expression": "from November 1, 2019 to October 31, 2021",
  "source": "main text"
}}  
input text:Report Period
This report covers the period from April 1, 2003 to March 31, 2004. Some activities occurring in FY 2004 are also described in the report.
output json: {{
  "normalized_report_year": "1 April 2003 to 31 March 2004",
  "original_expression": "from April 1, 2003 to March 31, 2004",
  "source": "main text"
}}
{text}
"""

# ========== 5. Vision 模型辅助 ==========
def encode_image_to_base64(pil_image):
    with tempfile.NamedTemporaryFile(suffix=".png") as f:
        pil_image.save(f.name, format="PNG")
        with open(f.name, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

def extract_year_from_vision(pdf_path, client, page_limit=3):
    try:
        images = convert_from_path(pdf_path, dpi=200)
        for i, img in enumerate(images[:page_limit]):
            b64 = encode_image_to_base64(img)
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "user", "content": [
                        {"type": "text", "text": f"Please extract the fiscal year or reporting period from page {i+1}. Please return in this JSON format:\n"
                                                 "{\n  \"normalized_report_year\": \"...\",\n  \"original_expression\": \"...\",\n  \"source\": \"Page {i+1}, image-based\"\n}"},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
                    ]}
                ],
                max_tokens=300
            )
            content = response.choices[0].message.content
            parsed = json.loads(content.strip()) if content.strip().startswith("{") else eval(content.strip())
            if parsed.get("normalized_report_year"):
                return parsed
    except Exception as e:
        return {
            "normalized_report_year": None,
            "original_expression": None,
            "source": f"Vision ERROR: {e}"
        }
    return {
        "normalized_report_year": None,
        "original_expression": None,
        "source": "Vision NOT FOUND"
    }

# ========== 6. 主函数：先文本，再 Vision ==========
def extract_report_year(pdf_path, client):
    try:
        text = extract_front_back_text(pdf_path)
        prompt = build_report_year_prompt(text)
        response = client.chat.completions.create(
            model="gpt-4.1",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=500
        )
        content = response.choices[0].message.content
        parsed = json.loads(content.strip()) if content.strip().startswith("{") else eval(content.strip())
        if parsed.get("normalized_report_year"):
            return parsed
    except Exception as e:
        print(f"⚠️ GPT-4.1-mini failed on {Path(pdf_path).name}, fallback to Vision...")
    return extract_year_from_vision(pdf_path, client)


from joblib import Parallel, delayed

# 多线程处理函数
def process_pdf(pdf_path):
    try:
        out = extract_report_year(str(pdf_path), client)
        out["filename"] = pdf_path.name
    except Exception as e:
        out = {
            "filename": pdf_path.name,
            "normalized_report_year": None,
            "original_expression": None,
            "source": f"ERROR: {e}"
        }
    return out

def batch_extract_years_multithread(pdf_dir, output_csv="results/report_years1.csv", n_jobs=2):
    pdf_dir = Path(pdf_dir)
    pdf_files = sorted(pdf_dir.glob("*.pdf"))
    os.makedirs(Path(output_csv).parent, exist_ok=True)

    # ✅ 并行处理
    results = Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(process_pdf)(pdf) for pdf in tqdm(pdf_files, desc="📄 Processing PDFs")
    )

    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\n✅ Extraction complete! Results saved to: {output_csv}")

# ========== 8. 运行 ==========
if __name__ == "__main__":
    batch_extract_years_multithread("pdf_folder")  



⚠️ GPT-4.1-mini failed on Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf, fallback to Vision...



📄 Processing PDFs:   9%|▉         | 114/1278 [02:35<30:26,  1.57s/it]

⚠️ GPT-4.1-mini failed on BASF_SE_2012_BASF_Report_lmq79gwn.pdf, fallback to Vision...


[A

⚠️ GPT-4.1-mini failed on Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Hyosung_Corp_SR_2020_en_8g98j6gk.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on KNT-CT_Holdings_Co_Ltd_Kintetsu_Group_Integrated_Report_2022_english_single_1mc85xmw.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on MEG_Energy_Corp_MEG-Energy-ESG-Report-2021_mrqyijmb.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Majesco_NYSE_MJCO_2019_nymj6mmx.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Make_Corporation_SR16_E_All_nbedqzc7.pdf, fallback to Vision...





⚠️ GPT-4.1-mini failed on Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf, fallback to Vision...


📄 Processing PDFs:  72%|███████▏  | 922/1278 [30:40<09:39,  1.63s/it][A

⚠️ GPT-4.1-mini failed on Unknown_12179822783690541_bl1b3clg.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_1380859_ejohf4m6.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_160307_ica_gruppen_annual_report_2015_1twa6vev.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_16193538730635fl8wi_wl8x0grs.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_1632746560527_UK20Rai20RFA202020_2ve43jdy.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_16843_7hq7wx9k.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_17_Oracle_OpenWorld_Sustainability_Report_MeetGreen_6p5077lr.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_18068976dae00000834d_rc1svlrs.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_1901_4ch3st16.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_1996-Annual-Report_smz0trf7.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_200420Bunce20et20al_5slw68cl.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2004e_xtectedg.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2004report21_22_e_1ohsl6bx.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2005-6_Annual_Report_jbhlnigk.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2005_annual_report_-_full_version_e6bnee0f.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2007-annual-report_jv0d30ri.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2008-Annual-Report-English_robzpu59.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2010_annual_report_low_res_cumrzxaz.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2010_halysg5q.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2011-Annual-Report-and-Accounts-for-the-year-ended-31-December-20111_yea72djn.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_201120Annual20Reportpdfdownloadasset_im1lupqn.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2011_2012_ps1c4ya4.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2012-annual-report-india_0x2x8vc3.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2012_27tj3yg2.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2012_MENA_CSOSI_b0lp84pr.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2012_annual_report_botce7d9.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2012_zl26z8vu.pdf, fallback to Vision...





⚠️ GPT-4.1-mini failed on Unknown_2012csr_kj8xtv1x.pdf, fallback to Vision...


📄 Processing PDFs:  81%|████████  | 1034/1278 [36:22<28:45,  7.07s/it][A

⚠️ GPT-4.1-mini failed on Unknown_2013-AF-Annual-report_6peeyrk4.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_201320Annual20Reportpdfdownloadasset_60msopo9.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2013SustainabilityManagementPlan_od2vggbm.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2014-annual-en_n7v9rtj6.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2014-cap-annual-report_m52nvmyx.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2014-integrated-annual-report_fk4yhuq7.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2014_Annual_Review2_x8dqx8pb.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2014_all_en_3rmf01h9.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2015-201620Sustainability-Report20210825_171509627_x9kgqjf9.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2015-AGO-Annual-Report-Final_7506locq.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2015e_all_yoox8j0o.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2016-Registration-Document-and-Annual-Financial-Report-HD_j9qojjae.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2016-julius-baer-corporate-sustainability-report-en_gtuteyko.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_201603155521-1_0fo3aveg.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2016_schaeffler_sustainability_report_en_hp60o4nl.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2017-1820Ranken20Annual20Report_9geovmr9.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2017-Environmental-Social-Responsibility-Update_Sep-2018_o582cs82.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2017_Corporate_Social_Responsibility_Report20_9xmw7z2l.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2017_State_of_the_Great_Lakes_Report_Michigan_OGL_609330_7_890ws3j4.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2017_ar_csr_e_nr9i80sd.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2017_eng_tdzuwgvs.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2017_ul51uqii.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2018-company-profile_cwz968nk.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_20180308-annual-report-incl-sustainability-report-and-corporate-governance-report-2017_yl06bvg4.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_20181206_RS_Aquafil_ENG_2017_def_qcghobt1.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_2023042101335_kyzhtmjn.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf, fallback to Vision...




⚠️ GPT-4.1-mini failed on Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_adp07-sus-fr_95qx6prh.pdf, fallback to Vision...


📄 Processing PDFs: 100%|██████████| 1278/1278 [45:29<00:00,  2.14s/it]



✅ Extraction complete! Results saved to: results/report_years1.csv


In [None]:
import pandas as pd
import re

# === 1. 加载数据 ===
gpt_df = pd.read_csv("results/report_years1.csv")  # GPT提取结果
label_df = pd.read_excel("check/rfyear_annotation2.xlsx")            # 标注结果

# === 2. 重命名列对齐、合并（按文件名）===
gpt_df.rename(columns={"filename": "pdf_name", "report_year": "normalized_report_year"}, inplace=True)
merged = pd.merge(label_df, gpt_df, on="pdf_name", how="inner")

# === 3. 清洗文本：空值转空串，大小写、空格、换行处理 ===
def clean_text(s):
    if pd.isna(s): return ""
    return str(s).strip().lower().replace("\n", " ")

merged["normalized_report_year"] = merged["normalized_report_year"].apply(clean_text)
merged["chosen_rfyear"] = merged["chosen_rfyear"].apply(clean_text)

# === 4. 定义辅助函数 ===
import re

def normalize_year_text(s):
    if not s:
        return ""
    s = s.lower().strip()
    
    # 替换常见连字符为统一格式
    s = s.replace("–", " to ").replace("-", " to ").replace("/", " to ")

    # 清除无效字符
    s = re.sub(r'[\(\)\[\],;:]', ' ', s)
    s = re.sub(r'\s+', ' ', s)

    # fy 缩写处理：fy2020 或 fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{4})\b', r'\1', s)                   # fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{2})\b', lambda m: f"20{m.group(1)}", s)  # fy 19 → 2019

    # 区间格式处理：2020 to 21 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s+to\s+(\d{2})\b', lambda m: f"{m.group(1)} to 20{m.group(2)}", s)

    # 区间格式处理：2020–2021、2020-2021、fy2020 to 2021 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s*to\s*(20\d{2})\b', r'\1 to \2', s)
    s = re.sub(r'\b(20\d{2})\s+to\s+20\d{2}', r'\g<0>', s)

    # 清理序数词
    s = re.sub(r'\b(\d{1,2})(st|nd|rd|th)\b', r'\1', s)

    return s.strip()

def extract_years(s):
    return sorted(set(re.findall(r'\b(20\d{2}|19\d{2})\b', s)))

def is_fuzzy_match(a, b):
    a_norm = normalize_year_text(a)
    b_norm = normalize_year_text(b)
    if a_norm == b_norm:
        return True

    # 提取年份集合
    a_years = extract_years(a_norm)
    b_years = extract_years(b_norm)

    if not a_years or not b_years:
        return False

    # 排序后再比较
    a_sorted = sorted(set(a_years))
    b_sorted = sorted(set(b_years))

    # 完全一致或包含
    if a_sorted == b_sorted:
        return True
    if len(a_sorted) == 1 and a_sorted[0] in b_sorted:
        return True
    if len(b_sorted) == 1 and b_sorted[0] in a_sorted:
        return True

    # 仅一个年份时允许±1
    if len(a_sorted) == 1 and len(b_sorted) == 1 and abs(int(a_sorted[0]) - int(b_sorted[0])) <= 1:
        return True

    return False

# === 5. 应用模糊匹配函数 ===
merged["fuzzy_match"] = merged.apply(lambda row: is_fuzzy_match(row["normalized_report_year"], row["chosen_rfyear"]), axis=1)

# === 6. 计算准确率 ===
total = len(merged)
correct = merged["fuzzy_match"].sum()
accuracy = correct / total

print(f"✅ number of combine samples：{total}")
print(f"✅ correct matches：{correct}")
print(f"✅ Fuzzy Accuracy：{accuracy:.2%}")

# === 7. 保存结果 ===
merged[["pdf_name", "chosen_rfyear", "normalized_report_year", "fuzzy_match"]].to_csv("eval/report_years_comparison1.csv", index=False)

✅ number of combine samples：224
✅ correct matches：185
✅ Fuzzy Accuracy：82.59%


In [5]:
import pandas as pd
import re

# === 1. 加载数据 ===
gpt_df = pd.read_csv("results/report_years1.csv")  # GPT提取结果
label_df = pd.read_excel("check/rfyear_annotation2.xlsx")            # 标注结果

# === 2. 重命名列对齐、合并（按文件名）===
gpt_df.rename(columns={"filename": "pdf_name", "report_year": "normalized_report_year"}, inplace=True)
merged = pd.merge(label_df, gpt_df, on="pdf_name", how="inner")

# === 3. 清洗文本：空值转空串，大小写、空格、换行处理 ===
def clean_text(s):
    if pd.isna(s): return ""
    return str(s).strip().lower().replace("\n", " ")

merged["normalized_report_year"] = merged["normalized_report_year"].apply(clean_text)
merged["chosen_rfyear"] = merged["chosen_rfyear"].apply(clean_text)

# === 4. 定义辅助函数 ===
import re

def normalize_year_text(s):
    if not s:
        return ""
    s = s.lower().strip()
    
    # 替换常见连字符为统一格式
    s = s.replace("–", " to ").replace("-", " to ").replace("/", " to ")

    # 清除无效字符
    s = re.sub(r'[\(\)\[\],;:]', ' ', s)
    s = re.sub(r'\s+', ' ', s)

    # fy 缩写处理：fy2020 或 fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{4})\b', r'\1', s)                   # fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{2})\b', lambda m: f"20{m.group(1)}", s)  # fy 19 → 2019

    # 区间格式处理：2020 to 21 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s+to\s+(\d{2})\b', lambda m: f"{m.group(1)} to 20{m.group(2)}", s)

    # 区间格式处理：2020–2021、2020-2021、fy2020 to 2021 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s*to\s*(20\d{2})\b', r'\1 to \2', s)
    s = re.sub(r'\b(20\d{2})\s+to\s+20\d{2}', r'\g<0>', s)

    # 清理序数词
    s = re.sub(r'\b(\d{1,2})(st|nd|rd|th)\b', r'\1', s)

    return s.strip()

def extract_years(s):
    return sorted(set(re.findall(r'\b(20\d{2}|19\d{2})\b', s)))

def is_fuzzy_match(a, b):
    a_norm = normalize_year_text(a)
    b_norm = normalize_year_text(b)
    if a_norm == b_norm:
        return True

    # 提取年份集合
    a_years = extract_years(a_norm)
    b_years = extract_years(b_norm)

    if not a_years or not b_years:
        return False

    # 排序后再比较
    a_sorted = sorted(set(a_years))
    b_sorted = sorted(set(b_years))

    # 完全一致或包含
    if a_sorted == b_sorted:
        return True
    if len(a_sorted) == 1 and a_sorted[0] in b_sorted:
        return True
    if len(b_sorted) == 1 and b_sorted[0] in a_sorted:
        return True

    # 仅一个年份时允许±1
    if len(a_sorted) == 1 and len(b_sorted) == 1 and abs(int(a_sorted[0]) - int(b_sorted[0])) <= 1:
        return True

    return False

# === 5. 应用模糊匹配函数 ===
merged["fuzzy_match"] = merged.apply(lambda row: is_fuzzy_match(row["normalized_report_year"], row["chosen_rfyear"]), axis=1)

# === 6. 计算准确率 ===
total = len(merged)
correct = merged["fuzzy_match"].sum()
accuracy = correct / total

print(f"✅ number of combine samples：{total}")
print(f"✅ correct matches：{correct}")
print(f"✅ Fuzzy Accuracy：{accuracy:.2%}")

# === 7. 保存结果 ===
merged[["pdf_name", "chosen_rfyear", "normalized_report_year", "fuzzy_match"]].to_csv("eval/2report_years_comparison1.csv", index=False)

✅ number of combine samples：221
✅ correct matches：187
✅ Fuzzy Accuracy：84.62%
