In [1]:
from openai import OpenAI
from pathlib import Path
import fitz  # PyMuPDF
import pandas as pd
from tqdm import tqdm
import json
from dotenv import load_dotenv

from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import tempfile
import os

import base64
import shutil

from joblib import Parallel, delayed



In [2]:
# Load OpenAI API key from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Paths
PDF_DIR = Path("pdf_folder")  # Ensure this path contains your PDFs
# OUTPUT_PATH = Path("results/classify_gpt_results1.csv")

In [3]:

# OCR 补救函数：用于从首页和末页图像提取文本
def extract_text_with_ocr(pdf_path, front_n=5, back_n=5, dpi=300):
    import warnings
    from pdf2image.exceptions import PDFPageCountError
    try:
        with tempfile.TemporaryDirectory() as path:
            try:
                images = convert_from_path(str(pdf_path), dpi=dpi, output_folder=path)
            except PDFPageCountError as e:
                return f"OCR ERROR: PDF structure invalid – {str(e)}"
            except Exception as e:
                return f"OCR ERROR: {str(e)}"

            total_pages = len(images)
            if total_pages == 0:
                return "OCR ERROR: No images extracted"

            selected = images[:front_n] + images[-back_n:]
            texts = []
            for img in selected:
                img = img.convert("L")  # 灰度增强
                text = pytesseract.image_to_string(img, lang="eng")
                texts.append(text)
            return "\n".join(texts)

    except Exception as e:
        return f"OCR ERROR (outer): {str(e)}"

In [4]:

def extract_front_back_text(pdf_path, front_n=10, back_n=10, dpi=300):
    try:
        doc = fitz.open(str(pdf_path))
        texts = []
        for i in range(min(front_n, len(doc))):
            texts.append(doc[i].get_text())
        for i in range(max(0, len(doc) - back_n), len(doc)):
            texts.append(doc[i].get_text())
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 100:
            raise ValueError("Too short, fallback to OCR.")
        return full_text
    except Exception as e:
        print(f"⚠️ Fallback to OCR on: {pdf_path.name} due to {str(e)}")
        return extract_text_with_ocr(pdf_path, front_n, back_n, dpi)
    

In [5]:

# ========== 4. 构造 Prompt ==========
def build_report_year_prompt(text):
    return f"""
You are an expert assistant helping to extract **the reporting period actually covered by the report** (the action period), not targets or future goals.

Follow these rules strictly and return only a **JSON object** as described.

---

### What to EXTRACT (high priority cues)
Pick the **most authoritative, explicit scope statement** such as lines containing:
- "covers ... from X to Y"
- "reporting period:" / "report period:"
- "for the year ended ..." / "for the period ended ..."
- "fiscal year [range] from X to Y" (when it states the scope of THIS report)
- "this report includes / contains data from X to Y"

### What to IGNORE (do NOT use as reporting period)
- Long-term targets/roadmaps: e.g., "by FY30", "by FY40", "from 2022 onward", "target reduction from FY2018 levels"
- Baseline/comparison references: "compared to FY2018", "since 2019", "2019 highlights"
- Generic facts with years that do not declare the **covered period**
- Multi-year strategies without explicit "covers/reporting period" verbs

If multiple candidates exist, apply tie-breakers in this order:
1) Prefer sentences that explicitly say "covers/reporting period/for the year ended".
2) Prefer **the narrowest exact date/window** that clearly defines this report’s scope.
3) Prefer statements that name **both start and end** (e.g., "from Nov 1, 2019 to Oct 31, 2021").
4) If both a month/year range and a day/month/year range exist for the same scope, pick the **day-level** one.

---

### Normalization rules
1) If it's a **single year**, like "2013" or "FY2020", return "2013" or "2020" (no months/days).
2) If it's a **range of years only**, like "2014–15", return "2014 to 2015" (no months/days).
3) If it includes **exact dates**, remove ordinals (1st→1) and format as:
   "1 April 2020 to 31 March 2021"
4) If it uses **months only**:
   "April 2020 – March 2021" → "April 2020 to March 2021"
5) Convert dashes (– or -) or slashes (/) to "to".
6) Do **not** invent missing information.

---

### Output JSON

If a valid reporting period is found:
{{
  "normalized_report_year": "your final normalized version",
  "original_expression": "verbatim span from the text that states the scope",
  "source": "e.g. main text; table; footnote (if you can infer)"
}}

If no valid reporting period is found:
{{
  "normalized_report_year": null,
  "original_expression": null,
  "source": "NOT FOUND"
}}

---


### Mini example (the exact case you often miss)

Input snippet:
"Here are a few facts... Pure is committing to a 3x reduction ... by FY30...  
This inaugural report **covers ESG data, initiatives and activities from February 1, 2019 (FY20) to January 31, 2021 (FY21)**."

Expected:
{{
  "normalized_report_year": "1 February 2019 to 31 January 2021",
  "original_expression": "covers ESG data, initiatives and activities from February 1, 2019 (FY20) to January 31, 2021 (FY21)",
  "source": "main text"
}}

---

### Now analyze and extract:
{text}
"""

# ========== 5. Vision 模型辅助 ==========
def encode_image_to_base64(pil_image):
    with tempfile.NamedTemporaryFile(suffix=".png") as f:
        pil_image.save(f.name, format="PNG")
        with open(f.name, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

def extract_year_from_vision(pdf_path, page_limit=3):
    try:
        images = convert_from_path(pdf_path, dpi=200)
        for i, img in enumerate(images[:page_limit]):
            b64 = encode_image_to_base64(img)
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "user", "content": [
                        {"type": "text", "text": f"Please extract the fiscal year or reporting period from page {i+1}. Please return in this JSON format:\n"
                                                 "{\n  \"normalized_report_year\": \"...\",\n  \"original_expression\": \"...\",\n  \"source\": \"Page {i+1}, image-based\"\n}"},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
                    ]}
                ],
                max_tokens=300
            )
            content = response.choices[0].message.content
            parsed = json.loads(content.strip()) if content.strip().startswith("{") else eval(content.strip())
            if parsed.get("normalized_report_year"):
                return parsed
    except Exception as e:
        return {
            "normalized_report_year": None,
            "original_expression": None,
            "source": f"Vision ERROR: {e}"
        }
    return {
        "normalized_report_year": None,
        "original_expression": None,
        "source": "Vision NOT FOUND"
    }

# ========== 6. 主函数：先文本，再 Vision ==========
def extract_report_year(pdf_path):
    try:
        text = extract_front_back_text(pdf_path)
        prompt = build_report_year_prompt(text)
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=500
        )
        content = response.choices[0].message.content
        parsed = json.loads(content.strip()) if content.strip().startswith("{") else eval(content.strip())
        if parsed.get("normalized_report_year"):
            return parsed
    except Exception as e:
        print(f"⚠️ GPT-4.1-mini failed on {Path(pdf_path).name}, fallback to Vision...")
    return extract_year_from_vision(pdf_path)



In [6]:
def process_pdf(pdf_path):
    text = extract_front_back_text(pdf_path)

    # report_type, sustainability,sustainability_name,classify_reasoning = classify_report_type(text)
    year = extract_report_year(pdf_path)
    # company = extract_company_or_publisher_with_gpt(text)

    return {
        "filename": pdf_path.name,
        # "report_type": rtype.get("report_type"),
        # "has_sustainability_section": rtype.get("has_sustainability_section"),
        # "sustainability_section_name": rtype.get("sustainability_section_name"),

        "normalized_report_year": year.get("normalized_report_year"),
        "original_expression": year.get("original_expression"),
        "year_source": year.get("source")
    }

In [None]:
# 设置路径
from concurrent.futures import ThreadPoolExecutor
pdf_dir = Path("pdf_folder")
output_path = Path("years/report_years2.csv")
os.makedirs(output_path.parent, exist_ok=True)

# 收集所有 PDF 文件
pdf_files = list(pdf_dir.glob("*.pdf"))

# 定义最大线程数（推荐 5–10）
MAX_WORKERS = 5

# 使用线程池并行执行
with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    results = list(tqdm(executor.map(process_pdf, pdf_files), total=len(pdf_files), desc="🚀 Processing PDFs"))

# 保存结果
df = pd.DataFrame(results)
df.to_csv(output_path, index=False)
print(f"✅ Done! Results saved to: {output_path}")

🚀 Processing PDFs:   0%|          | 0/1278 [00:00<?, ?it/s]

⚠️ GPT-4.1-mini failed on Unknown_8f57f855-11bb-496d-9916-91ff88cb537b_sqi6k2d1.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2020_SEBANG20SUSTAINABILITY20REPORT_ENG_cty885ga.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Toyota_Industries_Corp_environment2004_40h96hjb.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Intel_Corp__fwws0wtm.pdf, fallback to Vision...


🚀 Processing PDFs:   0%|          | 1/1278 [00:47<16:52:17, 47.56s/it]

⚠️ GPT-4.1-mini failed on Shui_on_Land_Ltd_3_7xr7l35i.pdf, fallback to Vision...


🚀 Processing PDFs:   0%|          | 4/1278 [00:54<3:51:27, 10.90s/it] 

⚠️ GPT-4.1-mini failed on Unknown_637407834879900000_opfazoiq.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Newport_Corp_Volvo-Ocean-Race-Newport-Stopover-Sustainability-Report_2a7nj4kv.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Mint_Corp_The_mint_ar_10_eng_final_jsnlk95f.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2015-201620Sustainability-Report20210825_171509627_x9kgqjf9.pdf, fallback to Vision...


🚀 Processing PDFs:   1%|          | 9/1278 [01:27<2:07:53,  6.05s/it]

⚠️ GPT-4.1-mini failed on Isabella_Bank_Corp_COMBO_494873_tqto1pwh.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on TIM_SA_TIM-2022-sustainability-report-ENG_5qx6prh7.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2021_RPM_Sustainability_Report_i4bfnpju.pdf, fallback to Vision...


🚀 Processing PDFs:   1%|          | 10/1278 [01:44<3:11:43,  9.07s/it]

⚠️ GPT-4.1-mini failed on Unknown_2022060100165_a05pfov4.pdf, fallback to Vision...


🚀 Processing PDFs:   1%|          | 11/1278 [02:01<3:55:37, 11.16s/it]

⚠️ GPT-4.1-mini failed on BBVA_Compass_Bancshares_Inc_Informe_anual_2010_Eng_tcm927-346446_3qyfos2v.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Sundaram_Clayton_Ltd_BusinessResponsibilityReport2021-22_2njbf6an.pdf, fallback to Vision...


🚀 Processing PDFs:   1%|          | 13/1278 [02:20<3:21:46,  9.57s/it]

⚠️ GPT-4.1-mini failed on Citrix_Systems_Inc_citrix-sustainability-report-2020_qnsobgyv.pdf, fallback to Vision...


🚀 Processing PDFs:   1%|▏         | 16/1278 [02:36<2:16:24,  6.48s/it]

⚠️ GPT-4.1-mini failed on Sterlite_Technologies_Ltd_Annual-report-2016_p4sp9tbo.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2022053120034016302_1gl1sbq2.pdf, fallback to Vision...


🚀 Processing PDFs:   1%|▏         | 17/1278 [02:50<3:01:05,  8.62s/it]

⚠️ GPT-4.1-mini failed on India_Glycols_Ltd_annual-report-2019-20_g3mx3rps.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on JSW_Energy_Ltd_Integrated20Annual20Report202021_wgoux4h5.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Sojitz_Corporation_csr2007_all_3ucpks58.pdf, fallback to Vision...


🚀 Processing PDFs:   2%|▏         | 20/1278 [03:14<2:40:24,  7.65s/it]

⚠️ GPT-4.1-mini failed on PerkinElmer_Inc_2016-2017-Corporate-Social-Responsibility-Report_tcm137-198142_lsp07rq5.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Appian_Corp_476815272_Social-Responsibility-Doc_final-7_chc5albm.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on IQE_PLC__12224-iqe-annual-report-2018_web_10fo3ave.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Hirose_Electric_Co_Ltd_HiroseElectric_SustainabilityReport2020E_3ixvbvy1.pdf, fallback to Vision...


🚀 Processing PDFs:   2%|▏         | 24/1278 [03:48<2:05:53,  6.02s/it]

⚠️ GPT-4.1-mini failed on Asian_Paints_Ltd_q4_3gbbkfbt.pdf, fallback to Vision...


🚀 Processing PDFs:   2%|▏         | 26/1278 [04:02<2:11:34,  6.31s/it]

⚠️ GPT-4.1-mini failed on Metair_Investment_Ltd_Metair-IAR_2020_d28upgn8.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Ambuja_Cements_Ltd_Sustainable-Development-Report-2016_8ixcsb2n.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Marks_and_Spencer_Group_PLC_OTC_MAKSF_2012_ffk4yhuq.pdf, fallback to Vision...


🚀 Processing PDFs:   2%|▏         | 27/1278 [04:16<2:53:06,  8.30s/it]

⚠️ GPT-4.1-mini failed on Chambal_Fertilisers__Chemicals_Gadepan-I-And-II-Compliance-Report_es33maw9.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2020-sustainability-report_6rybl2c1.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on GL_Ltd_kilroy-realty-corporation-sustainability-report-2018_cnh296bj.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_2020-annual-report-final-copy_q50a286f.pdf, fallback to Vision...


🚀 Processing PDFs:   3%|▎         | 32/1278 [04:57<2:23:02,  6.89s/it]

⚠️ GPT-4.1-mini failed on Hess_Corp_hess-2021-sustainability-report_n2wei7r5.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_124096-19In-26060160T18678681728S-In_eolydoef.pdf, fallback to Vision...


🚀 Processing PDFs:   3%|▎         | 34/1278 [05:13<2:21:53,  6.84s/it]

⚠️ GPT-4.1-mini failed on Gresham_Technologies_PLC_LSE_GHTL_2019_biqvttuq.pdf, fallback to Vision...


🚀 Processing PDFs:   3%|▎         | 35/1278 [05:22<2:30:51,  7.28s/it]

⚠️ GPT-4.1-mini failed on Starbucks_Corp_4dd6216d0fd0400f8689eceba0497e04_zovctl4r.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Unknown_95F8F73558174EF49F0B2B278C2F2D27_3tevpoql.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on LyondellBasell_Industries_NV_2019_sustainability_report_fvt1o08m.pdf, fallback to Vision...


🚀 Processing PDFs:   3%|▎         | 38/1278 [05:49<2:28:30,  7.19s/it]

⚠️ GPT-4.1-mini failed on Unknown_27464_7y8uu0oh.pdf, fallback to Vision...


🚀 Processing PDFs:   3%|▎         | 39/1278 [06:01<2:57:05,  8.58s/it]

⚠️ GPT-4.1-mini failed on KRBL_Ltd_KRBL_ANNUAL_REPORT_2018-19_5wnor8by.pdf, fallback to Vision...
⚠️ GPT-4.1-mini failed on Singapore_Airlines_Ltd_annualreport0910_xfiqnr8w.pdf, fallback to Vision...


🚀 Processing PDFs:   3%|▎         | 39/1278 [06:06<3:14:15,  9.41s/it]


In [None]:
import pandas as pd
import re

# === 1. 加载数据 ===
gpt_df = pd.read_csv("years/report_years2.csv")  # GPT提取结果
label_df = pd.read_excel("check/rfyear_annotation2.xlsx")            # 标注结果

# === 2. 重命名列对齐、合并（按文件名）===
gpt_df.rename(columns={"filename": "pdf_name", "report_year": "normalized_report_year"}, inplace=True)
merged = pd.merge(label_df, gpt_df, on="pdf_name", how="inner")

# === 3. 清洗文本：空值转空串，大小写、空格、换行处理 ===
def clean_text(s):
    if pd.isna(s): return ""
    return str(s).strip().lower().replace("\n", " ")

merged["normalized_report_year"] = merged["normalized_report_year"].apply(clean_text)
merged["chosen_rfyear"] = merged["chosen_rfyear"].apply(clean_text)

# === 4. 定义辅助函数 ===
import re

def normalize_year_text(s):
    if not s:
        return ""
    s = s.lower().strip()
    
    # 替换常见连字符为统一格式
    s = s.replace("–", " to ").replace("-", " to ").replace("/", " to ")

    # 清除无效字符
    s = re.sub(r'[\(\)\[\],;:]', ' ', s)
    s = re.sub(r'\s+', ' ', s)

    # fy 缩写处理：fy2020 或 fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{4})\b', r'\1', s)                   # fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{2})\b', lambda m: f"20{m.group(1)}", s)  # fy 19 → 2019

    # 区间格式处理：2020 to 21 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s+to\s+(\d{2})\b', lambda m: f"{m.group(1)} to 20{m.group(2)}", s)

    # 区间格式处理：2020–2021、2020-2021、fy2020 to 2021 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s*to\s*(20\d{2})\b', r'\1 to \2', s)
    s = re.sub(r'\b(20\d{2})\s+to\s+20\d{2}', r'\g<0>', s)

    # 清理序数词
    s = re.sub(r'\b(\d{1,2})(st|nd|rd|th)\b', r'\1', s)

    return s.strip()

def extract_years(s):
    return sorted(set(re.findall(r'\b(20\d{2}|19\d{2})\b', s)))

def is_fuzzy_match(a, b):
    a_norm = normalize_year_text(a)
    b_norm = normalize_year_text(b)
    if a_norm == b_norm:
        return True

    # 提取年份集合
    a_years = extract_years(a_norm)
    b_years = extract_years(b_norm)

    if not a_years or not b_years:
        return False

    # 排序后再比较
    a_sorted = sorted(set(a_years))
    b_sorted = sorted(set(b_years))

    # 完全一致或包含
    if a_sorted == b_sorted:
        return True
    if len(a_sorted) == 1 and a_sorted[0] in b_sorted:
        return True
    if len(b_sorted) == 1 and b_sorted[0] in a_sorted:
        return True

    # 仅一个年份时允许±1
    if len(a_sorted) == 1 and len(b_sorted) == 1 and abs(int(a_sorted[0]) - int(b_sorted[0])) <= 1:
        return True

    return False

# === 5. 应用模糊匹配函数 ===
merged["fuzzy_match"] = merged.apply(lambda row: is_fuzzy_match(row["normalized_report_year"], row["chosen_rfyear"]), axis=1)

# === 6. 计算准确率 ===
total = len(merged)
correct = merged["fuzzy_match"].sum()
accuracy = correct / total

print(f"✅ number of combine samples：{total}")
print(f"✅ correct matches：{correct}")
print(f"✅ Fuzzy Accuracy：{accuracy:.2%}")

# === 7. 保存结果 ===
# merged[["pdf_name", "chosen_rfyear", "normalized_report_year", "fuzzy_match"]].to_csv("eval/report_years_comparison1.csv", index=False)

✅ number of combine samples：224
✅ correct matches：185
✅ Fuzzy Accuracy：82.59%


In [None]:
import pandas as pd
import re

# === 1. 加载数据 ===
gpt_df = pd.read_csv("years/report_years2.csv")  # GPT提取结果
label_df = pd.read_excel("check/rfyear_annotation2.xlsx")            # 标注结果

# === 2. 重命名列对齐、合并（按文件名）===
gpt_df.rename(columns={"filename": "pdf_name", "report_year": "normalized_report_year"}, inplace=True)
merged = pd.merge(label_df, gpt_df, on="pdf_name", how="inner")

# === 3. 清洗文本：空值转空串，大小写、空格、换行处理 ===
def clean_text(s):
    if pd.isna(s): return ""
    return str(s).strip().lower().replace("\n", " ")

merged["normalized_report_year"] = merged["normalized_report_year"].apply(clean_text)
merged["chosen_rfyear"] = merged["chosen_rfyear"].apply(clean_text)

# === 4. 定义辅助函数 ===
import re

def normalize_year_text(s):
    if not s:
        return ""
    s = s.lower().strip()
    
    # 替换常见连字符为统一格式
    s = s.replace("–", " to ").replace("-", " to ").replace("/", " to ")

    # 清除无效字符
    s = re.sub(r'[\(\)\[\],;:]', ' ', s)
    s = re.sub(r'\s+', ' ', s)

    # fy 缩写处理：fy2020 或 fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{4})\b', r'\1', s)                   # fy 2020 → 2020
    s = re.sub(r'\bfy\s*(\d{2})\b', lambda m: f"20{m.group(1)}", s)  # fy 19 → 2019

    # 区间格式处理：2020 to 21 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s+to\s+(\d{2})\b', lambda m: f"{m.group(1)} to 20{m.group(2)}", s)

    # 区间格式处理：2020–2021、2020-2021、fy2020 to 2021 → 2020 to 2021
    s = re.sub(r'\b(20\d{2})\s*to\s*(20\d{2})\b', r'\1 to \2', s)
    s = re.sub(r'\b(20\d{2})\s+to\s+20\d{2}', r'\g<0>', s)

    # 清理序数词
    s = re.sub(r'\b(\d{1,2})(st|nd|rd|th)\b', r'\1', s)

    return s.strip()

def extract_years(s):
    return sorted(set(re.findall(r'\b(20\d{2}|19\d{2})\b', s)))

def is_fuzzy_match(a, b):
    a_norm = normalize_year_text(a)
    b_norm = normalize_year_text(b)
    if a_norm == b_norm:
        return True

    # 提取年份集合
    a_years = extract_years(a_norm)
    b_years = extract_years(b_norm)

    if not a_years or not b_years:
        return False

    # 排序后再比较
    a_sorted = sorted(set(a_years))
    b_sorted = sorted(set(b_years))

    # 完全一致或包含
    if a_sorted == b_sorted:
        return True
    if len(a_sorted) == 1 and a_sorted[0] in b_sorted:
        return True
    if len(b_sorted) == 1 and b_sorted[0] in a_sorted:
        return True

    # 仅一个年份时允许±1
    if len(a_sorted) == 1 and len(b_sorted) == 1 and abs(int(a_sorted[0]) - int(b_sorted[0])) <= 1:
        return True

    return False

# === 5. 应用模糊匹配函数 ===
merged["fuzzy_match"] = merged.apply(lambda row: is_fuzzy_match(row["normalized_report_year"], row["chosen_rfyear"]), axis=1)

# === 6. 计算准确率 ===
total = len(merged)
correct = merged["fuzzy_match"].sum()
accuracy = correct / total

print(f"✅ number of combine samples：{total}")
print(f"✅ correct matches：{correct}")
print(f"✅ Fuzzy Accuracy：{accuracy:.2%}")

# === 7. 保存结果 ===
# merged[["pdf_name", "chosen_rfyear", "normalized_report_year", "fuzzy_match"]].to_csv("eval/2report_years_comparison1.csv", index=False)

✅ number of combine samples：221
✅ correct matches：187
✅ Fuzzy Accuracy：84.62%
