In [20]:
import fitz
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import json

# Load OpenAI API key from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# 提取 PDF 前后页文本
def extract_front_back_text(pdf_path, front_n=5, back_n=5, dpi=400):
    try:
        doc = fitz.open(str(pdf_path))
        texts = []
        for i in range(min(front_n, len(doc))):
            texts.append(doc[i].get_text())
        for i in range(max(0, len(doc) - back_n), len(doc)):
            texts.append(doc[i].get_text())
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 100:
            raise ValueError("Too short, fallback to OCR.")
        return full_text
    except Exception as e:
        print(f"⚠️ Fallback to OCR on: {pdf_path.name} due to {str(e)}")
        return extract_text_with_ocr(pdf_path, front_n, back_n, dpi)

def extract_text_with_ocr(pdf_path, front_n=5, back_n=5, dpi=400):
    import warnings
    from pdf2image.exceptions import PDFPageCountError
    try:
        with tempfile.TemporaryDirectory() as path:
            try:
                images = convert_from_path(str(pdf_path), dpi=dpi, output_folder=path)
            except PDFPageCountError as e:
                return f"OCR ERROR: PDF structure invalid – {str(e)}"
            except Exception as e:
                return f"OCR ERROR: {str(e)}"

            total_pages = len(images)
            if total_pages == 0:
                return "OCR ERROR: No images extracted"

            selected = images[:front_n] + images[-back_n:]
            texts = []
            for img in selected:
                img = img.convert("L")  # 灰度增强
                text = pytesseract.image_to_string(img, lang="eng")
                texts.append(text)
            return "\n".join(texts)

    except Exception as e:
        return f"OCR ERROR (outer): {str(e)}"


def extract_company_or_publisher_with_gpt(text):
    import json

    # 第一阶段：提取公司名称、别名、国家
    primary_prompt = f"""
You are a corporate reporting analyst.

From the first and last pages of a corporate report, extract the following:
1. The **official company name** that issued the report.
2. A list of **other names** referring to the company (abbreviations, group name, acronyms, etc.).
3. The **country** where the company is headquartered.
4. A brief **reasoning** explaining your extraction.

Return a JSON object like this:
{{
  "company_name": "...",
  "other_names": ["...", "..."],
  "country": "...",
  "reasoning": "..."
}}

If the company name is not found, return:
{{
  "company_name": "UNKNOWN",
  "other_names": [],
  "country": "UNKNOWN",
  "reasoning": "No indication of the company in the text."
}}

Text:
{text}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1",  # 更稳定
            messages=[
                {"role": "system", "content": "You are an expert in ESG and corporate reporting."},
                {"role": "user", "content": primary_prompt}
            ],
            temperature=0
        )
        content = response.choices[0].message.content.strip()
        result = json.loads(content)
    except Exception as e:
        return {
            "company_name": "GPT_ERROR",
            "other_names": [],
            "country": "GPT_ERROR",
            "reasoning": str(e),
            "publisher": None
        }
    except json.JSONDecodeError:
        return {
            "company_name": "PARSE_ERROR",
            "other_names": [],
            "country": "PARSE_ERROR",
            "reasoning": f"Raw response: {content}",
            "publisher": None
        }

    # 第二阶段：若无法识别公司名，尝试提取 publisher
    if result.get("company_name", "").upper() == "UNKNOWN":
        try:
            secondary_prompt = f"""
This report was not issued by a company but possibly by a public institution or academic body.

From the following text, extract the **publisher** and return:
{{ "publisher": "..." }}

If not found:
{{ "publisher": "UNKNOWN" }}

Text:
{text}
"""
            response2 = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {"role": "system", "content": "You are an expert in institutional publishing."},
                    {"role": "user", "content": secondary_prompt}
                ],
                temperature=0
            )
            content2 = response2.choices[0].message.content.strip()
            pub_result = json.loads(content2)
            result["publisher"] = pub_result.get("publisher", "UNKNOWN")
        except Exception as e:
            result["publisher"] = f"GPT_ERROR: {str(e)}"
    else:
        result["publisher"] = None

    # 清理字段
    if result.get("company_name", "").upper() == "UNKNOWN":
        result["company_name"] = ""
    if not isinstance(result.get("other_names"), list):
        result["other_names"] = []
    if "country" not in result or not isinstance(result["country"], str):
        result["country"] = "UNKNOWN"

    return result

def process_folder_with_gpt_and_type(pdf_folder, output_path):
    results = []
    pdf_folder = Path(pdf_folder)
    pdf_files = list(pdf_folder.glob("*.pdf"))

    for pdf in tqdm(pdf_files, desc="Classifying reports"):
        text = extract_front_back_text(pdf)
        result = extract_company_or_publisher_with_gpt(text)
        results.append({
            "file_name": pdf.name,
            "company_name": result["company_name"],
            "other_names": "; ".join(result.get("other_names", [])),  # 用分号拼接方便输出
            "publisher": result.get("publisher", None),
            "reasoning": result["reasoning"],
            "country": result["country"],
        })

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    print(f"✅ Saved to {output_path}")
    return df

In [None]:
# 路径替换成你的 PDF 文件夹
# Paths
PDF_DIR = Path("pdf_folder")  # Ensure this path contains your PDFs
OUTPUT_PATH = Path("output/company_name_gpt_results.csv")
process_folder_with_gpt_and_type(PDF_DIR, OUTPUT_PATH)

Classifying reports:  20%|█▉        | 251/1277 [27:25<2:29:51,  8.76s/it]

⚠️ Fallback to OCR on: Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf due to Too short, fallback to OCR.


Classifying reports:  20%|██        | 256/1277 [28:01<2:33:32,  9.02s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  20%|██        | 257/1277 [28:11<2:36:06,  9.18s/it]

⚠️ Fallback to OCR on: Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf due to Too short, fallback to OCR.


Classifying reports:  28%|██▊       | 360/1277 [38:33<1:31:06,  5.96s/it]

⚠️ Fallback to OCR on: Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf due to Too short, fallback to OCR.


Classifying reports:  34%|███▎      | 430/1277 [44:43<1:04:52,  4.60s/it]

⚠️ Fallback to OCR on: Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf due to Too short, fallback to OCR.


Classifying reports:  39%|███▉      | 498/1277 [51:30<1:11:50,  5.53s/it]

⚠️ Fallback to OCR on: Unknown_adp07-sus-fr_95qx6prh.pdf due to Too short, fallback to OCR.


Classifying reports:  53%|█████▎    | 672/1277 [1:10:49<44:41,  4.43s/it]  

⚠️ Fallback to OCR on: Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf due to Too short, fallback to OCR.


Classifying reports:  57%|█████▋    | 724/1277 [1:16:06<52:09,  5.66s/it]  

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

Classifying reports:  60%|██████    | 771/1277 [1:20:37<33:48,  4.01s/it]  

⚠️ Fallback to OCR on: Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf due to Too short, fallback to OCR.


Classifying reports:  70%|███████   | 896/1277 [1:33:39<27:12,  4.28s/it]  

⚠️ Fallback to OCR on: Hyosung_Corp_SR_2020_en_8g98j6gk.pdf due to Too short, fallback to OCR.


Classifying reports:  76%|███████▋  | 974/1277 [1:42:01<19:38,  3.89s/it]  

⚠️ Fallback to OCR on: Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf due to Too short, fallback to OCR.


Classifying reports:  77%|███████▋  | 987/1277 [1:42:43<16:14,  3.36s/it]

⚠️ Fallback to OCR on: Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf due to Too short, fallback to OCR.


Classifying reports:  87%|████████▋ | 1106/1277 [1:52:01<11:11,  3.93s/it]

⚠️ Fallback to OCR on: EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf due to Too short, fallback to OCR.


Classifying reports:  88%|████████▊ | 1122/1277 [1:53:24<16:16,  6.30s/it]

⚠️ Fallback to OCR on: Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf due to Too short, fallback to OCR.


Classifying reports:  90%|████████▉ | 1149/1277 [1:55:30<09:11,  4.31s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  91%|█████████▏| 1168/1277 [1:56:51<06:32,  3.61s/it]

MuPDF error: format error: No default Layer config



Classifying reports:  95%|█████████▍| 1209/1277 [2:00:06<04:55,  4.35s/it]

⚠️ Fallback to OCR on: Unknown_2023042101335_kyzhtmjn.pdf due to Too short, fallback to OCR.


Classifying reports:  97%|█████████▋| 1239/1277 [2:02:52<03:55,  6.19s/it]

⚠️ Fallback to OCR on: Titan_Company_Ltd_Annual20Report202013_p4r8w07u.pdf due to Too short, fallback to OCR.


Classifying reports:  99%|█████████▉| 1269/1277 [2:05:28<00:25,  3.22s/it]

⚠️ Fallback to OCR on: BASF_SE_2012_BASF_Report_lmq79gwn.pdf due to Too short, fallback to OCR.


Classifying reports: 100%|██████████| 1277/1277 [2:05:57<00:00,  5.92s/it]

✅ Saved to output/company_name_gpt_results3-1.csv





Unnamed: 0,file_name,company_name,other_names,publisher,reasoning,country
0,Unknown_8f57f855-11bb-496d-9916-91ff88cb537b_s...,Paramount Global,Paramount; the Company; ViacomCBS Inc.; Viacom...,,The first page of the report clearly states 'P...,United States
1,Toyota_Industries_Corp_environment2004_40h96hj...,Toyota Industries Corporation,Toyota Industries; Toyota Industries Group; To...,,The official company name 'Toyota Industries C...,Japan
2,Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf,"Knoll, Inc.",Knoll; Knoll Group; KnollStudio; KnollTextiles...,,The first page of the report includes a quote ...,United States
3,Intel_Corp__fwws0wtm.pdf,Intel Corporation,Intel; Intel Foundation; Intel Group,,The first page of the report refers to 'Intel'...,United States
4,Unknown_2020_SEBANG20SUSTAINABILITY20REPORT_EN...,"SEBANG Co., Ltd.",SEBANG; SEBANG Group; SEBANG Express; SEBANG VINA,,"The official company name 'SEBANG Co., Ltd.' i...",South Korea
...,...,...,...,...,...,...
1272,Logwin_AG_CSR_Report_2021_en_rbp4aney.pdf,Logwin AG,Logwin; Logwin Group; Logwin Corporation,,"On page 3, the text states: 'Logwin AG, with h...",Luxembourg
1273,Shanghai_Electric_Group_Co_Ltd_95909_s0uqoqkj.pdf,"Shanghai Electric Group Co., Ltd",Shanghai Electric; Group; Company; we,,The official company name 'Shanghai Electric G...,China
1274,PT_Soho_Global_Health_Tbk_Final_annual_report_...,PT Soho Global Health Tbk,SGH; Soho Global Health; Kelompok Usaha; the G...,,The official company name 'PT Soho Global Heal...,Indonesia
1275,Banner_Corp_32banner-sustainability-report-202...,Banner Ltd.,Banner; Banner Group; Banner Group Ltd; Banner...,,The first and last pages of the report list th...,United Kingdom


In [None]:
import pandas as pd
from difflib import SequenceMatcher
import re
import ast

# === Step 1: 加载文件 ===
df_pred = pd.read_csv("output/company_name_gpt_results3.csv")  # 包含 company_name、other_names
df_true = pd.read_csv("check/matching_gabarito_with_pdfs.csv")  # 包含 name_2

df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 2: 清洗函数 ===
def _clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    suffixes = [" company", " companies", " corporation", " incorporated", " corp", " llc", " ltd", " inc", 
                " oyj", " intl", " sa", " lp", " spa", " sanv", " nv", " plc", " nvsa", " ptd", 
                " int", " international", "limited", "group", "the ", " holdings", " co"]
    for suffix in suffixes:
        name = name.replace(suffix, "")
    name = name.replace("é", "e").replace("  ", "").replace(" ", "")
    return re.sub(r"[^a-zA-Z0-9]", "", name)

# === Step 3: 匹配函数 ===
def fuzzy_match(clean_a, clean_b):
    return SequenceMatcher(None, clean_a, clean_b).ratio()

def get_best_match(row):
    target = _clean_name(row["name_2"])
    main_name = _clean_name(row["company_name"])
    best_score = fuzzy_match(main_name, target)
    best_source = "company_name"

    # 如果提供了 other_names，就尝试匹配
    if "other_names" in row and pd.notna(row["other_names"]):
        try:
            candidates = ast.literal_eval(row["other_names"]) if isinstance(row["other_names"], str) else []
        except Exception:
            candidates = []
        for alt in candidates:
            alt_clean = _clean_name(alt)
            score = fuzzy_match(alt_clean, target)
            if score > best_score:
                best_score = score
                best_source = "other_names"

    return pd.Series([best_score, best_score >= 0.85, best_source], index=["fuzzy_score", "is_correct", "matched_by"])

# === Step 4: 应用匹配函数 ===
df_merged[["fuzzy_score", "is_correct", "matched_by"]] = df_merged.apply(get_best_match, axis=1)

# === Step 5: 输出评估表 ===
df_eval = df_merged[["filename", "company_name", "other_names", "name_2", "fuzzy_score", "is_correct", "matched_by"]]
df_eval.to_csv("output/company_name_eval_with_alias.csv", index=False)

# === Step 6: 显示准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率（含别名容错）：{accuracy:.2%}")

✅ 公司名称提取准确率（含别名容错）：75.49%


In [None]:
import pandas as pd
from difflib import SequenceMatcher
import re
import ast

# === Step 1: 加载文件 ===
df_pred = pd.read_csv("output/company_name_gpt_results3.csv")  # 包含 company_name、other_names
df_true = pd.read_csv("check/matching_gabarito_with_pdfs.csv")  # 包含 name_2

df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 2: 清洗函数 ===
def _clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    suffixes = [
        " company", " companies", " corporation", " incorporated", " corp", " llc", " ltd", " inc", 
        " oyj", " intl", " sa", " lp", " spa", " sanv", " nv", " plc", " nvsa", " ptd", 
        " int", " international", "limited", "group", "the ", " holdings", " co"
    ]
    for suffix in suffixes:
        name = name.replace(suffix, "")
    name = name.replace("é", "e").replace("  ", "").replace(" ", "")
    return re.sub(r"[^a-zA-Z0-9]", "", name)

# === Step 3: 匹配函数 ===
def fuzzy_match(clean_a, clean_b):
    return SequenceMatcher(None, clean_a, clean_b).ratio()
def get_best_match(row):
    target = _clean_name(row["name_2"])
    main_name = _clean_name(row["company_name"])
    best_score = fuzzy_match(main_name, target)
    best_source = "company_name"

    # 解析 other_names：支持列表或字符串（自动切分）
    candidates = []
    if "other_names" in row and pd.notna(row["other_names"]):
        raw = row["other_names"]
        try:
            if isinstance(raw, str):
                if raw.startswith("["):  # 是列表字符串
                    candidates = ast.literal_eval(raw)
                else:  # 否则按常见分隔符切分
                    candidates = re.split(r"[;,\n]", raw)
            elif isinstance(raw, list):
                candidates = raw
        except Exception:
            candidates = []

    for alt in candidates:
        alt_clean = _clean_name(alt)
        score = fuzzy_match(alt_clean, target)
        if score > best_score:
            best_score = score
            best_source = f"other_names ({alt.strip()})"

    return pd.Series([best_score, best_score >= 0.85, best_source], index=["fuzzy_score", "is_correct", "matched_by"])
# === Step 4: 应用匹配函数 ===
df_merged[["fuzzy_score", "is_correct", "matched_by"]] = df_merged.apply(get_best_match, axis=1)

# === Step 5: 输出评估表 ===
df_eval = df_merged[["filename", "company_name", "other_names", "name_2", "fuzzy_score", "is_correct", "matched_by"]]
df_eval.to_csv("eval/company_name_eval_with_alias.csv", index=False)

# === Step 6: 显示准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率（含别名匹配）：{accuracy:.2%}")

✅ 公司名称提取准确率（含别名匹配）：90.69%


In [None]:
import pandas as pd
import re
import ast
from difflib import SequenceMatcher

# === Step 1: 加载文件 ===
df_pred = pd.read_csv("output/company_name_gpt_results3.csv")  # 包含 company_name、other_names
df_true = pd.read_csv("check/matching_gabarito_with_pdfs.csv")  # 包含 name_2

df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()


# === Step 2: 国家简称 ↔︎ 全称映射 ===
from iso3166 import countries_by_alpha2, countries_by_name
def normalize_country_name(name):
    if pd.isna(name):
        return ""
    name = name.strip().lower()
    # 检查是否是简称
    if name.upper() in countries_by_alpha2:
        return countries_by_alpha2[name.upper()].name.lower()
    # 检查是否是全称
    for country in countries_by_name.values():
        if name in country.name.lower():
            return country.name.lower()
    return name

# === Step 3: 清洗函数 ===
def _clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    suffixes = [
        " company", " companies", " corporation", " incorporated", " corp", " llc", " ltd", " inc", 
        " oyj", " intl", " sa", " lp", " spa", " sanv", " nv", " plc", " nvsa", " ptd", 
        " int", " international", "limited", "group", "the ", " holdings", " co"
    ]
    for suffix in suffixes:
        name = name.replace(suffix, "")
    name = name.replace("é", "e").replace("  ", "").replace(" ", "")
    return re.sub(r"[^a-zA-Z0-9]", "", name)

def fuzzy_match(clean_a, clean_b):
    return SequenceMatcher(None, clean_a, clean_b).ratio()

def keyword_overlap_match(name1: str, name2: str) -> bool:
    """只保留字母+数字单词，比对是否有交集"""
    words1 = set(re.findall(r'\b\w+\b', name1.lower()))
    words2 = set(re.findall(r'\b\w+\b', name2.lower()))
    # 排除无意义短词（如co, ltd, inc等）
    blacklist = {"co", "ltd", "inc", "group", "company", "corp", "corporation", "plc", "holdings"}
    words1 = {w for w in words1 if w not in blacklist and len(w) > 2}
    words2 = {w for w in words2 if w not in blacklist and len(w) > 2}
    return len(words1 & words2) > 0

# === Step 4: 处理匹配逻辑（主名 + 别名 + 国家辅助）===
def get_best_match(row):
    target = _clean_name(row["name_2"])
    main_name = _clean_name(row["company_name"])
    best_score = fuzzy_match(main_name, target)
    best_source = "company_name"
    is_match = best_score >= 0.85

    # 尝试匹配别名
    candidates = []
    if "other_names" in row and pd.notna(row["other_names"]):
        raw = row["other_names"]
        try:
            if isinstance(raw, str):
                if raw.startswith("["):  # 是列表字符串
                    candidates = ast.literal_eval(raw)
                else:  # 否则按分隔符分割
                    candidates = re.split(r"[;,\n]", raw)
            elif isinstance(raw, list):
                candidates = raw
        except Exception:
            candidates = []

    for alt in candidates:
        alt_clean = _clean_name(alt)
        score = fuzzy_match(alt_clean, target)
        if score > best_score:
            best_score = score
            best_source = f"other_names ({alt.strip()})"
            is_match = score >= 0.85

    # 如果主名/别名都不满足阈值，则根据国家判断
    # === 替换原来的 relaxed_by_country_match 判断部分 ===
    if not is_match:
        country1 = normalize_country_name(row.get("country", ""))
        country2 = normalize_country_name(row.get("Loc", ""))
        if country1 and country1 == country2:
            if keyword_overlap_match(row["company_name"], row["name_2"]):
                is_match = True
                best_source = "relaxed_by_country_keyword"

    return pd.Series([best_score, is_match, best_source], index=["fuzzy_score", "is_correct", "matched_by"])

# === Step 5: 应用匹配逻辑 ===
df_merged[["fuzzy_score", "is_correct", "matched_by"]] = df_merged.apply(get_best_match, axis=1)

# === Step 6: 保存结果 ===
df_eval = df_merged[["filename", "company_name", "other_names", "name_2", "country", "loc", "fuzzy_score", "is_correct", "matched_by"]]
df_eval.to_csv("eval/company_name_eval_with_country.csv", index=False)

# === Step 7: 输出准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率（含别名 + 国家辅助匹配）：{accuracy:.2%}")

Index(['file_name', 'company_name', 'other_names', 'publisher', 'reasoning',
       'country', 'filename', 'name', 'name_2', 'is_match',
       'difficulty_category', 'conml', 'loc', 'GICS_level_1', 'GICS_level_2',
       'GICS_level_3', 'predicted_company_name', 'pdf_data_lake_path',
       'pdf_bucket_path1', 'pdf_bucket_path2', 'pdf_bucket_path3',
       'pdf_bucket_path4', 'pdf_bucket_path5', 'pdf_path'],
      dtype='object')
✅ 公司名称提取准确率（含别名 + 国家辅助匹配）：90.69%


In [16]:
df_true = pd.read_csv("check/matching_gabarito_with_pdfs.csv")  # 包含 name_2
print(df_true["loc"].value_counts())

loc
USA    553
JPN    334
IND    274
GBR    184
KOR    118
CHN    112
CAN    102
DEU     86
FRA     84
AUS     80
HKG     60
SGP     54
IDN     51
CHE     47
ZAF     44
SWE     41
ITA     31
NOR     26
ESP     25
NLD     20
AUT     17
FIN     14
NZL     14
POL     13
CHL     13
IRL     13
BEL     12
TUR     11
ISR     11
DNK      9
MEX      8
SAU      7
LUX      7
JEY      4
MAR      4
PRT      4
GRC      3
BRA      3
COL      3
VGB      2
CYM      2
BMU      2
HRV      1
GGY      1
ARG      1
LTU      1
BGR      1
RUS      1
CYP      1
CZE      1
SVN      1
Name: count, dtype: int64


In [23]:
import pandas as pd
from difflib import SequenceMatcher
import re
import ast
import pycountry

# === Step 1: 加载文件 ===
df_pred = pd.read_csv("output/company_name_gpt_results3.csv")  # 包含 company_name、other_names
df_true = pd.read_csv("check/matching_gabarito_with_pdfs2.csv")  # 包含 name_2
df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 2: 清洗函数 ===
def _clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    suffixes = [
        " company", " companies", " corporation", " incorporated", " corp", " llc", " ltd", " inc", 
        " oyj", " intl", " sa", " lp", " spa", " sanv", " nv", " plc", " nvsa", " ptd", 
        " int", " international", "limited", "group", "the ", " holdings", " co"
    ]
    for suffix in suffixes:
        name = name.replace(suffix, "")
    return re.sub(r"\s+", " ", name).strip()

def fuzzy_match(a, b):
    return SequenceMatcher(None, a, b).ratio()

def normalize_country_name(name):
    if pd.isna(name) or not isinstance(name, str):
        return ""
    name = name.strip().lower()
    try:
        # 先尝试国家简称
        country = pycountry.countries.get(alpha_2=name.upper())
        if country:
            return country.name.lower()
        # 再尝试国家全称
        country = pycountry.countries.search_fuzzy(name)[0]
        return country.name.lower()
    except:
        return name.lower()

def word_overlap_match(name1, name2):
    """返回两个名字中单词交集是否显著（≥1个有效关键词）"""
    stopwords = {"group", "holdings", "company", "limited", "corporation", "inc", "co"}
    words1 = set(re.findall(r"\b\w+\b", name1.lower())) - stopwords
    words2 = set(re.findall(r"\b\w+\b", name2.lower())) - stopwords
    return len(words1 & words2) > 0

# === Step 3: 匹配函数 ===
def get_best_match(row):
    target = _clean_name(row["name_2"])
    main_name = _clean_name(row["company_name"])
    best_score = fuzzy_match(main_name, target)
    best_source = "company_name"
    is_match = best_score >= 0.85

    # 尝试别名匹配
    candidates = []
    if "other_names" in row and pd.notna(row["other_names"]):
        raw = row["other_names"]
        try:
            if isinstance(raw, str):
                if raw.startswith("["):
                    candidates = ast.literal_eval(raw)
                else:
                    candidates = re.split(r"[;,\n]", raw)
            elif isinstance(raw, list):
                candidates = raw
        except Exception:
            candidates = []

    for alt in candidates:
        alt_clean = _clean_name(alt)
        score = fuzzy_match(alt_clean, target)
        if score > best_score:
            best_score = score
            is_match = score >= 0.85
            best_source = f"other_names ({alt.strip()})"

    # 第三阶段：若高分匹配失败，则使用国家宽松匹配 + 单词交集
    country1 = normalize_country_name(row.get("country", ""))
    country2 = normalize_country_name(row.get("loc", ""))
    
    relaxed_match_applied = False
    if not is_match and country1 and country1 == country2:
        if word_overlap_match(row["company_name"], row["name_2"]):
            is_match = True
            best_source = "relaxed_by_country_word_overlap"
            relaxed_match_applied = True

    return pd.Series([
        best_score,
        is_match,
        best_source,
        country1,
        country2,
        relaxed_match_applied
    ], index=[
        "fuzzy_score", "is_correct", "matched_by", 
        "normalized_country_pred", "normalized_country_true", 
        "used_relaxed_match"
    ])

# === Step 4: 应用匹配函数 ===
df_merged[[
    "fuzzy_score", "is_correct", "matched_by", 
    "normalized_country_pred", "normalized_country_true", 
    "used_relaxed_match"
]] = df_merged.apply(get_best_match, axis=1)

# === Step 5: 输出表格 ===
df_eval = df_merged[[
    "filename", "company_name", "other_names", "country", 
    "name_2", "loc", "normalized_country_pred", 
    "normalized_country_true", "fuzzy_score", "is_correct", 
    "matched_by", "used_relaxed_match"
]]
df_eval.to_csv("eval/company_name_eval_with_country_relaxed.csv", index=False)

# === Step 6: 显示准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率（含国家宽松匹配）：{accuracy:.2%}")

✅ 公司名称提取准确率（含国家宽松匹配）：95.59%
