In [None]:
import fitz
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import json

# Load OpenAI API key from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# 提取 PDF 前后页文本
def extract_front_back_text(pdf_path, front_n=5, back_n=5, dpi=400):
    try:
        doc = fitz.open(str(pdf_path))
        texts = []
        for i in range(min(front_n, len(doc))):
            texts.append(doc[i].get_text())
        for i in range(max(0, len(doc) - back_n), len(doc)):
            texts.append(doc[i].get_text())
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 100:
            raise ValueError("Too short, fallback to OCR.")
        return full_text
    except Exception as e:
        print(f"⚠️ Fallback to OCR on: {pdf_path.name} due to {str(e)}")
        return extract_text_with_ocr(pdf_path, front_n, back_n, dpi)

def extract_text_with_ocr(pdf_path, front_n=5, back_n=5, dpi=400):
    import warnings
    from pdf2image.exceptions import PDFPageCountError
    try:
        with tempfile.TemporaryDirectory() as path:
            try:
                images = convert_from_path(str(pdf_path), dpi=dpi, output_folder=path)
            except PDFPageCountError as e:
                return f"OCR ERROR: PDF structure invalid – {str(e)}"
            except Exception as e:
                return f"OCR ERROR: {str(e)}"

            total_pages = len(images)
            if total_pages == 0:
                return "OCR ERROR: No images extracted"

            selected = images[:front_n] + images[-back_n:]
            texts = []
            for img in selected:
                img = img.convert("L")  # 灰度增强
                text = pytesseract.image_to_string(img, lang="eng")
                texts.append(text)
            return "\n".join(texts)

    except Exception as e:
        return f"OCR ERROR (outer): {str(e)}"


def extract_company_or_publisher_with_gpt(text):
    import json

    # 第一阶段：提取公司名 + 其他别名（简称、缩写、母公司等）
    primary_prompt = f"""
You are a corporate reporting analyst.

Given the first and last pages of a corporate report, your tasks are:
1. Identify the official **company name** that issued the report.
2. Identify **other names or abbreviations** (e.g., short forms, brands, acronyms, or group names) mentioned in the text that refer to the same company.
3. Briefly explain your reasoning.

Return a JSON object like this:
{{
  "company_name": "...",
  "other_names": ["...", "..."],     ← optional list
  "reasoning": "..."
}}

If no company name is found, return:
{{
  "company_name": "UNKNOWN",
  "other_names": [],
  "reasoning": "No indication of the company in the text."
}}

Text:
{text}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1",  # 更稳定
            messages=[
                {"role": "system", "content": "You are an expert in ESG and corporate reporting."},
                {"role": "user", "content": primary_prompt}
            ],
            temperature=0
        )
        content = response.choices[0].message.content.strip()
        result = json.loads(content)
    except Exception as e:
        return {
            "company_name": "GPT_ERROR",
            "other_names": [],
            "reasoning": str(e),
            "publisher": None
        }
    except json.JSONDecodeError:
        return {
            "company_name": "PARSE_ERROR",
            "other_names": [],
            "reasoning": f"Raw response: {content}",
            "publisher": None
        }

    # 第二阶段：如果公司名未知，则继续识别 publisher
    if result.get("company_name", "").upper() == "UNKNOWN":
        try:
            secondary_prompt = f"""
You are analyzing a report that is not issued by a company but by a public organization or academic body.

Identify the **publisher** from this text and return:
{{ "publisher": "..." }}

If unknown:
{{ "publisher": "UNKNOWN" }}

Text:
{text[:4000]}
"""
            response2 = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {"role": "system", "content": "You are an expert in institutional publishing."},
                    {"role": "user", "content": secondary_prompt}
                ],
                temperature=0
            )
            content2 = response2.choices[0].message.content.strip()
            pub_result = json.loads(content2)
            result["publisher"] = pub_result.get("publisher", "UNKNOWN")
        except Exception as e:
            result["publisher"] = f"GPT_ERROR: {str(e)}"
    else:
        result["publisher"] = None

    # 清理输出格式
    if result.get("company_name", "").upper() == "UNKNOWN":
        result["company_name"] = ""
    if not isinstance(result.get("other_names"), list):
        result["other_names"] = []

    return result


def process_folder_with_gpt_and_type(pdf_folder, output_path):
    results = []
    pdf_folder = Path(pdf_folder)
    pdf_files = list(pdf_folder.glob("*.pdf"))

    for pdf in tqdm(pdf_files, desc="Classifying reports"):
        text = extract_front_back_text(pdf)
        result = extract_company_or_publisher_with_gpt(text)
        results.append({
            "file_name": pdf.name,
            "company_name": result["company_name"],
            "other_names": "; ".join(result.get("other_names", [])),  # 用分号拼接方便输出
            "publisher": result.get("publisher", None),
            "reasoning": result["reasoning"]
        })

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    print(f"✅ Saved to {output_path}")
    return df

In [2]:
# 路径替换成你的 PDF 文件夹
# Paths
PDF_DIR = Path("pdf_folder")  # Ensure this path contains your PDFs
OUTPUT_PATH = Path("output/company_name_gpt_results2.csv")
process_folder_with_gpt_and_type(PDF_DIR, OUTPUT_PATH)

Classifying reports:  20%|█▉        | 251/1277 [13:27<51:35,  3.02s/it]  

⚠️ Fallback to OCR on: Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf due to Too short, fallback to OCR.


Classifying reports:  20%|██        | 256/1277 [13:40<48:09,  2.83s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  20%|██        | 257/1277 [13:44<50:58,  3.00s/it]

⚠️ Fallback to OCR on: Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf due to Too short, fallback to OCR.


Classifying reports:  28%|██▊       | 360/1277 [19:57<1:07:24,  4.41s/it]

⚠️ Fallback to OCR on: Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf due to Too short, fallback to OCR.


Classifying reports:  34%|███▎      | 430/1277 [23:57<48:54,  3.46s/it]  

⚠️ Fallback to OCR on: Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf due to Too short, fallback to OCR.


Classifying reports:  39%|███▉      | 498/1277 [27:29<35:30,  2.74s/it]  

⚠️ Fallback to OCR on: Unknown_adp07-sus-fr_95qx6prh.pdf due to Too short, fallback to OCR.


Classifying reports:  53%|█████▎    | 672/1277 [36:56<33:19,  3.30s/it]

⚠️ Fallback to OCR on: Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf due to Too short, fallback to OCR.


Classifying reports:  57%|█████▋    | 724/1277 [39:58<40:30,  4.39s/it]

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

Classifying reports:  60%|██████    | 771/1277 [43:05<41:37,  4.94s/it]  

⚠️ Fallback to OCR on: Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf due to Too short, fallback to OCR.


Classifying reports:  70%|███████   | 896/1277 [50:08<18:01,  2.84s/it]

⚠️ Fallback to OCR on: Hyosung_Corp_SR_2020_en_8g98j6gk.pdf due to Too short, fallback to OCR.


Classifying reports:  76%|███████▋  | 974/1277 [55:29<24:14,  4.80s/it]

⚠️ Fallback to OCR on: Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf due to Too short, fallback to OCR.


Classifying reports:  77%|███████▋  | 987/1277 [56:35<23:06,  4.78s/it]

⚠️ Fallback to OCR on: Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf due to Too short, fallback to OCR.


Classifying reports:  87%|████████▋ | 1106/1277 [1:04:41<07:55,  2.78s/it]

⚠️ Fallback to OCR on: EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf due to Too short, fallback to OCR.


Classifying reports:  88%|████████▊ | 1122/1277 [1:06:19<24:31,  9.50s/it]

⚠️ Fallback to OCR on: Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf due to Too short, fallback to OCR.


Classifying reports:  90%|████████▉ | 1149/1277 [1:09:10<20:51,  9.78s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  91%|█████████▏| 1168/1277 [1:10:45<07:32,  4.15s/it]

MuPDF error: format error: No default Layer config



Classifying reports:  95%|█████████▍| 1209/1277 [1:13:50<04:39,  4.10s/it]

⚠️ Fallback to OCR on: Unknown_2023042101335_kyzhtmjn.pdf due to Too short, fallback to OCR.


Classifying reports:  97%|█████████▋| 1239/1277 [1:15:12<01:35,  2.51s/it]

⚠️ Fallback to OCR on: Titan_Company_Ltd_Annual20Report202013_p4r8w07u.pdf due to Too short, fallback to OCR.


Classifying reports:  99%|█████████▉| 1269/1277 [1:17:09<00:22,  2.76s/it]

⚠️ Fallback to OCR on: BASF_SE_2012_BASF_Report_lmq79gwn.pdf due to Too short, fallback to OCR.


Classifying reports: 100%|██████████| 1277/1277 [1:17:51<00:00,  3.66s/it]

✅ Saved to output/company_name_gpt_results2.csv





Unnamed: 0,file_name,company_name,other_names,publisher,reasoning
0,Unknown_8f57f855-11bb-496d-9916-91ff88cb537b_s...,Paramount Global,PARAA; PARA; PARAP,,The official company name 'Paramount Global' i...
1,Toyota_Industries_Corp_environment2004_40h96hj...,Toyota Industries Corporation,Toyota Industries,,The text explicitly states 'Toyota Industries ...
2,Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf,"Knoll, Inc.",Knoll; KnollStudio; The Knoll 8,,"The official company name 'Knoll, Inc.' is exp..."
3,Intel_Corp__fwws0wtm.pdf,Intel Corporation,Intel; Intel Foundation,,The report repeatedly refers to 'Intel' as the...
4,Unknown_2020_SEBANG20SUSTAINABILITY20REPORT_EN...,SEBANG,SEBANG Building; SEBANG Group,,The official company name 'SEBANG' is repeated...
...,...,...,...,...,...
1272,Logwin_AG_CSR_Report_2021_en_rbp4aney.pdf,Logwin AG,Logwin; Logwin Group; Logwin Corporation,,The official company name 'Logwin AG' is expli...
1273,Shanghai_Electric_Group_Co_Ltd_95909_s0uqoqkj.pdf,"Shanghai Electric Group Co., Ltd",Shanghai Electric Group Company Limited; Shang...,,The official company name 'Shanghai Electric G...
1274,PT_Soho_Global_Health_Tbk_Final_annual_report_...,PT SOHO Global Health,SOHO Global Health; SGH,,The text includes a section titled 'Profil Per...
1275,Banner_Corp_32banner-sustainability-report-202...,Banner Ltd.,Banner; Banner Group,,The official company name 'Banner Ltd.' appear...


In [None]:
import pandas as pd
from difflib import SequenceMatcher
import re
import ast

# === Step 1: 加载文件 ===
df_pred = pd.read_csv("output/company_name_gpt_results.csv")  # 包含 company_name、other_names
df_true = pd.read_csv("check/matching_gabarito_with_pdfs.csv")  # 包含 name_2

df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 2: 清洗函数 ===
def _clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    suffixes = [" company", " companies", " corporation", " incorporated", " corp", " llc", " ltd", " inc", 
                " oyj", " intl", " sa", " lp", " spa", " sanv", " nv", " plc", " nvsa", " ptd", 
                " int", " international", "limited", "group", "the ", " holdings", " co"]
    for suffix in suffixes:
        name = name.replace(suffix, "")
    name = name.replace("é", "e").replace("  ", "").replace(" ", "")
    return re.sub(r"[^a-zA-Z0-9]", "", name)

# === Step 3: 匹配函数 ===
def fuzzy_match(clean_a, clean_b):
    return SequenceMatcher(None, clean_a, clean_b).ratio()

def get_best_match(row):
    target = _clean_name(row["name_2"])
    main_name = _clean_name(row["company_name"])
    best_score = fuzzy_match(main_name, target)
    best_source = "company_name"

    # 如果提供了 other_names，就尝试匹配
    if "other_names" in row and pd.notna(row["other_names"]):
        try:
            candidates = ast.literal_eval(row["other_names"]) if isinstance(row["other_names"], str) else []
        except Exception:
            candidates = []
        for alt in candidates:
            alt_clean = _clean_name(alt)
            score = fuzzy_match(alt_clean, target)
            if score > best_score:
                best_score = score
                best_source = "other_names"

    return pd.Series([best_score, best_score >= 0.85, best_source], index=["fuzzy_score", "is_correct", "matched_by"])

# === Step 4: 应用匹配函数 ===
df_merged[["fuzzy_score", "is_correct", "matched_by"]] = df_merged.apply(get_best_match, axis=1)

# === Step 5: 输出评估表 ===
df_eval = df_merged[["filename", "company_name", "other_names", "name_2", "fuzzy_score", "is_correct", "matched_by"]]
df_eval.to_csv("output/company_name_eval_with_alias.csv", index=False)

# === Step 6: 显示准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率（含别名容错）：{accuracy:.2%}")

✅ 公司名称提取准确率（含别名匹配）：75.98%


In [3]:
import pandas as pd
from difflib import SequenceMatcher
import re
import ast

# === Step 1: 加载文件 ===
df_pred = pd.read_csv("output/company_name_gpt_results3.csv")  # 包含 company_name、other_names
df_true = pd.read_csv("check/matching_gabarito_with_pdfs2.csv")  # 包含 name_2

df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 2: 清洗函数 ===
def _clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    suffixes = [
        " company", " companies", " corporation", " incorporated", " corp", " llc", " ltd", " inc", 
        " oyj", " intl", " sa", " lp", " spa", " sanv", " nv", " plc", " nvsa", " ptd", 
        " int", " international", "limited", "group", "the ", " holdings", " co"
    ]
    for suffix in suffixes:
        name = name.replace(suffix, "")
    name = name.replace("é", "e").replace("  ", "").replace(" ", "")
    return re.sub(r"[^a-zA-Z0-9]", "", name)

# === Step 3: 匹配函数 ===
def fuzzy_match(clean_a, clean_b):
    return SequenceMatcher(None, clean_a, clean_b).ratio()
def get_best_match(row):
    target = _clean_name(row["name_2"])
    main_name = _clean_name(row["company_name"])
    best_score = fuzzy_match(main_name, target)
    best_source = "company_name"

    # 解析 other_names：支持列表或字符串（自动切分）
    candidates = []
    if "other_names" in row and pd.notna(row["other_names"]):
        raw = row["other_names"]
        try:
            if isinstance(raw, str):
                if raw.startswith("["):  # 是列表字符串
                    candidates = ast.literal_eval(raw)
                else:  # 否则按常见分隔符切分
                    candidates = re.split(r"[;,\n]", raw)
            elif isinstance(raw, list):
                candidates = raw
        except Exception:
            candidates = []

    for alt in candidates:
        alt_clean = _clean_name(alt)
        score = fuzzy_match(alt_clean, target)
        if score > best_score:
            best_score = score
            best_source = f"other_names ({alt.strip()})"

    return pd.Series([best_score, best_score >= 0.85, best_source], index=["fuzzy_score", "is_correct", "matched_by"])
# === Step 4: 应用匹配函数 ===
df_merged[["fuzzy_score", "is_correct", "matched_by"]] = df_merged.apply(get_best_match, axis=1)

# === Step 5: 输出评估表 ===
df_eval = df_merged[["filename", "company_name", "other_names", "name_2", "fuzzy_score", "is_correct", "matched_by"]]
df_eval.to_csv("eval/company_name_eval_with_alias.csv", index=False)

# === Step 6: 显示准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率（含别名匹配）：{accuracy:.2%}")

✅ 公司名称提取准确率（含别名匹配）：94.12%


In [1]:
import fitz
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import json

# Load OpenAI API key from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# 提取 PDF 前后页文本
def extract_front_back_text(pdf_path, front_n=5, back_n=5, dpi=400):
    try:
        doc = fitz.open(str(pdf_path))
        texts = []
        for i in range(min(front_n, len(doc))):
            texts.append(doc[i].get_text())
        for i in range(max(0, len(doc) - back_n), len(doc)):
            texts.append(doc[i].get_text())
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 100:
            raise ValueError("Too short, fallback to OCR.")
        return full_text
    except Exception as e:
        print(f"⚠️ Fallback to OCR on: {pdf_path.name} due to {str(e)}")
        return extract_text_with_ocr(pdf_path, front_n, back_n, dpi)

def extract_text_with_ocr(pdf_path, front_n=5, back_n=5, dpi=400):
    import warnings
    from pdf2image.exceptions import PDFPageCountError
    try:
        with tempfile.TemporaryDirectory() as path:
            try:
                images = convert_from_path(str(pdf_path), dpi=dpi, output_folder=path)
            except PDFPageCountError as e:
                return f"OCR ERROR: PDF structure invalid – {str(e)}"
            except Exception as e:
                return f"OCR ERROR: {str(e)}"

            total_pages = len(images)
            if total_pages == 0:
                return "OCR ERROR: No images extracted"

            selected = images[:front_n] + images[-back_n:]
            texts = []
            for img in selected:
                img = img.convert("L")  # 灰度增强
                text = pytesseract.image_to_string(img, lang="eng")
                texts.append(text)
            return "\n".join(texts)

    except Exception as e:
        return f"OCR ERROR (outer): {str(e)}"


def extract_company_or_publisher_with_gpt(text):
    import json

    # 第一阶段：提取公司名称、别名、国家
    primary_prompt = f"""
You are a corporate reporting analyst.

From the first and last pages of a corporate report, extract the following:
1. The **official company name** that issued the report.
2. A list of **other names** referring to the company (abbreviations, group name, acronyms, etc.).
3. The **country** where the company is headquartered.
4. A brief **reasoning** explaining your extraction.

Return a JSON object like this:
{{
  "company_name": "...",
  "other_names": ["...", "..."],
  "country": "...",
  "reasoning": "..."
}}

If the company name is not found, return:
{{
  "company_name": "UNKNOWN",
  "other_names": [],
  "country": "UNKNOWN",
  "reasoning": "No indication of the company in the text."
}}

Text:
{text}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1",  # 更稳定
            messages=[
                {"role": "system", "content": "You are an expert in ESG and corporate reporting."},
                {"role": "user", "content": primary_prompt}
            ],
            temperature=0
        )
        content = response.choices[0].message.content.strip()
        result = json.loads(content)
    except Exception as e:
        return {
            "company_name": "GPT_ERROR",
            "other_names": [],
            "country": "GPT_ERROR",
            "reasoning": str(e),
            "publisher": None
        }
    except json.JSONDecodeError:
        return {
            "company_name": "PARSE_ERROR",
            "other_names": [],
            "country": "PARSE_ERROR",
            "reasoning": f"Raw response: {content}",
            "publisher": None
        }

    # 第二阶段：若无法识别公司名，尝试提取 publisher
    if result.get("company_name", "").upper() == "UNKNOWN":
        try:
            secondary_prompt = f"""
This report was not issued by a company but possibly by a public institution or academic body.

From the following text, extract the **publisher** and return:
{{ "publisher": "..." }}

If not found:
{{ "publisher": "UNKNOWN" }}

Text:
{text}
"""
            response2 = client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=[
                    {"role": "system", "content": "You are an expert in institutional publishing."},
                    {"role": "user", "content": secondary_prompt}
                ],
                temperature=0
            )
            content2 = response2.choices[0].message.content.strip()
            pub_result = json.loads(content2)
            result["publisher"] = pub_result.get("publisher", "UNKNOWN")
        except Exception as e:
            result["publisher"] = f"GPT_ERROR: {str(e)}"
    else:
        result["publisher"] = None

    # 清理字段
    if result.get("company_name", "").upper() == "UNKNOWN":
        result["company_name"] = ""
    if not isinstance(result.get("other_names"), list):
        result["other_names"] = []
    if "country" not in result or not isinstance(result["country"], str):
        result["country"] = "UNKNOWN"

    return result

def process_folder_with_gpt_and_type(pdf_folder, output_path):
    results = []
    pdf_folder = Path(pdf_folder)
    pdf_files = list(pdf_folder.glob("*.pdf"))

    for pdf in tqdm(pdf_files, desc="Classifying reports"):
        text = extract_front_back_text(pdf)
        result = extract_company_or_publisher_with_gpt(text)
        results.append({
            "file_name": pdf.name,
            "company_name": result["company_name"],
            "other_names": "; ".join(result.get("other_names", [])),  # 用分号拼接方便输出
            "publisher": result.get("publisher", None),
            "reasoning": result["reasoning"],
            "country": result["country"],
        })

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    print(f"✅ Saved to {output_path}")
    return df

In [2]:
# 路径替换成你的 PDF 文件夹
# Paths
PDF_DIR = Path("pdf_folder")  # Ensure this path contains your PDFs
OUTPUT_PATH = Path("output/company_name_gpt_results3-2.csv")
process_folder_with_gpt_and_type(PDF_DIR, OUTPUT_PATH)

Classifying reports:  20%|█▉        | 251/1277 [26:46<3:16:28, 11.49s/it]

⚠️ Fallback to OCR on: Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf due to Too short, fallback to OCR.


Classifying reports:  20%|██        | 256/1277 [27:29<3:08:12, 11.06s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  20%|██        | 257/1277 [27:36<2:47:29,  9.85s/it]

⚠️ Fallback to OCR on: Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf due to Too short, fallback to OCR.


Classifying reports:  28%|██▊       | 360/1277 [37:35<1:08:24,  4.48s/it]

⚠️ Fallback to OCR on: Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf due to Too short, fallback to OCR.


Classifying reports:  34%|███▎      | 430/1277 [44:08<1:11:05,  5.04s/it]

⚠️ Fallback to OCR on: Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf due to Too short, fallback to OCR.


Classifying reports:  39%|███▉      | 498/1277 [50:56<1:00:09,  4.63s/it]

⚠️ Fallback to OCR on: Unknown_adp07-sus-fr_95qx6prh.pdf due to Too short, fallback to OCR.


Classifying reports:  53%|█████▎    | 672/1277 [1:10:12<47:11,  4.68s/it]  

⚠️ Fallback to OCR on: Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf due to Too short, fallback to OCR.


Classifying reports:  57%|█████▋    | 724/1277 [1:15:34<1:03:00,  6.84s/it]

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

Classifying reports:  60%|██████    | 771/1277 [1:19:57<33:37,  3.99s/it]  

⚠️ Fallback to OCR on: Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf due to Too short, fallback to OCR.


Classifying reports:  70%|███████   | 896/1277 [1:33:35<32:03,  5.05s/it]  

⚠️ Fallback to OCR on: Hyosung_Corp_SR_2020_en_8g98j6gk.pdf due to Too short, fallback to OCR.


Classifying reports:  76%|███████▋  | 974/1277 [1:42:15<17:42,  3.51s/it]  

⚠️ Fallback to OCR on: Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf due to Too short, fallback to OCR.


Classifying reports:  77%|███████▋  | 987/1277 [1:43:02<19:07,  3.96s/it]

⚠️ Fallback to OCR on: Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf due to Too short, fallback to OCR.


Classifying reports:  87%|████████▋ | 1106/1277 [1:51:45<11:01,  3.87s/it]

⚠️ Fallback to OCR on: EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf due to Too short, fallback to OCR.


Classifying reports:  88%|████████▊ | 1122/1277 [1:52:56<10:50,  4.20s/it]

⚠️ Fallback to OCR on: Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf due to Too short, fallback to OCR.


Classifying reports:  90%|████████▉ | 1149/1277 [1:55:06<08:05,  3.79s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  91%|█████████▏| 1168/1277 [1:56:26<06:31,  3.60s/it]

MuPDF error: format error: No default Layer config



Classifying reports:  95%|█████████▍| 1209/1277 [1:59:29<05:15,  4.64s/it]

⚠️ Fallback to OCR on: Unknown_2023042101335_kyzhtmjn.pdf due to Too short, fallback to OCR.


Classifying reports:  97%|█████████▋| 1239/1277 [2:02:13<03:51,  6.10s/it]

⚠️ Fallback to OCR on: Titan_Company_Ltd_Annual20Report202013_p4r8w07u.pdf due to Too short, fallback to OCR.


Classifying reports:  99%|█████████▉| 1269/1277 [2:04:50<00:28,  3.61s/it]

⚠️ Fallback to OCR on: BASF_SE_2012_BASF_Report_lmq79gwn.pdf due to Too short, fallback to OCR.


Classifying reports: 100%|██████████| 1277/1277 [2:05:18<00:00,  5.89s/it]

✅ Saved to output/company_name_gpt_results3-2.csv





Unnamed: 0,file_name,company_name,other_names,publisher,reasoning,country
0,Unknown_8f57f855-11bb-496d-9916-91ff88cb537b_s...,Paramount Global,Paramount; the Company; ViacomCBS Inc.; Viacom...,,The first page of the report clearly states 'P...,United States
1,Toyota_Industries_Corp_environment2004_40h96hj...,Toyota Industries Corporation,Toyota Industries; Toyota Industries Group; To...,,The official company name 'Toyota Industries C...,Japan
2,Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf,"Knoll, Inc.",Knoll; Knoll Group; KnollStudio; KnollTextiles...,,The first page of the report includes a quote ...,United States
3,Intel_Corp__fwws0wtm.pdf,Intel Corporation,Intel; Intel Foundation,,The first page of the report refers to 'Intel'...,United States
4,Unknown_2020_SEBANG20SUSTAINABILITY20REPORT_EN...,"SEBANG Co., Ltd.",SEBANG; SEBANG Group; SEBANG Express; SEBANG VINA,,"The official company name 'SEBANG Co., Ltd.' i...",South Korea
...,...,...,...,...,...,...
1272,Logwin_AG_CSR_Report_2021_en_rbp4aney.pdf,Logwin AG,Logwin; Logwin Group; Logwin Corporation,,"On page 3, the text states: 'Logwin AG, with h...",Luxembourg
1273,Shanghai_Electric_Group_Co_Ltd_95909_s0uqoqkj.pdf,"Shanghai Electric Group Co., Ltd",Shanghai Electric; Group; Company; we,,The official company name 'Shanghai Electric G...,China
1274,PT_Soho_Global_Health_Tbk_Final_annual_report_...,PT Soho Global Health Tbk,SGH; Soho Global Health; Kelompok Usaha; the G...,,The official company name 'PT Soho Global Heal...,Indonesia
1275,Banner_Corp_32banner-sustainability-report-202...,Banner Group Ltd,Banner Ltd.; Banner; Banner Group; Banner Grou...,,The first page lists 'Banner Ltd.' with a UK a...,United Kingdom


In [3]:
import pandas as pd
from difflib import SequenceMatcher
import re
import ast
import pycountry

# === Step 1: 加载文件 ===
df_pred = pd.read_csv("output/company_name_gpt_results3-2.csv")  # 包含 company_name、other_names
df_true = pd.read_csv("check/matching_gabarito_with_pdfs2.csv")  # 包含 name_2
df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 2: 清洗函数 ===
def _clean_name(name: str) -> str:
    if pd.isna(name):
        return ""
    name = name.lower()
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    suffixes = [
        " company", " companies", " corporation", " incorporated", " corp", " llc", " ltd", " inc", 
        " oyj", " intl", " sa", " lp", " spa", " sanv", " nv", " plc", " nvsa", " ptd", 
        " int", " international", "limited", "group", "the ", " holdings", " co"
    ]
    for suffix in suffixes:
        name = name.replace(suffix, "")
    return re.sub(r"\s+", " ", name).strip()

def fuzzy_match(a, b):
    return SequenceMatcher(None, a, b).ratio()

def normalize_country_name(name):
    if pd.isna(name) or not isinstance(name, str):
        return ""
    name = name.strip().lower()
    try:
        # 先尝试国家简称
        country = pycountry.countries.get(alpha_2=name.upper())
        if country:
            return country.name.lower()
        # 再尝试国家全称
        country = pycountry.countries.search_fuzzy(name)[0]
        return country.name.lower()
    except:
        return name.lower()

def word_overlap_match(name1, name2):
    """返回两个名字中单词交集是否显著（≥1个有效关键词）"""
    stopwords = {"group", "holdings", "company", "limited", "corporation", "inc", "co"}
    words1 = set(re.findall(r"\b\w+\b", name1.lower())) - stopwords
    words2 = set(re.findall(r"\b\w+\b", name2.lower())) - stopwords
    return len(words1 & words2) > 0

# === Step 3: 匹配函数 ===
def get_best_match(row):
    target = _clean_name(row["name_2"])
    main_name = _clean_name(row["company_name"])
    best_score = fuzzy_match(main_name, target)
    best_source = "company_name"
    is_match = best_score >= 0.85

    # 尝试别名匹配
    candidates = []
    if "other_names" in row and pd.notna(row["other_names"]):
        raw = row["other_names"]
        try:
            if isinstance(raw, str):
                if raw.startswith("["):
                    candidates = ast.literal_eval(raw)
                else:
                    candidates = re.split(r"[;,\n]", raw)
            elif isinstance(raw, list):
                candidates = raw
        except Exception:
            candidates = []

    for alt in candidates:
        alt_clean = _clean_name(alt)
        score = fuzzy_match(alt_clean, target)
        if score > best_score:
            best_score = score
            is_match = score >= 0.85
            best_source = f"other_names ({alt.strip()})"

    # 第三阶段：若高分匹配失败，则使用国家宽松匹配 + 单词交集
    country1 = normalize_country_name(row.get("country", ""))
    country2 = normalize_country_name(row.get("loc", ""))
    
    relaxed_match_applied = False
    if not is_match and country1 and country1 == country2:
        if word_overlap_match(row["company_name"], row["name_2"]):
            is_match = True
            best_source = "relaxed_by_country_word_overlap"
            relaxed_match_applied = True

    return pd.Series([
        best_score,
        is_match,
        best_source,
        country1,
        country2,
        relaxed_match_applied
    ], index=[
        "fuzzy_score", "is_correct", "matched_by", 
        "normalized_country_pred", "normalized_country_true", 
        "used_relaxed_match"
    ])

# === Step 4: 应用匹配函数 ===
df_merged[[
    "fuzzy_score", "is_correct", "matched_by", 
    "normalized_country_pred", "normalized_country_true", 
    "used_relaxed_match"
]] = df_merged.apply(get_best_match, axis=1)

# === Step 5: 输出表格 ===
df_eval = df_merged[[
    "filename", "company_name", "other_names", "country", 
    "name_2", "loc", "normalized_country_pred", 
    "normalized_country_true", "fuzzy_score", "is_correct", 
    "matched_by", "used_relaxed_match"
]]
# df_eval.to_csv("eval/company_name_eval_with_country_relaxed.csv", index=False)

# === Step 6: 显示准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率（含国家宽松匹配）：{accuracy:.2%}")

✅ 公司名称提取准确率（含国家宽松匹配）：95.59%
