In [None]:
import fitz
import pytesseract
from pdf2image import convert_from_path
import tempfile
import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
from openai import OpenAI
from dotenv import load_dotenv
import json

# Load OpenAI API key from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


# 提取 PDF 前后页文本
def extract_front_back_text(pdf_path, front_n=5, back_n=5, dpi=400):
    try:
        doc = fitz.open(str(pdf_path))
        texts = []
        for i in range(min(front_n, len(doc))):
            texts.append(doc[i].get_text())
        for i in range(max(0, len(doc) - back_n), len(doc)):
            texts.append(doc[i].get_text())
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 100:
            raise ValueError("Too short, fallback to OCR.")
        return full_text
    except Exception as e:
        print(f"⚠️ Fallback to OCR on: {pdf_path.name} due to {str(e)}")
        return extract_text_with_ocr(pdf_path, front_n, back_n, dpi)

def extract_text_with_ocr(pdf_path, front_n=5, back_n=5, dpi=400):
    import warnings
    from pdf2image.exceptions import PDFPageCountError
    try:
        with tempfile.TemporaryDirectory() as path:
            try:
                images = convert_from_path(str(pdf_path), dpi=dpi, output_folder=path)
            except PDFPageCountError as e:
                return f"OCR ERROR: PDF structure invalid – {str(e)}"
            except Exception as e:
                return f"OCR ERROR: {str(e)}"

            total_pages = len(images)
            if total_pages == 0:
                return "OCR ERROR: No images extracted"

            selected = images[:front_n] + images[-back_n:]
            texts = []
            for img in selected:
                img = img.convert("L")  # 灰度增强
                text = pytesseract.image_to_string(img, lang="eng")
                texts.append(text)
            return "\n".join(texts)

    except Exception as e:
        return f"OCR ERROR (outer): {str(e)}"

# 用 GPT 提取公司名称

def extract_company_or_publisher_with_gpt(text):
    import json

    # 第一阶段：尝试识别公司名称
    primary_prompt = f"""
You are a corporate reporting analyst.

You are given the first and last pages of a report. Your task:
1. Identify the official **company name** that issued the report.
2. Briefly explain your reasoning (e.g., where you found the name, any indicative phrasing).

Return your answer as a JSON object with the following format:
{{
  "company_name": "...",
  "reasoning": "..."
}}

If no company name is clearly found, return:
{{
  "company_name": "UNKNOWN",
  "reasoning": "No clear indication of the issuing company in the provided text."
}}

Text:
{text}
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",  # 更稳
            messages=[
                {"role": "system", "content": "You are an expert in ESG and corporate reporting."},
                {"role": "user", "content": primary_prompt}
            ],
            temperature=0
        )
        content = response.choices[0].message.content.strip()
        result = json.loads(content)
    except Exception as e:
        return {
            "company_name": "GPT_ERROR",
            "reasoning": str(e),
            "publisher": None
        }
    except json.JSONDecodeError:
        return {
            "company_name": "PARSE_ERROR",
            "reasoning": f"Raw response: {content}",
            "publisher": None
        }

    # 第二阶段：如果公司名未知，则继续识别 publisher
    if result.get("company_name", "").upper() == "UNKNOWN":
        try:
            secondary_prompt = f"""
You are analyzing a report that is **not issued by a company**. Instead, it may be from a government agency, international organization, academic institution, or other publisher.

From the following text, identify the **publisher** or issuing organization of this report.

Return only the publisher name as a JSON object:
{{ "publisher": "..." }}

If no organization is found, return:
{{ "publisher": "UNKNOWN" }}

Text:
{text}
"""
            response2 = client.chat.completions.create(
                model="gpt-4.1-mini",  # 或 gpt-4.1
                messages=[
                    {"role": "system", "content": "You are an expert in academic and institutional publishing."},
                    {"role": "user", "content": secondary_prompt}
                ],
                temperature=0
            )
            content2 = response2.choices[0].message.content.strip()
            pub_result = json.loads(content2)
            result["publisher"] = pub_result.get("publisher", "UNKNOWN")
        except Exception as e:
            result["publisher"] = f"GPT_ERROR: {str(e)}"

    else:
        result["publisher"] = None  # 如果识别出公司名，不需要 publisher

    # ✅ 将 UNKNOWN 替换为空字符串
    if result.get("company_name", "").upper() == "UNKNOWN":
        result["company_name"] = ""

        
    return result
# 单文件处理
def extract_company_from_pdf(pdf_path):
    text = extract_front_back_text(pdf_path)
    company = extract_company_or_publisher_with_gpt(text)
    return company

def process_folder_with_gpt_and_type(pdf_folder,output_path):
    results = []
    pdf_folder = Path(pdf_folder)
    pdf_files = list(pdf_folder.glob("*.pdf"))

    for pdf in tqdm(pdf_files, desc="Classifying reports"):
        text = extract_front_back_text(pdf)
        result = extract_company_or_publisher_with_gpt(text)
        results.append({
            "file_name": pdf.name,
            "company_name": result["company_name"],
            "publisher": result.get("publisher", None),
            "reasoning": result["reasoning"]
        })

    df = pd.DataFrame(results)
    df.to_csv(output_path, index=False)
    print(f"✅ Saved to {output_path}")
    return df

In [8]:
# 路径替换成你的 PDF 文件夹
# Paths
PDF_DIR = Path("pdf_folder")  # Ensure this path contains your PDFs
OUTPUT_PATH = Path("output/company_name_gpt_results1.csv")
process_folder_with_gpt_and_type(PDF_DIR, OUTPUT_PATH)

Classifying reports:   0%|          | 0/1277 [00:00<?, ?it/s]

Classifying reports:  20%|█▉        | 251/1277 [10:07<50:16,  2.94s/it]  

⚠️ Fallback to OCR on: Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf due to Too short, fallback to OCR.


Classifying reports:  20%|██        | 256/1277 [10:17<36:37,  2.15s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  20%|██        | 257/1277 [10:20<37:43,  2.22s/it]

⚠️ Fallback to OCR on: Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf due to Too short, fallback to OCR.


Classifying reports:  28%|██▊       | 360/1277 [14:21<37:07,  2.43s/it]

⚠️ Fallback to OCR on: Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf due to Too short, fallback to OCR.


Classifying reports:  34%|███▎      | 430/1277 [17:01<35:08,  2.49s/it]

⚠️ Fallback to OCR on: Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf due to Too short, fallback to OCR.


Classifying reports:  39%|███▉      | 498/1277 [19:41<31:18,  2.41s/it]

⚠️ Fallback to OCR on: Unknown_adp07-sus-fr_95qx6prh.pdf due to Too short, fallback to OCR.


Classifying reports:  53%|█████▎    | 672/1277 [27:23<20:29,  2.03s/it]

⚠️ Fallback to OCR on: Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf due to Too short, fallback to OCR.


Classifying reports:  57%|█████▋    | 724/1277 [29:24<19:06,  2.07s/it]

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

Classifying reports:  60%|██████    | 771/1277 [31:06<18:42,  2.22s/it]

⚠️ Fallback to OCR on: Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf due to Too short, fallback to OCR.


Classifying reports:  70%|███████   | 896/1277 [36:19<13:57,  2.20s/it]

⚠️ Fallback to OCR on: Hyosung_Corp_SR_2020_en_8g98j6gk.pdf due to Too short, fallback to OCR.


Classifying reports:  76%|███████▋  | 974/1277 [39:32<13:05,  2.59s/it]

⚠️ Fallback to OCR on: Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf due to Too short, fallback to OCR.


Classifying reports:  77%|███████▋  | 987/1277 [40:03<11:26,  2.37s/it]

⚠️ Fallback to OCR on: Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf due to Too short, fallback to OCR.


Classifying reports:  87%|████████▋ | 1106/1277 [44:58<08:42,  3.06s/it]

⚠️ Fallback to OCR on: EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf due to Too short, fallback to OCR.


Classifying reports:  88%|████████▊ | 1122/1277 [45:35<04:51,  1.88s/it]

⚠️ Fallback to OCR on: Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf due to Too short, fallback to OCR.


Classifying reports:  90%|████████▉ | 1149/1277 [46:32<04:41,  2.20s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



Classifying reports:  91%|█████████▏| 1168/1277 [47:29<04:18,  2.37s/it]

MuPDF error: format error: No default Layer config



Classifying reports:  95%|█████████▍| 1209/1277 [49:16<02:39,  2.35s/it]

⚠️ Fallback to OCR on: Unknown_2023042101335_kyzhtmjn.pdf due to Too short, fallback to OCR.


Classifying reports:  97%|█████████▋| 1239/1277 [50:28<01:38,  2.58s/it]

⚠️ Fallback to OCR on: Titan_Company_Ltd_Annual20Report202013_p4r8w07u.pdf due to Too short, fallback to OCR.


Classifying reports:  99%|█████████▉| 1269/1277 [52:06<00:20,  2.58s/it]

⚠️ Fallback to OCR on: BASF_SE_2012_BASF_Report_lmq79gwn.pdf due to Too short, fallback to OCR.


Classifying reports: 100%|██████████| 1277/1277 [52:21<00:00,  2.46s/it]

✅ Saved to output/company_name_gpt_results1.csv





Unnamed: 0,file_name,company_name,publisher,reasoning
0,Unknown_8f57f855-11bb-496d-9916-91ff88cb537b_s...,Paramount Global,,The official company name 'Paramount Global' i...
1,Toyota_Industries_Corp_environment2004_40h96hj...,Toyota Industries Corporation,,The company name 'Toyota Industries Corporatio...
2,Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf,"Knoll, Inc.",,"The company name 'Knoll, Inc.' is explicitly m..."
3,Intel_Corp__fwws0wtm.pdf,Intel Corporation,,The report repeatedly references 'Intel' and '...
4,Unknown_2020_SEBANG20SUSTAINABILITY20REPORT_EN...,"SEBANG Co., Ltd.",,"The company name 'SEBANG Co., Ltd.' is explici..."
...,...,...,...,...
1272,Logwin_AG_CSR_Report_2021_en_rbp4aney.pdf,Logwin AG,,The company name 'Logwin AG' is explicitly men...
1273,Shanghai_Electric_Group_Co_Ltd_95909_s0uqoqkj.pdf,"Shanghai Electric Group Co., Ltd",,"The company name 'Shanghai Electric Group Co.,..."
1274,PT_Soho_Global_Health_Tbk_Final_annual_report_...,PT Soho Global Health Tbk,,The company name 'PT Soho Global Health Tbk' i...
1275,Banner_Corp_32banner-sustainability-report-202...,Banner Group Ltd,,The company name 'Banner Group Ltd' is explici...


In [None]:
import pandas as pd
from difflib import SequenceMatcher
import re

# === 设置路径 ===
pred_path = "output/company_name_gpt_results1.csv"
true_path = "check/matching_gabarito_with_pdfs.csv"

# === Step 1: 读取提取结果与标注结果 ===
df_pred = pd.read_csv(pred_path)
df_true = pd.read_csv(true_path)

# === Step 2: 标准化文件名并对齐 ===
df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

# === Step 3: 合并两个表格 ===
df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 4: 定义标准化与模糊匹配函数 ===

def _clean_name(name: str) -> str:
    name = name.lower()
    name = name.replace(".com", "")
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    name = "".join(
        name.replace(" company", "")
        .replace(" companies", "")
        .replace(" corporation", "")
        .replace(" incorporated", "")
        .replace("a shares", "")
        .replace(" corp", "")
        .replace(" llc", "")
        .replace(" ltd", "")
        .replace(" inc", "")
        .replace(" oyj", "")
        .replace(" intl", "")
        .replace(" sa", "")
        .replace(" lp", "")
        .replace(" spa", "")
        .replace(" sanv", "")
        .replace(" nv", "")
        .replace(" plc", "")
        .replace(" nvsa", "")
        .replace(" ptd", "")
        .replace(" int", "")
        .replace(" international", "")
        .replace("int l", "")
        .replace("limited", "")
        .replace("group", "")
        .replace(" ", "")
        .replace("  ", "")
        .replace("the ", "")
        .replace("é", "e")
        .replace(" holdings", "")
        .removesuffix(" co")
    )
    name = re.sub(r"[^a-zA-Z0-9]", "", name)
    return name

def fuzzy_match(a, b):
    return SequenceMatcher(None, _clean_name(a), _clean_name(b)).ratio()

# === Step 5: 计算匹配得分和是否正确 ===
df_merged["fuzzy_score"] = df_merged.apply(lambda row: fuzzy_match(row["company_name"], row["name_2"]), axis=1)
df_merged["is_correct"] = df_merged["fuzzy_score"] >= 0.85  # 匹配阈值可调

# === Step 6: 仅输出有用列 ===
df_eval = df_merged[["filename", "company_name", "name_2", "fuzzy_score", "is_correct"]]

# === Step 7: 计算总体准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率: {accuracy:.2%}")

# 可选：保存输出
df_eval.to_csv("output/company_name_eval_results.csv", index=False)

✅ 公司名称提取准确率: 55.39%


In [8]:
import pandas as pd
from difflib import SequenceMatcher

# === 设置路径 ===
pred_path = "output/full_pipeline_results.csv"
true_path = "check/matching_gabarito_with_pdfs.csv"

# === Step 1: 读取提取结果与标注结果 ===
df_pred = pd.read_csv(pred_path)
df_true = pd.read_csv(true_path)

# === Step 2: 标准化文件名并对齐 ===
df_pred["filename"] = df_pred["filename"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

# === Step 3: 合并两个表格 ===
df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 4: 定义标准化与模糊匹配函数 ===
def clean_name(name):
    if pd.isna(name):
        return ""
    return str(name).lower().replace("ltd", "").replace("inc", "").replace(".", "").replace(",", "").strip()

def fuzzy_match(a, b):
    return SequenceMatcher(None, clean_name(a), clean_name(b)).ratio()

# === Step 5: 计算匹配得分和是否正确 ===
df_merged["fuzzy_score"] = df_merged.apply(lambda row: fuzzy_match(row["company_name"], row["name_2"]), axis=1)
df_merged["is_correct"] = df_merged["fuzzy_score"] >= 0.85  # 匹配阈值可调

# === Step 6: 仅输出有用列 ===
df_eval = df_merged[["filename", "company_name", "name_2", "fuzzy_score", "is_correct"]]

# === Step 7: 计算总体准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"✅ 公司名称提取准确率: {accuracy:.2%}")

# 可选：保存输出
# df_eval.to_csv("output/company_name_eval_results.csv", index=False)

✅ 公司名称提取准确率: 55.39%


In [3]:
import pandas as pd
from difflib import SequenceMatcher
import re

# === 设置路径 ===
pred_path = "output/company_name_gpt_results1.csv"
true_path = "check/matching_gabarito_with_pdfs.csv"

# === Step 1: 读取提取结果与标注结果 ===
df_pred = pd.read_csv(pred_path)
df_true = pd.read_csv(true_path)

# === Step 2: 标准化文件名并对齐 ===
df_pred["filename"] = df_pred["file_name"].str.strip()
df_true["filename"] = df_true["pdf_path"].str.strip()

# === Step 3: 合并两个表格 ===
df_merged = pd.merge(df_pred, df_true, on="filename", how="inner")

# === Step 4: 定义标准化与模糊匹配函数 ===

def _clean_name(name: str) -> str:
    name = name.lower()
    name = name.replace(".com", "")
    name = re.sub(r"\([^()]*\)", "", name)
    name = re.sub(r"[^\w\d\s]", "", name)
    name = "".join(
        name.replace(" company", "")
        .replace(" companies", "")
        .replace(" corporation", "")
        .replace(" incorporated", "")
        .replace("a shares", "")
        .replace(" corp", "")
        .replace(" llc", "")
        .replace(" ltd", "")
        .replace(" inc", "")
        .replace(" oyj", "")
        .replace(" intl", "")
        .replace(" sa", "")
        .replace(" lp", "")
        .replace(" spa", "")
        .replace(" sanv", "")
        .replace(" nv", "")
        .replace(" plc", "")
        .replace(" nvsa", "")
        .replace(" ptd", "")
        .replace(" int", "")
        .replace(" international", "")
        .replace("int l", "")
        .replace("limited", "")
        .replace("group", "")
        .replace(" ", "")
        .replace("  ", "")
        .replace("the ", "")
        .replace("é", "e")
        .replace(" holdings", "")
        .removesuffix(" co")
    )
    name = re.sub(r"[^a-zA-Z0-9]", "", name)

    return name
# def clean_name(name):
#     if pd.isna(name):
#         return ""
#     return str(name).lower().replace("ltd", "").replace("inc", "").replace(".", "").replace(",", "").strip()

def fuzzy_match(a, b):
    return SequenceMatcher(None, _clean_name(a), _clean_name(b)).ratio()

# === Step 5: 计算匹配得分和是否正确 ===
df_merged["fuzzy_score"] = df_merged.apply(lambda row: fuzzy_match(row["company_name"], row["name_2"]), axis=1)
df_merged["is_correct"] = df_merged["fuzzy_score"] >= 0.85  # 匹配阈值可调

# === Step 6: 仅输出有用列 ===
df_eval = df_merged[["filename", "company_name", "name_2", "fuzzy_score", "is_correct"]]

# === Step 7: 计算总体准确率 ===
accuracy = df_eval["is_correct"].mean()
print(f"accuracy of name extraction: {accuracy:.2%}")

# 可选：保存输出
df_eval.to_csv("output/company_name_eval_clean.csv", index=False)


accuracy of name extraction: 72.06%
