# üìä GPT-assisted Report Type Classification
This notebook reads PDF reports, extracts front/back pages, and uses OpenAI GPT API to classify the report type and sustainability section presence.

In [1]:
from openai import OpenAI
from pathlib import Path
import fitz  # PyMuPDF
import pandas as pd
from tqdm import tqdm
import json
import os
from dotenv import load_dotenv

# Load OpenAI API key from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Paths
PDF_DIR = Path("pdf_folder")  # Ensure this path contains your PDFs
OUTPUT_PATH = Path("output/report_type_gpt_results.csv")

In [2]:
# ÈáçÊñ∞Âä†ËΩΩÂøÖË¶ÅÊ®°ÂùóÂíåOCRË°•ÊïëÂáΩÊï∞ÔºàÁéØÂ¢ÉÂ∑≤ÈáçÁΩÆÔºâ
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import tempfile
import os

# OCR Ë°•ÊïëÂáΩÊï∞ÔºöÁî®‰∫é‰ªéÈ¶ñÈ°µÂíåÊú´È°µÂõæÂÉèÊèêÂèñÊñáÊú¨
def extract_text_with_ocr(pdf_path, front_n=5, back_n=5, dpi=300):
    try:
        with tempfile.TemporaryDirectory() as path:
            # Â∞Ü PDF È°µËΩ¨Êç¢‰∏∫ÂõæÂÉè
            images = convert_from_path(pdf_path, dpi=dpi, output_folder=path)
            total_pages = len(images)
            selected_pages = []

            for i in range(min(front_n, total_pages)):
                selected_pages.append(images[i])

            for i in range(max(0, total_pages - back_n), total_pages):
                selected_pages.append(images[i])

            # Áî® OCR ÊèêÂèñÊñáÊú¨
            text_parts = [pytesseract.image_to_string(img) for img in selected_pages]
            return "\n".join(text_parts)
    except Exception as e:
        return f"OCR ERROR: {e}"


In [3]:
# Extract front/back page text from a PDF
# def extract_front_back_text(pdf_path, front_n=5, back_n=5):
#     try:
#         doc = fitz.open(pdf_path)
#         texts = []
#         for i in range(min(front_n, len(doc))):
#             texts.append(doc[i].get_text())
#         for i in range(max(0, len(doc) - back_n), len(doc)):
#             texts.append(doc[i].get_text())
#         doc.close()
#         return "\n".join(texts)
#     except Exception as e:
#         return f"ERROR: {e}"
def extract_front_back_text(pdf_path, front_n=5, back_n=5):
    try:
        doc = fitz.open(pdf_path)
        texts = []
        for i in range(min(front_n, len(doc))):
            texts.append(doc[i].get_text())
        for i in range(max(0, len(doc) - back_n), len(doc)):
            texts.append(doc[i].get_text())
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 50:
            raise ValueError("Empty or invalid text, fallback to OCR.")
        return full_text
    except Exception as e:
        print(f"‚ö†Ô∏è PyMuPDF failed on {pdf_path.name}, switching to OCR...")
        return extract_text_with_ocr(pdf_path)

In [4]:
# Use GPT to classify report type and sustainability section
def classify_report_type(text):
    system_prompt = "You are a helpful assistant."
    user_prompt = f"""
Given the following content extracted from a corporate report (first and last pages), classify the report into one of the following types:
- "annual report" (financial content only)
- "sustainability report" (focused on ESG or sustainability)
- "integrated report" (combination of financial and sustainability)
- "other"

Also determine whether the report includes a substantial sustainability section (e.g. with a dedicated chapter or multiple references to sustainability, SDGs, GRI, etc.).

Return a JSON object with the following structure:

{{
  "report_type": "...",
  "has_sustainability_section": true/false
}}

Content:
{text[:8000]}
"""
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0
        )
        reply = response.choices[0].message.content
        parsed = json.loads(reply)
        return parsed.get("report_type", ""), parsed.get("has_sustainability_section", "")
    except Exception as e:
        return f"ERROR: {e}", ""

In [5]:
# Loop through PDFs and classify
pdf_results = []
for pdf_file in tqdm(os.listdir(PDF_DIR)):
    if not pdf_file.endswith(".pdf"):
        continue
    pdf_path = PDF_DIR / pdf_file
    text = extract_front_back_text(pdf_path)
    if text.startswith("ERROR"):
        pdf_results.append({"filename": pdf_file, "report_type": "ERROR", "has_sustainability_section": ""})
        continue
    report_type, sustainability = classify_report_type(text)
    pdf_results.append({
        "filename": pdf_file,
        "report_type": report_type,
        "has_sustainability_section": sustainability
    })

 20%|‚ñà‚ñâ        | 251/1277 [03:12<10:49,  1.58it/s]

‚ö†Ô∏è PyMuPDF failed on Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf, switching to OCR...


 20%|‚ñà‚ñà        | 256/1277 [03:16<13:59,  1.22it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



 20%|‚ñà‚ñà        | 257/1277 [03:17<12:49,  1.33it/s]

‚ö†Ô∏è PyMuPDF failed on Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf, switching to OCR...


 28%|‚ñà‚ñà‚ñä       | 360/1277 [04:41<13:43,  1.11it/s]

‚ö†Ô∏è PyMuPDF failed on Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf, switching to OCR...


 34%|‚ñà‚ñà‚ñà‚ñé      | 430/1277 [05:35<13:34,  1.04it/s]

‚ö†Ô∏è PyMuPDF failed on Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf, switching to OCR...


 39%|‚ñà‚ñà‚ñà‚ñâ      | 498/1277 [06:25<08:51,  1.46it/s]

‚ö†Ô∏è PyMuPDF failed on Unknown_adp07-sus-fr_95qx6prh.pdf, switching to OCR...


 53%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé    | 672/1277 [08:41<08:23,  1.20it/s]

‚ö†Ô∏è PyMuPDF failed on Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf, switching to OCR...


 57%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 724/1277 [09:21<05:30,  1.67it/s]

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

 60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 771/1277 [10:00<05:44,  1.47it/s]

‚ö†Ô∏è PyMuPDF failed on Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf, switching to OCR...


 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 896/1277 [11:33<06:29,  1.02s/it]

‚ö†Ô∏è PyMuPDF failed on Hyosung_Corp_SR_2020_en_8g98j6gk.pdf, switching to OCR...


 76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 974/1277 [12:27<04:34,  1.11it/s]

‚ö†Ô∏è PyMuPDF failed on Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf, switching to OCR...


 77%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã  | 987/1277 [12:36<03:52,  1.25it/s]

‚ö†Ô∏è PyMuPDF failed on Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf, switching to OCR...


 87%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã | 1106/1277 [14:09<02:08,  1.33it/s]

‚ö†Ô∏è PyMuPDF failed on EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf, switching to OCR...


 88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 1122/1277 [14:21<02:05,  1.24it/s]

‚ö†Ô∏è PyMuPDF failed on Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf, switching to OCR...


 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 1149/1277 [14:40<01:24,  1.52it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 1168/1277 [14:53<01:14,  1.45it/s]

MuPDF error: format error: No default Layer config



 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 1209/1277 [15:25<00:46,  1.48it/s]

‚ö†Ô∏è PyMuPDF failed on Unknown_2023042101335_kyzhtmjn.pdf, switching to OCR...


 97%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã| 1239/1277 [15:48<00:29,  1.28it/s]

‚ö†Ô∏è PyMuPDF failed on Titan_Company_Ltd_Annual20Report202013_p4r8w07u.pdf, switching to OCR...


 99%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ| 1269/1277 [16:11<00:05,  1.50it/s]

‚ö†Ô∏è PyMuPDF failed on BASF_SE_2012_BASF_Report_lmq79gwn.pdf, switching to OCR...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1277/1277 [16:17<00:00,  1.31it/s]


In [7]:
# Save results
result_df = pd.DataFrame(pdf_results)
OUTPUT_PATH = Path("output/report_type_gpt_results1.csv")
result_df.to_csv(OUTPUT_PATH, index=False)
result_df.head()

Unnamed: 0,filename,report_type,has_sustainability_section
0,Unknown_8f57f855-11bb-496d-9916-91ff88cb537b_s...,annual report,False
1,Toyota_Industries_Corp_environment2004_40h96hj...,sustainability report,True
2,Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf,sustainability report,True
3,Intel_Corp__fwws0wtm.pdf,integrated report,True
4,Unknown_2020_SEBANG20SUSTAINABILITY20REPORT_EN...,integrated report,True


In [8]:
from pdf2image import convert_from_path
import pytesseract
import os
from tqdm import tqdm
import warnings

# Áî®Êà∑ÁõÆÂΩïËÆæÁΩÆ
pdf_dir = "/Users/zhangjingyu/Desktop/ËØæ‰ª∂/IRP/starting/pdf_folder"
output_dir = os.path.join(pdf_dir, "ocr_outputs")
os.makedirs(output_dir, exist_ok=True)

# Tesseract ÈÖçÁΩÆÔºàÂèØÈÄâÔºöÈôêÂà∂ËØ≠Ë®Ä‰∏∫Ëã±ÊñáÔºâ
ocr_config = "--psm 1 -l eng"

# Â§ÑÁêÜÂ§±Ë¥•ÁöÑ PDF ÂàóË°®Ôºà‰Ω†Êèê‰æõÁöÑÔºâ
failed_files = [
    "Lincoln_Electric_Holdings_Inc_le-air-treatment-catalogue-eng_8cadb8rq.pdf",
    "SATS_Ltd_sats-sustainability-report_88g98j6g.pdf",
    "Indusind_BankIndia_Business-Responsibility-and-Sustainability-Report-BRSR-FY2021-22_xw1dg5fb.pdf",
    "Simona_AG_SIMONA_2021_GB21_englisch_54emvime.pdf",
    "Unknown_2020-2021-corporate-responsibility-report_91a20q9k.pdf",
    "Grasim_Industries_Ltd_Business20Responsibility20and20Sustainability20Report_3yg2omk9.pdf",
    "Nichols_PLC_2020-Nichols-plc-AR_wulbuhbx.pdf",
    "Unknown_2021_Target_Corporate-Responsibility-Report_z0za8brg.pdf",
    "Bollore_SE_publication_344_en_5159im40.pdf",
    "Sitowise_Group_Oyj_sitowise-annual-report-sustainability-report-2021pdf_x563vwz2.pdf",
    "Unknown_2023042704942_wzv2kg92.pdf"
]

# OCR ‰∏ªÁ®ãÂ∫è
for fname in tqdm(failed_files, desc="OCR Processing"):
    pdf_path = os.path.join(pdf_dir, fname)
    output_path = os.path.join(output_dir, fname.replace(".pdf", ".txt"))

    try:
        images = convert_from_path(pdf_path, dpi=300)
        text_all = []

        for img in images[:10]:  # ÊéßÂà∂È°µÊï∞ÔºåÂèØÊîπ‰∏∫ [:5]+[-5:] ‰ªÖËØªÂâçÂêéÈ°µ
            text = pytesseract.image_to_string(img, config=ocr_config)
            text_all.append(text)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write("\n".join(text_all))

    except Exception as e:
        warnings.warn(f"‚ö†Ô∏è OCR failed for {fname}: {e}")

OCR Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 424.78it/s]


In [10]:
from pdf2image import convert_from_path
import pytesseract

path = "/Users/zhangjingyu/Desktop/ËØæ‰ª∂/IRP/starting/pdf_folder/Bollore_SE_publication_344_en_5159im40.pdf"
images = convert_from_path(path, dpi=300)
texts = [pytesseract.image_to_string(img) for img in images[:3]]  # ‰ªÖÊµãÂâç3È°µ
print("\n--- PAGE TEXT ---\n", texts[0][:1000])  # ÊâìÂç∞Ââç1000Â≠óÁ¨¶


--- PAGE TEXT ---
 Bollor√©

Tour Bollor√©

31-32, quai de Dion-Bouton
92811 Puteaux Cedex ‚Äî France
Tel.: + 33 (0)1 46 96 44 33

Fax: + 33 (0)1 46 96 44 22

www.bollore.com

Bollore\

00
fe}
oO
ia
KE
or
O
at
Ww
or
a
<
=)
Z
Z
<

BOLLORE

Annual report
Bollore

2008

Bollore\

