PDF Text  ──┐
            │
            ▼
extract_text_from_pdf()
            │
            ▼
GPT Classification + Structural Inference (via API)
            │
            ▼
Auto-Generated Labeled Data (text + label)
            │
            ▼
Fine-tune Classifier ( BERT / RoBERTa)
            │
            ▼
Fast Local Classification Deployment

In [1]:
# Automatically classify PDFs using GPT and generate fine-tuned training datasets (compatible with OpenAI >= 1.0.0)

import os
import fitz  # PyMuPDF
import json
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI  # ✅ New version calling method

# ✅ Load API key
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")


client = OpenAI(api_key=api_key)

    


In [2]:
from pdf2image import convert_from_path
import pytesseract
from PIL import Image
import tempfile
import os
# OCR Recovery Function: Used to extract text from the first and last page images
def extract_text_with_ocr(pdf_path, front_n=10, back_n=10, dpi=300):
    try:
        with tempfile.TemporaryDirectory() as path:
            # Convert PDF pages to images
            images = convert_from_path(pdf_path, dpi=dpi, output_folder=path)
            total_pages = len(images)
            selected_pages = []

            for i in range(min(front_n, total_pages)):
                selected_pages.append(images[i])

            for i in range(max(0, total_pages - back_n), total_pages):
                selected_pages.append(images[i])

            # Extract text using OCR
            text_parts = [pytesseract.image_to_string(img) for img in selected_pages]
            return "\n".join(text_parts)
    except Exception as e:
        return f"OCR ERROR: {e}"

In [3]:
def extract_front_back_text(pdf_path, front_n=5, back_n=5):
    try:
        doc = fitz.open(pdf_path)
        texts = []
        for i in range(min(front_n, len(doc))):
            texts.append(doc[i].get_text())
        for i in range(max(0, len(doc) - back_n), len(doc)):
            texts.append(doc[i].get_text())
        doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 50:
            raise ValueError("Empty or invalid text, fallback to OCR.")
        return full_text
    except Exception as e:
        print(f"⚠️ PyMuPDF failed on {pdf_path.name}, switching to OCR...")
        return extract_text_with_ocr(pdf_path)

In [4]:


label_set = {"annual report", "sustainability report", "integrated report", "other"}

def classify_with_gpt_v4(text):
    prompt = f"""
The following text is extracted from a corporate report (first and last few pages).

Your tasks:

1. Classify the report into one of the following types:
- "sustainability report": The document primarily focuses on ESG, sustainability, CSR, or GRI/SDGs-related topics. It does NOT contain full audited financial statements.
- "annual report": The document primarily contains audited financial disclosures, such as income statements, cash flow statements, balance sheets, and auditor's reports. ESG content, if any, is limited or supplementary.
- "integrated report": The document includes BOTH complete financial disclosures and structured sustainability content within the same document.
- "other": The document does not meet the criteria for any of the above (e.g., regulatory compliance reports, environmental approvals, investor brochures, etc.)

2. Important judgment rules:
- Do NOT classify a "Corporate Social Responsibility Report", "Corporate Responsibility Report", or "CSR Report" as "integrated report" unless it also includes full audited financials.
- If the title contains "Annual Report" but the content lacks actual financial statements, do NOT classify it as "annual" or "integrated".
- References or summaries of financials are NOT sufficient — only classify based on complete financial statement inclusion.

3. If the document contains a distinct sustainability section (e.g. a chapter on ESG/CSR/GRI/SDGs content that constitutes more than 50% of a section), mark `has_sustainability_section = true`, and extract the section name if available.

Return a JSON object with the following fields:
{{
  "reasoning": "...",
  "report_type": "...",
  "has_sustainability_section": true/false,
  "sustainability_section_name": "..."
}}

Content:
{text}
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",  # Or gpt-4.1 when available
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0
        )
        reply = response.choices[0].message.content
        parsed = json.loads(reply)
        return parsed
    except Exception as e:
        return {"error": str(e)}
    

# ✅ Batch-Generate Training Data
def generate_training_data(pdf_folder="pdf_folder", output_jsonl="output/bert_training_data.jsonl", max_files=50):
    full_records = []  # Store complete records for CSV
    training_records = []  # Store text + label (only includes report_type)

    pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]
    pdf_files = pdf_files[:max_files]

    for fname in tqdm(pdf_files, desc="Generating training data"):
        fpath = os.path.join(pdf_folder, fname)
        text = extract_front_back_text(fpath)
        if text.startswith("ERROR"):
            continue

        label_info = classify_with_gpt_v4(text)
        if "error" in label_info:
            continue

        # ✅ Save two versions
        full_records.append({
            "filename": fname,
            "text": text,
            **label_info  # reasoning, report_type, etc.
        })

        training_records.append({
            "text": text,
            "label": label_info["report_type"]
        })

    # ✅ Save training JSONL (contains only text + label)
    os.makedirs(os.path.dirname(output_jsonl), exist_ok=True)
    with open(output_jsonl, "w") as f:
        for r in training_records:
            json.dump(r, f)
            f.write("\n")


    df = pd.DataFrame(full_records)
    df.to_csv("output/gpt_classification_preview.csv", index=False)

    print(f"✅ 训练数据已保存:\n- JSONL: {output_jsonl}\n- CSV: output/gpt_classification_preview.csv")

    return df

df = generate_training_data()
df.head()

Generating training data: 100%|██████████| 50/50 [13:27<00:00, 16.14s/it]

✅ 训练数据已保存:
- JSONL: output/bert_training_data.jsonl
- CSV: output/gpt_classification_preview.csv





Unnamed: 0,filename,text,reasoning,report_type,has_sustainability_section,sustainability_section_name
0,Unknown_8f57f855-11bb-496d-9916-91ff88cb537b_s...,...,The document is a Form 10-K filed with the U.S...,annual report,False,
1,Toyota_Industries_Corp_environment2004_40h96hj...,Social & Environmental Report\n2004\n\nC\nO\nN...,The document is titled 'Social & Environmental...,sustainability report,True,Social & Environmental Report 2004
2,Knoll_Inc_Knoll_Enviro_2008_gqetdkb7.pdf,Knoll and Sustainable Design\n2008 Environment...,"The document is titled '2008 Environmental, He...",sustainability report,True,"Environmental, Health & Safety"
3,Intel_Corp__fwws0wtm.pdf,2010 Corporate Responsibility Report\n\nOn the...,The document is titled '2010 Corporate Respons...,sustainability report,True,"Environmental Factors, Social Factors, Governa..."
4,Unknown_2020_SEBANG20SUSTAINABILITY20REPORT_EN...,This report is printed on FSC® (Forest Steward...,The document is titled '2020 SEBANG Sustainabi...,sustainability report,True,"Sustainable Structure, Our Society, Environmen..."


In [5]:
# output/comb_results.csv

import pandas as pd

# ==== 1. Set File Path ====
human_label_path = "check/pdf_classification.xlsx"
gpt_result_path = "output/gpt_classification_preview.csv"
df_human = pd.read_excel(human_label_path)
df_gpt = pd.read_csv(gpt_result_path)

# ==3. Standardize column names to ensure filename and report_type exist
df_human.columns = [col.strip().lower() for col in df_human.columns]
df_gpt.columns = [col.strip().lower() for col in df_gpt.columns]

# ===4. Merge two tables
df_merged = pd.merge(df_human, df_gpt, left_on="file_name",
    right_on="filename",
    how="left")
df_merged = df_merged.drop_duplicates(subset="file_name", keep="first")

# ==== 6. Compare two classification results ====
df_merged["report_type_human"] = df_merged["report_type_human"].str.strip().str.lower()
df_merged["report_type_gpt"] = df_merged["report_type"].str.strip().str.lower()
df_merged["is_correct"] = df_merged["report_type_human"] == df_merged["report_type_gpt"]

# ===7. Calculate the accuracy rate==
accuracy = df_merged["is_correct"].mean()
print(f"accuracy: {accuracy:.2%}")


import re

def clean_illegal_excel_chars(text):
    if isinstance(text, str):
        return re.sub(r"[\x00-\x1F\x7F]", "", text)
    return text

# Remove invalid characters
df_merged_cleaned = df_merged.applymap(clean_illegal_excel_chars)


df_merged_cleaned.to_excel("pipeline_eval/7/classification_comparison.xlsx", index=False)
print("✅ Saved to classification_comparison.xlsx")
# df_merged.to_excel("pipeline_eval/7/classification_comparison.xlsx", index=False)
# print("saved to classification_comparison_result.xlsx")

import sklearn
from sklearn.metrics import classification_report, confusion_matrix

# === 仅保留非空对比项（防止空值影响）===
valid = df_merged[~df_merged["report_type_human"].isna() & ~df_merged["report_type_gpt"].isna()]

# === 打印 classification report ===
report = classification_report(
    valid["report_type_human"], 
    valid["report_type_gpt"], 
    digits=3, 
    output_dict=False  # 改为 True 可返回字典
)
print("📊 detailed \n")
print(report)

# === 可视化混淆矩阵（可选）===
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(valid["report_type_human"], valid["report_type_gpt"], labels=valid["report_type_human"].unique())
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=valid["report_type_human"].unique(),
            yticklabels=valid["report_type_human"].unique())
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix- Classification Comparison")
plt.tight_layout()
plt.show()

accuracy: 11.11%
✅ Saved to classification_comparison.xlsx


  df_merged_cleaned = df_merged.applymap(clean_illegal_excel_chars)


📊 detailed 

                       precision    recall  f1-score   support

        annual report      0.667     1.000     0.800         2
    integrated report      1.000     0.500     0.667         2
                other      1.000     1.000     1.000         1
sustainability report      1.000     1.000     1.000         5

             accuracy                          0.900        10
            macro avg      0.917     0.875     0.867        10
         weighted avg      0.933     0.900     0.893        10



ModuleNotFoundError: No module named 'seaborn'

In [9]:
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset as TorchDataset
from torch.optim import AdamW  # Import AdamW from torch.optim
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)
from datasets import Dataset, load_dataset
import os
from tqdm import tqdm


import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

# ========== 1. Data Preprocessing and Preparation ==========
def prepare_training_data(jsonl_file, test_size=0.2, random_state=42):
    """
    Prepare training data, including loading, splitting, and label encoding
    """
    # Load JSONL data
    data = []
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    
    # Convert to DataFrame
    df = pd.DataFrame(data)
    
    # Check label distribution
    print("Label distribution:")
    print(df['label'].value_counts())
    
    # Create label mapping
    label_list = sorted(df['label'].unique())
    label2id = {label: idx for idx, label in enumerate(label_list)}
    id2label = {idx: label for label, idx in label2id.items()}
    
    # Add numeric labels
    df['label_id'] = df['label'].map(label2id)
    
    # Split train and test set
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df['label_id']
    )
    
    return train_df, test_df, label2id, id2label

# ========== 2. Custom Dataset Class ==========
class ReportDataset(TorchDataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# ========== 3. Simplified Model Training ==========
def simple_train_model(train_df, test_df, label2id, id2label, 
                      model_name="bert-base-uncased", output_dir="./fine_tuned_model"):
    """
    Train model using simplified method
    """
    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", 
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id
    )
    
    # Create dataset
    train_dataset = ReportDataset(
        train_df['text'].tolist(),
        train_df['label_id'].tolist(),
        tokenizer
    )
    
    test_dataset = ReportDataset(
        test_df['text'].tolist(),
        test_df['label_id'].tolist(),
        tokenizer
    )
    
    # Create dataloader
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)
    
    # Set device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    # Set optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    total_steps = len(train_loader) * 4  # 4 epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    
    # Training loop
    model.train()
    for epoch in range(4):
        total_loss = 0
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/4")
        
        for batch in progress_bar:
            # Move to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Forward pass
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            
            # Backward pass
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")
    
    # Evaluate model
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            _, preds = torch.max(outputs.logits, dim=1)
            
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(labels.cpu().tolist())
    
    # Calculate evaluation metrics - fix class mismatch issue
    print("Classification report:")
    
    # Get actual labels
    unique_labels = np.unique(true_labels + predictions)
    
    # Create corresponding label names
    target_names = [id2label[label_id] for label_id in sorted(unique_labels)]
    
    # Generate classification report
    print(classification_report(true_labels, predictions, target_names=target_names, labels=sorted(unique_labels)))
    
    # Save model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    
    return model, tokenizer

# ========== 4. PDF Text Extraction Function ==========
def extract_text_from_pdf(pdf_path, front_n=5, back_n=5):
    """
    Extract text from the first and last few pages of a PDF file
    """
    try:
        import fitz  # PyMuPDF
        doc = fitz.open(pdf_path)
        total = len(doc)
        text = ""
        for i in range(min(front_n, total)):
            text += doc[i].get_text()
        for i in range(max(0, total - back_n), total):
            text += doc[i].get_text()
        return text.strip()
    except Exception as e:
        return f"ERROR extracting {pdf_path}: {e}"

# ========== 5. Local Classifier Deployment ==========
class ReportClassifier:
    def __init__(self, model_path):
        """
        Load trained model and tokenizer
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)
        self.model.eval()
        
        # Get label mapping
        self.id2label = self.model.config.id2label
        self.label2id = self.model.config.label2id
    
    def classify(self, text, return_confidence=False):
        """
        Classify a single text
        """
        # Preprocess text
        inputs = self.tokenizer(
            text, 
            return_tensors="pt", 
            truncation=True, 
            padding=True, 
            max_length=512
        )
        
        # Move to device
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        
        # Prediction
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        # Get prediction result
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        pred_id = torch.argmax(probs, dim=1).item()
        confidence = probs[0][pred_id].item()
        label = self.id2label[pred_id]
        
        if return_confidence:
            return label, confidence
        return label
    
    def classify_pdf(self, pdf_path, front_n=5, back_n=5):
        """
        Directly classify a PDF file
        """
        # Extract text
        text = extract_text_from_pdf(pdf_path, front_n, back_n)
        
        # Classification
        label, confidence = self.classify(text, return_confidence=True)
        
        return {
            "filename": os.path.basename(pdf_path),
            "label": label,
            "confidence": confidence,
            "text_preview": text[:500] + "..." if len(text) > 500 else text
        }
    
    def batch_classify_pdfs(self, pdf_folder, output_csv="output/local_classification_results.csv", max_files=100):
        """
        Batch classify PDF files in a folder, up to max_files files
        """
        pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]
        
        # Only process the first max_files files
        pdf_files = pdf_files[:max_files]
        
        results = []
        for fname in tqdm(pdf_files, desc=f"Classifying PDF files (up to {max_files})"):
            fpath = os.path.join(pdf_folder, fname)
            result = self.classify_pdf(fpath)
            results.append(result)
        
        # Save results
        df = pd.DataFrame(results)
        df.to_csv(output_csv, index=False)
        print(f"Results saved to: {output_csv}")
        
        return df

# ========== 6. Complete Training and Deployment Pipeline ==========
def train_and_deploy_pipeline(jsonl_path, model_output_dir="./fine_tuned_model"):
    """
    Complete training and deployment pipeline
    """
    # 1. Prepare data
    print("Preparing training data...")
    train_df, test_df, label2id, id2label = prepare_training_data(jsonl_path)
    
    # 2. Train model
    print("Training model...")
    model, tokenizer = simple_train_model(
        train_df, test_df, label2id, id2label, output_dir=model_output_dir
    )
    
    # 3. Deploy model
    print("Deploying model...")
    classifier = ReportClassifier(model_output_dir)
    
    # Test classifier
    test_text = "This is a sample annual report with financial statements and auditor's report."
    result = classifier.classify(test_text, return_confidence=True)
    print(f"Test classification: {result}")
    
    return classifier

# ========== 7. Main Execution Function ==========
if __name__ == "__main__":
    # Set paths
    jsonl_path = "/Users/zhangjingyu/Desktop/课件/IRP/starting/output/bert_training_data.jsonl"
    model_dir = "./fine_tuned_report_classifier"
    pdf_folder = "pdf_folder"  
    
    # Ensure output directory exists
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs("output", exist_ok=True)
    
    # Execute full pipeline
    classifier = train_and_deploy_pipeline(jsonl_path, model_dir)
    
    # Use trained model to batch classify PDFs, process first 100 files
    print("Classifying PDFs with trained model (first 100)...")
    results_df = classifier.batch_classify_pdfs(pdf_folder, max_files=1500)
    
    print("Pipeline completed!")

准备训练数据...
标签分布:
label
sustainability report    29
annual report             8
integrated report         7
other                     3
Name: count, dtype: int64
训练模型...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/4: 100%|██████████| 5/5 [00:36<00:00,  7.31s/it, loss=1.21]


Epoch 1, Average Loss: 1.4021


Epoch 2/4: 100%|██████████| 5/5 [00:34<00:00,  6.91s/it, loss=1.19] 


Epoch 2, Average Loss: 1.1812


Epoch 3/4: 100%|██████████| 5/5 [00:34<00:00,  6.86s/it, loss=0.9]  


Epoch 3, Average Loss: 1.1299


Epoch 4/4: 100%|██████████| 5/5 [00:40<00:00,  8.14s/it, loss=1.02]


Epoch 4, Average Loss: 1.1007
分类报告:
                       precision    recall  f1-score   support

        annual report       0.00      0.00      0.00         2
    integrated report       0.00      0.00      0.00         1
                other       0.00      0.00      0.00         1
sustainability report       0.60      1.00      0.75         6

             accuracy                           0.60        10
            macro avg       0.15      0.25      0.19        10
         weighted avg       0.36      0.60      0.45        10



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


部署模型...
测试分类: ('sustainability report', 0.416676789522171)
使用训练好的模型分类PDF (前100个)...


分类PDF文件 (最多1500个):  20%|██        | 256/1278 [01:36<04:31,  3.77it/s]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



分类PDF文件 (最多1500个):  57%|█████▋    | 725/1278 [03:58<02:41,  3.42it/s]

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

分类PDF文件 (最多1500个):  90%|████████▉ | 1150/1278 [22:56<00:34,  3.66it/s]   

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



分类PDF文件 (最多1500个):  91%|█████████▏| 1169/1278 [23:01<00:25,  4.23it/s]

MuPDF error: format error: No default Layer config



分类PDF文件 (最多1500个): 100%|██████████| 1278/1278 [23:29<00:00,  1.10s/it]

结果已保存到: output/local_classification_results.csv
流程完成!





In [8]:
import pandas as pd
!pip install openpyxl

# ==== 1. Set file paths ====
human_label_path = "check/pdf_classification.xlsx"
gpt_result_path = "output/local_classification_results.csv"

# ==== 2. Read files ====
df_human = pd.read_excel(human_label_path)
df_gpt = pd.read_csv(gpt_result_path)

# ==== 3. Normalize column names, ensure filename and report_type exist ====
df_human.columns = [col.strip().lower() for col in df_human.columns]
df_gpt.columns = [col.strip().lower() for col in df_gpt.columns]

# ==== 4. Merge two tables ====
df_merged = pd.merge(df_human, df_gpt, left_on="file_name",
    right_on="filename",
    how="left")
df_merged = df_merged.drop_duplicates(subset="file_name", keep="first")

# ==== 6. Compare two classification results ====
df_merged["report_type_human"] = df_merged["report_type_human"].str.strip().str.lower()
df_merged["report_type_gpt"] = df_merged["label"].str.strip().str.lower()
df_merged["is_correct"] = df_merged["report_type_human"] == df_merged["report_type_gpt"]

# ==== 7. Calculate accuracy ====
accuracy = df_merged["is_correct"].mean()
print(f"accuracy: {accuracy:.2%}")

# ==== 8. Export comparison results (optional) ====
import re

def clean_illegal_excel_chars(text):
    if isinstance(text, str):
        return re.sub(r"[\x00-\x1F\x7F]", "", text)
    return text

# Clean illegal characters
df_merged_cleaned = df_merged.applymap(clean_illegal_excel_chars)

# Export again
# df_merged_cleaned.to_excel("pipeline_eval/7/classification_comparison.xlsx", index=False)
print("✅ Saved to classification_comparison.xlsx")
# df_merged.to_excel("pipeline_eval/7/classification_comparison.xlsx", index=False)
# print("saved to classification_comparison_result.xlsx")

import sklearn
from sklearn.metrics import classification_report, confusion_matrix

# === Only keep non-empty comparison items (to avoid influence of null values) ===
valid = df_merged[~df_merged["report_type_human"].isna() & ~df_merged["report_type_gpt"].isna()]

# === Print classification report ===
report = classification_report(
    valid["report_type_human"], 
    valid["report_type_gpt"], 
    digits=3, 
    output_dict=False  # Set to True to return dict
)
print("📊 detailed \n")
print(report)

# === Visualize confusion matrix (optional) ===
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(valid["report_type_human"], valid["report_type_gpt"], labels=valid["report_type_human"].unique())
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=valid["report_type_human"].unique(),
            yticklabels=valid["report_type_human"].unique())
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


accuracy: 51.85%
✅ Saved to classification_comparison.xlsx
📊 detailed 

                       precision    recall  f1-score   support

        annual report      0.000     0.000     0.000         7
    integrated report      0.000     0.000     0.000        16
                other      0.000     0.000     0.000         2
sustainability report      0.627     1.000     0.771        42

             accuracy                          0.627        67
            macro avg      0.157     0.250     0.193        67
         weighted avg      0.393     0.627     0.483        67



  df_merged_cleaned = df_merged.applymap(clean_illegal_excel_chars)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ModuleNotFoundError: No module named 'seaborn'