In [4]:
import os
import base64
import tempfile
from pathlib import Path
from pdf2image import convert_from_path
import openai
import pandas as pd
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI


# Load environment variables from the .env file, then build the OpenAI client
# from the OPENAI_API_KEY environment variable.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Prompt sent together with every page image to the vision model.
# The literal "NOT FOUND" sentinel matters: extract_report_year_from_pdf
# checks the reply for it (case-insensitively) to decide whether to try
# the next page.
vision_prompt = """
This image is from the first few pages of a corporate report.

Please identify the time period that the report covers, based on any visible text, charts, tables, body text, headers, footers, or any other visible elements in the image.
As far as you can, also find the month and day.


Return examples like:
- “April 2023 – March 2024”
- “Fiscal Year Ending 31 March 2023”
- “2022”
- "31 December 2022 to 30 June 2023"

If no such date range is visible, respond with “NOT FOUND”.
Return only the most specific date range or fiscal year visible.
"""




def encode_image_to_base64(image_path):
    """Read the file at ``image_path`` in binary mode and return it as base64 text."""
    handle = open(image_path, "rb")
    try:
        raw_bytes = handle.read()
    finally:
        handle.close()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")

def call_gpt4_vision(image_path):
    """Ask the GPT-4o vision model which reporting period a page image shows.

    The image file is base64-encoded and sent as a PNG data URL together with
    the module-level ``vision_prompt``. The request goes through the
    module-level ``client`` (built from ``OPENAI_API_KEY``) instead of the
    implicit default client of the ``openai`` module, so the explicitly
    configured key is the one used and a stray ``openai.api_key`` assignment
    elsewhere in the notebook cannot redirect the call.

    Args:
        image_path: Path of the image file to send.

    Returns:
        The stripped reply text, ``"NOT FOUND"`` when the reply carries no
        text, or ``"ERROR: <message>"`` when the API call raised.

    Raises:
        Whatever ``encode_image_to_base64`` raises for an unreadable file;
        encoding happens before the guarded API call.
    """
    b64_image = encode_image_to_base64(image_path)
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": [
                    {"type": "text", "text": vision_prompt},
                    {"type": "image_url", "image_url": {
                        "url": f"data:image/png;base64,{b64_image}"
                    }},
                ]},
            ],
            max_tokens=300,
        )
        content = response.choices[0].message.content
        # A reply without text would otherwise crash on .strip() and be
        # reported as an error; treat it as "nothing found on this page".
        if not content or not content.strip():
            return "NOT FOUND"
        return content.strip()
    except Exception as e:
        return f"ERROR: {e}"

def extract_report_year_from_pdf(pdf_path, max_pages=3):
    """Return the reporting period found on the first pages of a PDF.

    Pages 1..``max_pages`` are rendered at 300 dpi into a temporary
    directory and sent one by one to ``call_gpt4_vision``. The first reply
    that is neither an error nor a "not found" answer is returned.

    A failed API call on one page (a reply starting with ``"ERROR:"``) no
    longer ends the search: the remaining pages are still tried, and the
    error is only reported if no page produced a usable answer.

    Args:
        pdf_path: Path of the PDF file.
        max_pages: Number of leading pages to inspect.

    Returns:
        The model's answer, ``"ERROR: <message>"`` if rendering failed or
        every usable attempt errored, otherwise ``"NOT FOUND"``.
    """
    last_error = None
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            images = convert_from_path(
                pdf_path, dpi=300, first_page=1, last_page=max_pages, output_folder=tmpdir
            )
            stem = Path(pdf_path).stem
            for page_number, img in enumerate(images, start=1):
                img_path = Path(tmpdir) / f"{stem}_page{page_number}.png"
                img.save(img_path)
                result = call_gpt4_vision(img_path)
                if result.startswith("ERROR:"):
                    # Remember the failure but give the next page a chance.
                    last_error = result
                    continue
                if "not found" not in result.lower():
                    return result
        except Exception as e:
            return f"ERROR: {e}"
    return last_error if last_error is not None else "NOT FOUND"

def batch_process_pdf_folder(pdf_folder, output_csv, max_pages=5):
    """Extract the reporting period of every PDF in a folder and save a CSV.

    Args:
        pdf_folder: Directory containing the PDF files (non-recursive).
        output_csv: Destination CSV path; its parent directory is created
            if it does not exist.
        max_pages: Number of leading pages inspected per PDF.

    The CSV has the columns ``filename`` and ``report_year_vision``.
    """
    # Create the output directory before the (potentially hours-long) loop,
    # so the run cannot fail at the very end and lose every result.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Filter first so the progress bar counts PDFs only, and sort so the
    # processing order does not depend on the file system.
    pdf_names = sorted(
        name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
    )

    results = []
    for filename in tqdm(pdf_names):
        pdf_path = os.path.join(pdf_folder, filename)
        result = extract_report_year_from_pdf(pdf_path, max_pages)
        results.append({
            "filename": filename,
            "report_year_vision": result
        })

    # Explicit columns keep the header present even when no PDF was found.
    df = pd.DataFrame(results, columns=["filename", "report_year_vision"])
    df.to_csv(output_csv, index=False)
    print(f"‚úÖ Saved results to {output_csv}")


In [5]:
# Run the vision-only extraction over the PDF folder and write the CSV.
if __name__ == "__main__":
    folder_path = "pdf_folder"  # Replace with the path to your local PDF folder
    output_csv = "output/report_years_extracted_new.csv"
    batch_process_pdf_folder(folder_path, output_csv, max_pages=3)


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1279/1279 [5:47:13<00:00, 16.29s/it] 

‚úÖ Saved results to output/report_years_extracted_new.csv





In [3]:
import pandas as pd

# Load the human annotations and the extraction results
anno_df = pd.read_excel("check/rfyear_annotation.xlsx")
result_df = pd.read_csv("output/full_pipeline_results.csv")

# Normalise the file-name columns (trim whitespace, lower-case) to use as join keys
anno_df["pdf_name_clean"] = anno_df["pdf_name"].str.strip().str.lower()
result_df["filename_clean"] = result_df["filename"].str.strip().str.lower()

# Left-join the results onto the annotations: every annotated row is kept,
# and rows without a matching result get NaN in the result columns
merged_df = pd.merge(
    anno_df,
    result_df,
    left_on="pdf_name_clean",
    right_on="filename_clean",
    how="left"
)

# Year comparison helper
def match_year(human, gpt):
    """Classify a (human label, model output) pair.

    Returns "missing" when either value is null according to ``pd.isna``;
    otherwise compares both values as stripped, lower-cased strings and
    returns "match" or "mismatch".
    """
    if any(pd.isna(value) for value in (human, gpt)):
        return "missing"
    left, right = (str(value).strip().lower() for value in (human, gpt))
    if left == right:
        return "match"
    return "mismatch"

# Add the per-row comparison result ("missing" / "match" / "mismatch")
merged_df["match_result"] = merged_df.apply(
    lambda row: match_year(row["chosen_rfyear"], row["report_year_vision"]),
    axis=1
)

# Keep only the columns needed for review
final_df = merged_df[[
    "pdf_name", "chosen_rfyear", "report_year_vision", "match_result"
]]

# Save the comparison table
final_df.to_csv("rfyear_comparison_results_minimal.csv", index=False)

# Print the summary counts per comparison result
print("‚úÖ Â∑≤‰øùÂ≠òÁÆÄÊ¥ÅÂØπÊØîÁªìÊûú‰∏∫ rfyear_comparison_results_minimal.csv")
print("üìä ÂåπÈÖçÊÉÖÂÜµÁªüËÆ°Ôºö")
print(final_df["match_result"].value_counts())

‚úÖ Â∑≤‰øùÂ≠òÁÆÄÊ¥ÅÂØπÊØîÁªìÊûú‰∏∫ rfyear_comparison_results_minimal.csv
üìä ÂåπÈÖçÊÉÖÂÜµÁªüËÆ°Ôºö
match_result
missing     386
mismatch    137
match        78
Name: count, dtype: int64


In [6]:
import pandas as pd

# Load the human annotations and the vision-only extraction results
anno_df = pd.read_excel("check/rfyear_annotation.xlsx")
result_df = pd.read_csv("output/report_years_extracted_new.csv")

# Normalise the file-name columns (trim whitespace, lower-case) to use as join keys
anno_df["pdf_name_clean"] = anno_df["pdf_name"].str.strip().str.lower()
result_df["filename_clean"] = result_df["filename"].str.strip().str.lower()

# Left-join the results onto the annotations: every annotated row is kept,
# and rows without a matching result get NaN in the result columns
merged_df = pd.merge(
    anno_df,
    result_df,
    left_on="pdf_name_clean",
    right_on="filename_clean",
    how="left"
)

# Year comparison helper
def match_year(human, gpt):
    """Classify a (human label, model output) pair.

    Returns "missing" when either value is null according to ``pd.isna``;
    otherwise compares both values as stripped, lower-cased strings and
    returns "match" or "mismatch".
    """
    if any(pd.isna(value) for value in (human, gpt)):
        return "missing"
    left, right = (str(value).strip().lower() for value in (human, gpt))
    if left == right:
        return "match"
    return "mismatch"

# Add the per-row comparison result ("missing" / "match" / "mismatch")
merged_df["match_result"] = merged_df.apply(
    lambda row: match_year(row["chosen_rfyear"], row["report_year_vision"]),
    axis=1
)

# Keep only the columns needed for review
final_df = merged_df[[
    "pdf_name", "chosen_rfyear", "report_year_vision", "match_result"
]]

# Save the comparison table
final_df.to_csv("rfyear_comparison_results_minimal.csv", index=False)

# Print the summary counts per comparison result
print("‚úÖ Â∑≤‰øùÂ≠òÁÆÄÊ¥ÅÂØπÊØîÁªìÊûú‰∏∫ rfyear_comparison_results_minimal.csv")
print("üìä ÂåπÈÖçÊÉÖÂÜµÁªüËÆ°Ôºö")
print(final_df["match_result"].value_counts())

‚úÖ Â∑≤‰øùÂ≠òÁÆÄÊ¥ÅÂØπÊØîÁªìÊûú‰∏∫ rfyear_comparison_results_minimal.csv
üìä ÂåπÈÖçÊÉÖÂÜµÁªüËÆ°Ôºö
match_result
missing     386
mismatch    215
Name: count, dtype: int64


In [3]:
import os
import fitz  # PyMuPDF
import openai
import pytesseract
import tempfile
import base64
import json
from pdf2image import convert_from_path
from PIL import Image
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI


# Load environment variables from the .env file and build the OpenAI client
# from OPENAI_API_KEY. No key is assigned in the notebook itself: the former
# `openai.api_key = "sk-..."` placeholder put a (fake) credential into the
# source and overrode the environment key for any module-level `openai.*` call.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def extract_front_back_text(pdf_path, front_n=20, back_n=20):
    """Return the embedded text of the first and last pages of a PDF.

    Args:
        pdf_path: Path of the PDF file.
        front_n: Number of leading pages to read.
        back_n: Number of trailing pages to read.

    Returns:
        The page texts joined by newlines. Each page is included once, even
        when the document is shorter than ``front_n + back_n`` pages and the
        two ranges overlap. If PyMuPDF fails or yields fewer than 50
        non-blank characters, the result of ``extract_text_with_ocr`` is
        returned instead.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            front = range(min(front_n, page_count))
            back = range(max(0, page_count - back_n), page_count)
            # dict.fromkeys removes overlapping indices while keeping order.
            page_indices = list(dict.fromkeys([*front, *back]))
            texts = [doc[i].get_text() for i in page_indices]
        finally:
            # Close the document even when a page fails to extract.
            doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 50:
            raise ValueError("Too little text, fallback to OCR.")
        return full_text
    except Exception:
        # Deliberate best effort: any extraction problem falls back to OCR.
        return extract_text_with_ocr(pdf_path)
    

def extract_text_with_ocr(pdf_path, dpi=300):
    """OCR the first three and last three pages of a PDF with Tesseract.

    Args:
        pdf_path: Path of the PDF file.
        dpi: Resolution used when rendering pages to images.

    Returns:
        The recognised page texts joined by newlines. For documents with
        fewer than six pages the two ranges overlap; each page is still
        recognised only once.
    """
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(pdf_path, dpi=dpi, output_folder=path)
        total = len(images)
        head = range(min(3, total))
        tail = range(max(0, total - 3), total)
        # Order-preserving de-duplication of the overlapping page indices.
        page_indices = list(dict.fromkeys([*head, *tail]))
        text_parts = [pytesseract.image_to_string(images[i]) for i in page_indices]
        return "\n".join(text_parts)
    
def build_report_year_prompt(text):
    """Return the text-extraction prompt with ``text`` appended as the excerpt.

    The fixed instructions ask for a JSON object with the keys
    ``report_year`` and ``source``, using ``null`` / "NOT FOUND" when no
    reporting period is present.
    """
    instructions = """
You are an assistant helping extract the reporting year or fiscal year from the following corporate report excerpt.

1. Look for expressions such as:
   - "For the year ended 31 March 2022"
   - "Reporting period: April 2021 ‚Äì March 2022"
   - "FY2020", etc.
2. If found, return this JSON:
{
  "report_year": "April 2021 ‚Äì March 2022",
  "source": "Page 2, main text"
}

If not found:
{
  "report_year": null,
  "source": "NOT FOUND"
}

Report text:
"""
    return f"{instructions}{text}\n"


def encode_image_to_base64(pil_image):
    """Return ``pil_image`` encoded as PNG and then as base64 text.

    The image is written to an in-memory buffer. The previous version saved
    to a ``NamedTemporaryFile`` and re-opened it by name while it was still
    open, which is not portable (it fails on Windows) and costs a disk
    round trip per page.

    Args:
        pil_image: Object with a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The base64-encoded PNG bytes as a ``str``.
    """
    import io  # local import: `io` is not among this cell's top-level imports

    buffer = io.BytesIO()
    pil_image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
        
def extract_year_from_vision(pdf_path, client, page_limit=3):
    """Ask the GPT-4o vision model for the reporting period, page by page.

    Args:
        pdf_path: Path of the PDF file.
        client: OpenAI client used for the chat-completion calls.
        page_limit: Number of leading pages to send.

    Returns:
        ``{"report_year": <reply>, "source": "Vision page N"}`` for the first
        page with a usable reply, otherwise
        ``{"report_year": None, "source": "Vision NOT FOUND"}``.
    """
    # Render only the pages that are actually sent; rasterising a whole
    # report at 300 dpi just to use its first few pages is slow and
    # memory-hungry.
    images = convert_from_path(pdf_path, dpi=300, first_page=1, last_page=page_limit)
    for i, img in enumerate(images[:page_limit]):
        b64 = encode_image_to_base64(img)
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "user", "content": [
                    # The sentinel is requested explicitly because the check
                    # below relies on it to move on to the next page.
                    {"type": "text", "text": (
                        f"Please extract the fiscal year or reporting period from page {i+1}. "
                        "If none is visible, reply with NOT FOUND."
                    )},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
                ]}
            ],
            max_tokens=300
        )
        # The reply content can be None; treat that like an empty answer.
        result = response.choices[0].message.content or ""
        if result.strip() and "not found" not in result.lower():
            return {"report_year": result.strip(), "source": f"Vision page {i+1}"}
    return {"report_year": None, "source": "Vision NOT FOUND"}

def extract_report_year(pdf_path, client):
    """Extract the reporting period: text model first, vision model as fallback.

    The text of the first/last pages is sent to ``gpt-4.1-mini`` with the
    prompt from ``build_report_year_prompt``. The JSON object in the reply
    is parsed and returned if its ``report_year`` is non-empty; in every
    other case ``extract_year_from_vision`` is used.

    The reply is parsed with ``json.loads`` only. The text between the first
    ``{`` and the last ``}`` is taken, so a reply wrapped in Markdown code
    fences is accepted, and model output is never passed to ``eval`` (the
    reply is derived from untrusted PDF content).

    Args:
        pdf_path: Path of the PDF file.
        client: OpenAI client used for the chat-completion calls.

    Returns:
        A dict with at least the keys ``report_year`` and ``source``.
    """
    text = extract_front_back_text(pdf_path)
    prompt = build_report_year_prompt(text)
    try:
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=300
        )
        content = response.choices[0].message.content or ""
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object in model reply")
        parsed = json.loads(content[start:end + 1])
        if isinstance(parsed, dict) and parsed.get("report_year"):
            return parsed
    except Exception:
        # Deliberate best effort: API or parsing problems fall through to
        # the vision fallback below.
        pass
    return extract_year_from_vision(pdf_path, client)

In [4]:
from pathlib import Path
import pandas as pd

pdf_dir = "pdf_folder"  # Replace with the path to your PDF folder
results_csv = Path("results/extracted_report_years_mini.csv")
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Materialise and sort the file list so tqdm can show a total/ETA and the
# processing order is reproducible.
pdf_files = sorted(Path(pdf_dir).glob("*.pdf"))

# Create the output directory before the long-running loop so the run
# cannot fail at the final to_csv and lose every result.
results_csv.parent.mkdir(parents=True, exist_ok=True)

results = []
for pdf in tqdm(pdf_files):
    try:
        out = extract_report_year(str(pdf), client)
        out["filename"] = pdf.name
        results.append(out)
    except Exception as e:
        # Record the failure for this file and continue with the next one.
        results.append({"filename": pdf.name, "report_year": None, "source": f"ERROR: {e}"})

df = pd.DataFrame(results)
df.to_csv(results_csv, index=False)

256it [11:29,  1.59s/it]

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed

MuPDF error: format error: cmsOpenProfileFromMem failed



724it [36:16,  2.99s/it]

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: object is not a stream

MuPDF error: format error: obje

835it [40:33,  2.04s/it]

: 

In [None]:
import os
import fitz  # PyMuPDF
import openai
import pytesseract
import tempfile
import base64
import json
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm
from pdf2image import convert_from_path
from PIL import Image
from openai import OpenAI

# ========== 1. Initialise OpenAI ==========
# Load environment variables from .env and build the client from OPENAI_API_KEY.
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# ========== 2. Extract text: first and last pages ==========
def extract_front_back_text(pdf_path, front_n=10, back_n=10):
    """Return the embedded text of the first and last pages of a PDF.

    Args:
        pdf_path: Path of the PDF file, as ``str`` or ``Path``.
        front_n: Number of leading pages to read.
        back_n: Number of trailing pages to read.

    Returns:
        The page texts joined by newlines. Each page is included once, even
        when the document is shorter than ``front_n + back_n`` pages. If
        PyMuPDF fails or yields fewer than 50 non-blank characters, a
        warning is printed and the result of ``extract_text_with_ocr`` is
        returned instead.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            page_count = len(doc)
            front = range(min(front_n, page_count))
            back = range(max(0, page_count - back_n), page_count)
            # dict.fromkeys removes overlapping indices while keeping order.
            page_indices = list(dict.fromkeys([*front, *back]))
            texts = [doc[i].get_text() for i in page_indices]
        finally:
            # Close the document even when a page fails to extract.
            doc.close()
        full_text = "\n".join(texts)
        if len(full_text.strip()) < 50:
            raise ValueError("Too little text")
        return full_text
    except Exception:
        # Callers pass the path as a plain string, so `pdf_path.name` raised
        # AttributeError inside this handler and the OCR fallback never ran.
        # Path(...) accepts both str and Path.
        print(f"‚ö†Ô∏è PyMuPDF failed on {Path(pdf_path).name}, switching to OCR...")
        return extract_text_with_ocr(pdf_path)

# ========== 3. OCR fallback ==========
def extract_text_with_ocr(pdf_path, dpi=300):
    """OCR the first three and last three pages of a PDF with Tesseract.

    Args:
        pdf_path: Path of the PDF file.
        dpi: Resolution used when rendering pages to images.

    Returns:
        The recognised page texts joined by newlines. For documents with
        fewer than six pages the two ranges overlap; each page is still
        recognised only once.
    """
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(pdf_path, dpi=dpi, output_folder=path)
        total = len(images)
        head = range(min(3, total))
        tail = range(max(0, total - 3), total)
        # Order-preserving de-duplication of the overlapping page indices.
        page_indices = list(dict.fromkeys([*head, *tail]))
        text_parts = [pytesseract.image_to_string(images[i]) for i in page_indices]
        return "\n".join(text_parts)

# ========== 4. Build the prompt ==========
def build_report_year_prompt(text):
    """Return the text-extraction prompt with ``text`` appended as the excerpt.

    The fixed instructions ask for a JSON object with the keys
    ``report_year`` and ``source``, using ``null`` / "NOT FOUND" when no
    reporting period is present.
    """
    instructions = """
You are an assistant helping extract the reporting year or fiscal year from the following corporate report excerpt.

1. Look for expressions such as:
   - "For the year ended 31 March 2022"
   - "Reporting period: April 2021 ‚Äì March 2022"
   - "FY2020", etc.
2. If found, return this JSON:
{
  "report_year": "April 2021 ‚Äì March 2022",
  "source": "Page 2, main text"
}

If not found:
{
  "report_year": null,
  "source": "NOT FOUND"
}

Report text:
"""
    return f"{instructions}{text}\n"

# ========== 5. Vision model fallback ==========
def encode_image_to_base64(pil_image):
    """Return ``pil_image`` encoded as PNG and then as base64 text.

    The image is written to an in-memory buffer. The previous version saved
    to a ``NamedTemporaryFile`` and re-opened it by name while it was still
    open, which is not portable (it fails on Windows) and costs a disk
    round trip per page.

    Args:
        pil_image: Object with a PIL-style ``save(fp, format=...)`` method.

    Returns:
        The base64-encoded PNG bytes as a ``str``.
    """
    import io  # local import: `io` is not among this cell's top-level imports

    buffer = io.BytesIO()
    pil_image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

def extract_year_from_vision(pdf_path, client, page_limit=3):
    """Ask the GPT-4o vision model for the reporting period, page by page.

    Args:
        pdf_path: Path of the PDF file.
        client: OpenAI client used for the chat-completion calls.
        page_limit: Number of leading pages to send.

    Returns:
        ``{"report_year": <reply>, "source": "Vision page N"}`` for the first
        page with a usable reply;
        ``{"report_year": None, "source": "Vision ERROR: <message>"}`` if
        rendering or an API call raised; otherwise
        ``{"report_year": None, "source": "Vision NOT FOUND"}``.
    """
    try:
        # Render only the pages that are actually sent instead of the whole
        # document.
        images = convert_from_path(pdf_path, dpi=200, first_page=1, last_page=page_limit)
        for i, img in enumerate(images[:page_limit]):
            b64 = encode_image_to_base64(img)
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "user", "content": [
                        # The sentinel is requested explicitly because the
                        # check below relies on it to move to the next page.
                        {"type": "text", "text": (
                            f"Please extract the fiscal year or reporting period from page {i+1}. "
                            "If none is visible, reply with NOT FOUND."
                        )},
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}
                    ]}
                ],
                max_tokens=300
            )
            # The reply content can be None; treat that like an empty answer
            # rather than reporting it as an error.
            result = response.choices[0].message.content or ""
            if result.strip() and "not found" not in result.lower():
                return {"report_year": result.strip(), "source": f"Vision page {i+1}"}
    except Exception as e:
        return {"report_year": None, "source": f"Vision ERROR: {e}"}
    return {"report_year": None, "source": "Vision NOT FOUND"}

# ========== 6. Main function: text first, then Vision ==========
def extract_report_year(pdf_path, client):
    """Extract the reporting period: text model first, vision model as fallback.

    The text of the first/last pages is sent to ``gpt-4.1-mini`` with the
    prompt from ``build_report_year_prompt``. The JSON object in the reply
    is parsed and returned if its ``report_year`` is non-empty; in every
    other case ``extract_year_from_vision`` is used.

    The reply is parsed with ``json.loads`` only. The text between the first
    ``{`` and the last ``}`` is taken, so a reply wrapped in Markdown code
    fences is accepted, and model output is never passed to ``eval`` (the
    reply is derived from untrusted PDF content).

    Args:
        pdf_path: Path of the PDF file.
        client: OpenAI client used for the chat-completion calls.

    Returns:
        A dict with at least the keys ``report_year`` and ``source``.
    """
    try:
        text = extract_front_back_text(pdf_path)
        prompt = build_report_year_prompt(text)
        response = client.chat.completions.create(
            model="gpt-4.1-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=300
        )
        content = response.choices[0].message.content or ""
        start, end = content.find("{"), content.rfind("}")
        if start == -1 or end < start:
            raise ValueError("no JSON object in model reply")
        parsed = json.loads(content[start:end + 1])
        if isinstance(parsed, dict) and parsed.get("report_year"):
            return parsed
    except Exception as e:
        # Include the cause: this handler also covers text-extraction and
        # parsing failures, not only failed API calls.
        print(f"‚ö†Ô∏è GPT-4.1-mini failed on {Path(pdf_path).name}, fallback to Vision... ({type(e).__name__}: {e})")
    return extract_year_from_vision(pdf_path, client)
from joblib import Parallel, delayed

# Worker function executed for each PDF by the thread pool
def process_pdf(pdf_path):
    """Run ``extract_report_year`` for one PDF and tag the result with its file name.

    Uses the module-level ``client``. Any exception is converted into a
    record whose ``source`` is ``"ERROR: <message>"`` and whose
    ``report_year`` is ``None``.
    """
    try:
        record = extract_report_year(str(pdf_path), client)
        record["filename"] = pdf_path.name
        return record
    except Exception as exc:
        return {"filename": pdf_path.name, "report_year": None, "source": f"ERROR: {exc}"}

def batch_extract_years_multithread(pdf_dir, output_csv="results/extracted_report_years_mini.csv", n_jobs=2):
    """Process every ``*.pdf`` in ``pdf_dir`` with a thread pool and save a CSV.

    Files are handled in sorted order by ``process_pdf`` using ``n_jobs``
    joblib workers (thread preference). The parent directory of
    ``output_csv`` is created when missing.
    """
    source_dir = Path(pdf_dir)
    pdf_files = sorted(source_dir.glob("*.pdf"))
    os.makedirs(Path(output_csv).parent, exist_ok=True)

    # Run the per-file worker in parallel threads
    runner = Parallel(n_jobs=n_jobs, prefer="threads")
    progress = tqdm(pdf_files, desc="üìÑ Processing PDFs")
    tasks = (delayed(process_pdf)(pdf) for pdf in progress)
    results = runner(tasks)

    pd.DataFrame(results).to_csv(output_csv, index=False)
    print(f"\n‚úÖ Extraction complete! Results saved to: {output_csv}")

# ========== 8. Run ==========
# Process the PDFs in "pdf_folder" with the default output path and worker count.
if __name__ == "__main__":
    batch_extract_years_multithread("pdf_folder")  



‚ö†Ô∏è GPT-4.1-mini failed on AAK_AB_aak-sustainability-report-2012-2013_8y6o3xp0.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on 888_Holdings_888_Holdings_PLC_Annual_Report__Accounts_2022_4ya4gkz2.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on ABC_Holdings_Limited_P020160316609968328069_w68f32zb.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on AFLAC_Inc_2013-csr-report-final_oxgizar1.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on AIA_Engineering_Ltd_ANNUALREPORT19_20_7ta9u15v.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on AIA_Group_Ltd_aia-annual-report-2015-eng_6dpw11ja.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on AIB_Group_PLC_annual-financial-report-2010_nzf6dpw1.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on AIB_Group_PLC_annual-report-2013_w2uuexld.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on AKSA_A40C80A20FF2492CAC2C9CD3A2967D88_u2o5uhav.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on AKSA_Aksa-Dogalgaz-Annual-Report-2019_w9j206fd.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on AMETEK_Inc_86f51b22-fde1-4f26-afc9-5a60c2a24431_fd0pue9s.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on Acast_AB_acast-annual-report-2021_5hv0716n.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on Accell_Group_NV_Heerenveen_AnualReport2014_dfllq4qw.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on Access_Co_Ltd_Sustainability_Report_2022_4phm0wik.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on Armstrong_Flooring_Inc_SustainabilityReport-2020_kot54emv.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on Arvind_Ltd_Arvind_AR_2022-23_0_iwp4673c.pdf, fallback to Vision...



üìÑ Processing PDFs:   9%|‚ñâ         | 114/1277 [05:14<13:53,  1.40it/s]

‚ö†Ô∏è GPT-4.1-mini failed on BASF_SE_2012_BASF_Report_lmq79gwn.pdf, fallback to Vision...


[A

‚ö†Ô∏è GPT-4.1-mini failed on Boryung_Corporation_EBB3B4EBA0B920ECA780EC868DEAB080EB8AA5EAB2BDEC9881EBB3B4EAB3A0EC849CEC9881EBACB8_ebpit5lz.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on EKI_Energy_Services_Limited_69298543284_zj7y1tjh.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on Hansae_Yes24_Holdings_Co_Ltd_HANSAE20YES2420HOLDINGS20ESG20REPORT202022_th5kzsfk.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on Home_Inns__Hotels_Management_Inc_Barclays_Bank_PLC_Annual_Report_202014_5lj1epic.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on Hyosung_Corp_SR_2020_en_8g98j6gk.pdf, fallback to Vision...



üìÑ Processing PDFs:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 920/1277 [28:21<08:11,  1.38s/it]

‚ö†Ô∏è GPT-4.1-mini failed on Tam_Jai_International_Co_Ltd_2022083101184_go5rbp4a.pdf, fallback to Vision...


[A

‚ö†Ô∏è GPT-4.1-mini failed on Unknown_2014SustainRpt_FNL_lr_7mrwsfm7.pdf, fallback to Vision...





‚ö†Ô∏è GPT-4.1-mini failed on Unknown_2023042101335_kyzhtmjn.pdf, fallback to Vision...


üìÑ Processing PDFs:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 1174/1277 [34:38<04:43,  2.75s/it][A

‚ö†Ô∏è GPT-4.1-mini failed on Unknown_23076_Whitbread_AR2020_web_0v2mxh4f.pdf, fallback to Vision...




‚ö†Ô∏è GPT-4.1-mini failed on Unknown_adbi-managing-transition-low-carbon-economy_087is5zy.pdf, fallback to Vision...
‚ö†Ô∏è GPT-4.1-mini failed on Unknown_adp07-sus-fr_95qx6prh.pdf, fallback to Vision...


üìÑ Processing PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1277/1277 [38:47<00:00,  1.82s/it]



‚úÖ Extraction complete! Results saved to: results/extracted_report_years_mini.csv


In [3]:
import pandas as pd

# Load the human annotations and the vision-only extraction results
anno_df = pd.read_excel("check/rfyear_annotation.xlsx")
result_df = pd.read_csv("output/report_years_extracted_new.csv")

# Normalise the file-name columns (trim whitespace, lower-case) to use as join keys
anno_df["pdf_name_clean"] = anno_df["pdf_name"].str.strip().str.lower()
result_df["filename_clean"] = result_df["filename"].str.strip().str.lower()

# Left-join the results onto the annotations: every annotated row is kept,
# and rows without a matching result get NaN in the result columns
merged_df = pd.merge(
    anno_df,
    result_df,
    left_on="pdf_name_clean",
    right_on="filename_clean",
    how="left"
)

# Year comparison helper
def match_year(human, gpt):
    """Classify a (human label, model output) pair.

    Returns "missing" when either value is null according to ``pd.isna``;
    otherwise compares both values as stripped, lower-cased strings and
    returns "match" or "mismatch".
    """
    if any(pd.isna(value) for value in (human, gpt)):
        return "missing"
    left, right = (str(value).strip().lower() for value in (human, gpt))
    if left == right:
        return "match"
    return "mismatch"

# Add the per-row comparison result ("missing" / "match" / "mismatch")
merged_df["match_result"] = merged_df.apply(
    lambda row: match_year(row["chosen_rfyear"], row["report_year_vision"]),
    axis=1
)

# Keep only the columns needed for review
final_df = merged_df[[
    "pdf_name", "chosen_rfyear", "report_year_vision", "match_result"
]]

# Save the comparison table
final_df.to_csv("rfyear_comparison_results_minimal.csv", index=False)

# Print the summary counts per comparison result
print("‚úÖ Â∑≤‰øùÂ≠òÁÆÄÊ¥ÅÂØπÊØîÁªìÊûú‰∏∫ rfyear_comparison_results_minimal.csv")
print("üìä ÂåπÈÖçÊÉÖÂÜµÁªüËÆ°Ôºö")
print(final_df["match_result"].value_counts())

‚úÖ Â∑≤‰øùÂ≠òÁÆÄÊ¥ÅÂØπÊØîÁªìÊûú‰∏∫ rfyear_comparison_results_minimal.csv
üìä ÂåπÈÖçÊÉÖÂÜµÁªüËÆ°Ôºö
match_result
missing     386
mismatch    215
Name: count, dtype: int64


In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from pathlib import Path

# 1. Load the extraction results and the human annotations
gpt_df = pd.read_csv("results/extracted_report_years_mini.csv")  # extraction results
label_df = pd.read_excel("check/rfyear_annotation.xlsx")         # human annotation file

# 2. Rename columns so both tables share the join key. A new frame is
#    assigned instead of renaming in place, so re-running the cell is safe.
gpt_df = gpt_df.rename(columns={"filename": "pdf_name", "report_year": "report_year_pred"})

# 3. Inner join: keep only the PDFs present in both tables
merged = pd.merge(label_df, gpt_df, on="pdf_name", how="inner")


# 4. Normalise the text (strip, lower-case) while keeping missing values
#    missing. A plain astype(str) turns NaN into the string "nan", which made
#    rows with no label AND no prediction count as correct matches.
def _normalize_year(column):
    """Return ``column`` as stripped, lower-cased text with NaN preserved."""
    return column.astype(str).str.strip().str.lower().mask(column.isna())


merged["report_year_pred"] = _normalize_year(merged["report_year_pred"])
merged["chosen_rfyear"] = _normalize_year(merged["chosen_rfyear"])

# 5. Exact-match comparison; a missing value on either side never matches
merged["match"] = merged["report_year_pred"] == merged["chosen_rfyear"]

# 6. Summary metrics (guarded against an empty join)
total = len(merged)
correct = merged["match"].sum()
accuracy = correct / total if total else float("nan")

print(f"‚úÖ ÂêàÂπ∂Ê†∑Êú¨Êï∞Ôºö{total}")
print(f"‚úÖ ÂåπÈÖçÊ≠£Á°ÆÊï∞Ôºö{correct}")
print(f"‚úÖ ÂáÜÁ°ÆÁéáÔºàaccuracyÔºâÔºö{accuracy:.2%}")

# 7. Optional: save the comparison table (create the folder if needed)
Path("eval").mkdir(parents=True, exist_ok=True)
merged[["pdf_name", "chosen_rfyear", "report_year_pred", "match"]].to_csv("eval/year_comparison.csv", index=False)

‚úÖ ÂêàÂπ∂Ê†∑Êú¨Êï∞Ôºö224
‚úÖ ÂåπÈÖçÊ≠£Á°ÆÊï∞Ôºö78
‚úÖ ÂáÜÁ°ÆÁéáÔºàaccuracyÔºâÔºö34.82%
