# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# Text extraction: final version with Doc ID

import pymupdf
import re
import csv
from pathlib import Path
from multi_column import column_boxes


def clean_text(text: str) -> str:
    """Normalize a raw text fragment into a single clean line.

    Three passes run in order:

    1. A word split by a hyphen at a line break is rejoined
       (``Word-<newline>part`` becomes ``Wordpart``).
    2. A single capital letter at the very start of the text that is
       separated by a line break from a following lowercase letter is
       reattached (``A<newline>dvances`` becomes ``Advances``).
    3. Every run of whitespace is collapsed to one space and the result
       is stripped at both ends.

    Empty (or otherwise falsy) input yields an empty string.
    """
    if text:
        # Pass 1: undo hyphenation across a line break.
        dehyphenated = re.sub(r"(\w+)-\s*\n\s*(\w+)", r"\1\2", text)

        # Pass 2: glue a drop cap back onto its word; the pattern is
        # anchored at the start of the text, so only the first letter counts.
        recapped = re.sub(r"^\s*([A-Z])\s*\n\s*([a-z])", r"\1\2", dehyphenated)

        # Pass 3: squeeze whitespace runs (including newlines) and trim.
        squeezed = re.sub(r"\s+", " ", recapped)
        return squeezed.strip()

    return ""

def _parse_filename_metadata(stem: str) -> tuple:
    """Split a ``Country_Year_Name`` file stem into ``(country, year, doc_name)``.

    The year is the second underscore-separated token when it consists of
    digits only; otherwise it is reported as an empty string. The document
    name is every token after the year, or every token after the country
    when no year is present, so a non-year second token is kept rather
    than dropped.
    """
    parts = stem.split("_")
    country = parts[0]

    has_year = len(parts) > 1 and parts[1].isdigit()
    year = parts[1] if has_year else ""

    # Start the name right after the last recognized metadata token.
    name_start = 2 if has_year else 1
    doc_name = "_".join(parts[name_start:])

    return country, year, doc_name


def process_folder(folder_path: str, output_csv: str):
    """Extract cleaned text blocks from every PDF in a folder into one CSV.

    PDFs matching ``*.pdf`` directly inside ``folder_path`` are processed in
    sorted path order, so the zero-based ``Doc_ID`` assigned to each file is
    reproducible between runs. Country, year and document name are derived
    from the file name. Each page is split into boxes by ``column_boxes`` and
    the cleaned text of every non-empty box becomes one CSV row with the
    columns ``Doc_ID, Country, Year, Document_Name, Page, Block_ID, text``
    (page and block numbers start at 1).

    Args:
        folder_path: Directory containing the PDF files.
        output_csv: Path of the CSV file to create (overwritten if present).

    Returns:
        None. If ``folder_path`` is not a directory nothing is written. If
        the directory contains no PDFs, the CSV holds only the header row.

    A failure while processing one PDF is reported on stdout and the run
    continues with the next file; rows already written for the failing
    document remain in the CSV.
    """
    pdf_dir = Path(folder_path)

    # is_dir() also rejects a path that exists but is a regular file.
    if not pdf_dir.is_dir():
        print(f"Directory not found: {folder_path}")
        return

    print(f"Scanning folder: {folder_path}...")

    # One output file for the whole run; newline='' lets csv control line endings.
    with open(output_csv, mode='w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Doc_ID", "Country", "Year", "Document_Name", "Page", "Block_ID", "text"])

        # glob() yields entries in arbitrary filesystem order; sorting keeps
        # the Doc_ID -> file mapping stable.
        pdf_files = sorted(pdf_dir.glob("*.pdf"))

        if not pdf_files:
            print("No PDF files found in the directory.")
            return

        for doc_id, pdf_path in enumerate(pdf_files):
            country, year, doc_name = _parse_filename_metadata(pdf_path.stem)

            print(f"Processing ID {doc_id}: {pdf_path.name} | Country: {country}, Year: {year}")

            try:
                # The context manager closes the document even when a page
                # fails part-way through.
                with pymupdf.open(pdf_path) as doc:
                    for page_num, page in enumerate(doc, start=1):
                        # Project helper that returns the boxes to read from
                        # this page (presumably text columns in reading
                        # order; verify against multi_column).
                        bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

                        for block_id, rect in enumerate(bboxes, start=1):
                            raw_text = page.get_text(clip=rect, sort=True)
                            final_text = clean_text(raw_text)

                            # Boxes that are empty after cleaning are skipped,
                            # so Block_ID values may have gaps.
                            if final_text:
                                writer.writerow([doc_id, country, year, doc_name, page_num, block_id, final_text])

            except Exception as e:
                # Deliberate best-effort: one unreadable PDF must not abort the batch.
                print(f"Failed to process {pdf_path.name}: {e}")

    print(f"Extraction complete. All data saved in '{output_csv}'.")


# --- Script entry point ---
if __name__ == "__main__":
    # Edit the two values below to point at the PDF folder to read and the
    # CSV file to produce.
    process_folder(
        folder_path="countries_edited",
        output_csv="text_countries_edited.csv",
    )