# In [3]:
import os
import re
import fitz  # PyMuPDF

# -------- Utility Functions --------
def is_formula(text):
    """Return True if *text* contains any character treated as formula-like.

    The check is a plain substring search for one of the ASCII operators
    ``= + - * / ^ < >`` or one of the listed Greek/calculus symbols.
    Because ``-`` and ``/`` are included, ordinary prose such as
    "well-known" also counts as a formula.
    """
    formula_pattern = r"[=+\-*/^<>]|Δ|∇|∂|Ω|π|λ|μ|Σ|∫"
    found = re.search(formula_pattern, text)
    return found is not None

def is_figure_or_table_caption(text):
    """Return True if *text* begins like a figure or table caption.

    After surrounding whitespace is removed, the text must start with
    ``Fig`` (optionally followed by a dot) or ``Table``, then at most one
    whitespace character, then one or more digits. Matching is
    case-sensitive, and the spelled-out word "Figure" is not accepted.
    """
    stripped = text.strip()
    caption_match = re.match(r"^(Fig\.?|Table)\s?\d+", stripped)
    return caption_match is not None

def is_repeated_footer_or_header(text):
    """Return True if *text* contains any known boilerplate fragment.

    Each pattern is searched anywhere in the text (case-sensitive), so a
    block is flagged if it merely contains, for example, a URL or the
    journal name, not only when it consists of it.
    """
    boilerplate_patterns = (
        r"npj Materials Degradation",
        r"Published in partnership with",
        r"https?://",
        r"www\.",
        r"© The Author\(s\)",
        r"Open Access This article is licensed",
        r"Reprints and permission information",
        r"Supplementary information.*available at",
    )
    for pattern in boilerplate_patterns:
        if re.search(pattern, text):
            return True
    return False

# Upper-case section titles recognised at the start of a text block.
# The list is scanned in order, so the first matching title is the one used.
section_headers = [
    "ABSTRACT",
    "INTRODUCTION",
    "RESULTS AND DISCUSSION",
    "METHODS",
    "CONCLUSION",
    "ACKNOWLEDGEMENTS",
    "REFERENCES",
    "DATA AVAILABILITY",
    "CODE AVAILABILITY",
    "AUTHOR CONTRIBUTIONS",
    "COMPETING INTERESTS",
    "ADDITIONAL INFORMATION",
]

# -------- Input/Output Folder Paths --------
input_folder = "Papers"          # <-- Replace with your PDF folder path
output_folder = "Text_Mechanics"         # <-- Replace with output folder path

# Create the output folder up front; exist_ok makes re-runs harmless.
os.makedirs(output_folder, exist_ok=True)

# -------- Process Each PDF File --------
# For every *.pdf in input_folder:
#   1. read the text blocks of each page and split them into a left and a
#      right column,
#   2. emit the blocks page by page (left column first, then right),
#   3. mark section headers and fence formula-like blocks,
#   4. stop at the REFERENCES section,
#   5. append the collected figure/table captions and write <name>.txt.
for filename in os.listdir(input_folder):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(input_folder, filename)

    left_by_page = []
    right_by_page = []
    figures_tables = []

    # Open the document in a context manager so its file handle is released
    # after extraction, even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            left_col = []
            right_col = []

            # Each "blocks" entry is
            # (x0, y0, x1, y1, text, block_no, block_type).
            for block in page.get_text("blocks"):
                x0, y0, _x1, _y1, text, _block_no, block_type = block[:7]
                # block_type 1 is an image block; its "text" is only an
                # image placeholder and must not end up in the output.
                if block_type != 0:
                    continue
                text = text.strip()
                if not text or is_repeated_footer_or_header(text):
                    continue
                if is_figure_or_table_caption(text):
                    figures_tables.append(text)
                    continue
                # NOTE(review): 300 pt is a fixed column boundary; it
                # presumably assumes pages roughly 600 pt wide (A4/Letter).
                if x0 < 300:
                    left_col.append((y0, text))
                else:
                    right_col.append((y0, text))

            # Within a column, read top to bottom.
            left_by_page.append(sorted(left_col, key=lambda x: x[0]))
            right_by_page.append(sorted(right_col, key=lambda x: x[0]))

    # Order: left_0, right_0, left_1, right_1, ...
    final_blocks = []
    for lpage, rpage in zip(left_by_page, right_by_page):
        final_blocks.extend(lpage)
        final_blocks.extend(rpage)

    # Format text
    ordered_text = []
    for _, text in final_blocks:
        upper_text = text.upper()
        # First header (in list order) that the block starts with, if any.
        header = next(
            (h for h in section_headers if upper_text.startswith(h)), None
        )
        # Truncate at references. This is decided here, before formula
        # fencing, because a fenced block no longer starts with the header
        # marker and the cutoff would otherwise be missed.
        if header == "REFERENCES":
            break
        if header is not None:
            text = f"\n### {header}\n{text[len(header):].strip()}"
        if is_formula(text):
            text = f"\n```\n{text}\n```\n"
        ordered_text.append(text)

    # Append figures and tables
    if figures_tables:
        ordered_text.append("\n### FIGURES AND TABLES\n" + "\n\n".join(figures_tables))

    # Save to output folder
    base_name = os.path.splitext(filename)[0]
    output_path = os.path.join(output_folder, f"{base_name}.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(ordered_text))