In [2]:
import fitz  # PyMuPDF
import re
import os

def extract_paper_content(pdf_path):
    """Extract a citation line and the body text of a paper from a PDF.

    The text of every page is read with PyMuPDF and split into lines. The
    returned string has two parts joined by a newline:

    1. ``CITATION: <text>`` -- the first 20 extracted lines collapsed into a
       single whitespace-normalised string.
    2. The body as one space-joined paragraph. Blank lines are dropped, as are
       lines containing ``Fig.``, both ``[`` and ``]``, a web address or the
       word ``doi`` (any case). Collection stops at the first line that
       consists solely of a references heading, so the bibliography is not
       included.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text, or ``''`` when the file cannot be read or yields
        no text (a message is printed in both cases).
    """
    content = []

    # Attempt extraction using PyMuPDF
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                content.extend(page.get_text().split('\n'))
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ''

    # Check if extraction failed
    if not ''.join(content).strip():
        print(f"Error: PyMuPDF failed to extract content from {pdf_path}.")
        return ''

    # Citation string: the first 20 lines, whitespace-normalised. Every line is
    # kept regardless of whether it looks like a DOI or a year.
    citation_info = re.sub(r'\s+', ' ', ' '.join(content[:20])).strip()

    # Compiled once instead of on every line.
    domain_pattern = re.compile(r"\.com|\.org|\.net|\.edu|www\.|https://|http://")
    doi_pattern = re.compile(r"\bdoi\b", re.IGNORECASE)
    # Only a line that is nothing but the heading ends the body; a sentence
    # that merely mentions "reference" must not truncate the paper.
    references_heading = re.compile(
        r"(?:\d+\.?\s*)?references?(?:\s+and\s+notes)?\s*:?", re.IGNORECASE
    )

    body_lines = []
    for raw_line in content:
        line = raw_line.strip()
        if not line:
            continue

        # Everything from the references heading onwards is bibliography.
        if references_heading.fullmatch(line):
            break

        # Skip citations, legends, domain names, DOIs
        if (
            "Fig." in line
            or ("[" in line and "]" in line)
            or domain_pattern.search(line)
            or doi_pattern.search(line)
        ):
            continue

        body_lines.append(line)

    return '\n'.join([f"CITATION: {citation_info}", ' '.join(body_lines)])


def main(papers_path='Papers', output_path='Extracted_Texts'):
    """Convert every PDF in a folder to a text file via extract_paper_content.

    Files are processed in sorted name order, and the ``.pdf`` extension is
    matched case-insensitively so ``paper.PDF`` is not silently skipped. Each
    PDF that yields content is written as UTF-8 to ``<output_path>/<stem>.txt``;
    PDFs that yield nothing are reported with a warning and skipped.

    Args:
        papers_path: Folder where PDFs are located.
        output_path: Folder where .txt files will be saved; created if it
            doesn't exist.
    """
    os.makedirs(output_path, exist_ok=True)

    pdf_files = sorted(
        f for f in os.listdir(papers_path) if f.lower().endswith('.pdf')
    )

    for pdf_file in pdf_files:
        full_pdf_path = os.path.join(papers_path, pdf_file)
        content = extract_paper_content(full_pdf_path)

        if not content.strip():
            print(f"Warning: No content extracted from {pdf_file}.")
            continue

        # Construct output filename (same name as PDF, but .txt)
        txt_filename = os.path.splitext(pdf_file)[0] + '.txt'
        full_txt_path = os.path.join(output_path, txt_filename)

        with open(full_txt_path, "w", encoding="utf-8") as txt_file:
            txt_file.write(content)

        print(f"Saved: {txt_filename}")

# Run the batch extraction only when this code is executed directly (the
# module name is then "__main__"); importing it does not trigger a run.
if __name__ == "__main__":
    main()

Saved: A Novel B2 Precipitate Gives High Strength and High Impact Toughness to Bcc-Structured Cryogenic Steels.txt
Saved: Boron Enhanced Complex Concentrated Silicides – New pathway for designing and optimizing ultra-high temperature intermetallic composite materials.txt
Saved: Development of a high strength, low density and corrosion resistant novel FeCrMoNb1.5Ti0.5 complex concentrated alloy.txt
Saved: Ductilization of single-phase refractory high-entropy alloys via activation of edge dislocation.txt
Saved: Electronic descriptors for dislocation deformationbehavior and intrinsic ductility in bcchigh-entropy alloys.txt
Saved: Enhanced fracture toughness in NbxTiZrHf.txt
Saved: Fundamental Effects of Al and Ta on Microstructure and Phase Transformations in the Al–Cr–Mo–Ta–Ti Refractory Complex Concentrated Alloy System.txt
Saved: Influence of the oxidation on the optical properties of Mo-Si-Ti, Ta-Mo-Cr-Al and Ta-Mo-Cr-Ti-Al alloys.txt
Saved: Intrinsic factors responsible for brittle.t

In [3]:
import os
import re
import fitz  # PyMuPDF

# -------- Utility Functions --------
def is_formula(text):
    """Heuristically decide whether a text block contains a formula.

    A block counts as a formula when it contains one of ``= + * ^ < >``, one of
    the symbols Δ ∇ ∂ Ω π λ μ Σ ∫, or a ``-`` / ``/`` that is used like an
    operator. A ``-`` or ``/`` preceded by a letter and followed by a letter,
    whitespace or the end of the text is treated as ordinary prose
    ("high-entropy", "and/or", a hyphenated line break) and does not count.

    Args:
        text: The block text to inspect.

    Returns:
        True if the block looks like a formula, otherwise False.
    """
    pattern = (
        r"[=+*^<>]|Δ|∇|∂|Ω|π|λ|μ|Σ|∫"
        # '-' or '/' not preceded by a letter: "a - b", "-5", "1/2".
        r"|(?<![^\W\d_])[-/]"
        # '-' or '/' followed by a digit or symbol: "x/2", "T-1".
        r"|[-/](?![^\W\d_]|\s|\Z)"
    )
    return bool(re.search(pattern, text))

def is_figure_or_table_caption(text):
    """Tell whether a text block starts like a figure or table caption.

    After stripping surrounding whitespace, the block must begin with ``Fig``,
    ``Fig.``, ``Figure`` or ``Table`` (in any letter case), optionally followed
    by whitespace, and then a number -- e.g. ``Fig. 1``, ``Figure 2``,
    ``FIG. 3`` or ``Table 4``.

    Args:
        text: The block text to inspect.

    Returns:
        True if the block starts with such a caption label, otherwise False.
    """
    caption_label = r"(?:Fig(?:ure)?\.?|Table)\s*\d+"
    return bool(re.match(caption_label, text.strip(), re.IGNORECASE))

def is_repeated_footer_or_header(text):
    """Report whether a block matches one of the known boilerplate patterns.

    The block is searched (anywhere in the text, case-sensitively) for a
    journal banner, a URL, a copyright / licence notice, or a
    reprint / supplementary-information notice.

    Args:
        text: The block text to inspect.

    Returns:
        True as soon as any pattern is found, otherwise False.
    """
    boilerplate_patterns = (
        r"npj Materials Degradation",
        r"Published in partnership with",
        r"https?://",
        r"www\.",
        r"© The Author\(s\)",
        r"Open Access This article is licensed",
        r"Reprints and permission information",
        r"Supplementary information.*available at",
    )
    for boilerplate in boilerplate_patterns:
        if re.search(boilerplate, text) is not None:
            return True
    return False

# Upper-case section titles recognised at the start of a text block. Matching
# is by prefix and the first matching entry wins, so the order is significant.
section_headers = [
    "ABSTRACT",
    "INTRODUCTION",
    "RESULTS AND DISCUSSION",
    "METHODS",
    "CONCLUSION",
    "ACKNOWLEDGEMENTS",
    "REFERENCES",
    "DATA AVAILABILITY",
    "CODE AVAILABILITY",
    "AUTHOR CONTRIBUTIONS",
    "COMPETING INTERESTS",
    "ADDITIONAL INFORMATION",
]

# -------- Input/Output Folder Paths --------
input_folder = "Papers"          # <-- Replace with your PDF folder path
output_folder = "Text_Mechanics"         # <-- Replace with output folder path

# Blocks whose left edge (x0) is below this value go to the left column, all
# others to the right column.
# NOTE(review): 300 presumably approximates the horizontal midpoint of an
# A4/Letter page in PyMuPDF page units -- confirm for other page sizes.
COLUMN_SPLIT_X = 300

os.makedirs(output_folder, exist_ok=True)

# -------- Process Each PDF File --------
# Sorted so the processing order does not depend on the directory listing.
for filename in sorted(os.listdir(input_folder)):
    if not filename.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(input_folder, filename)

    left_by_page = []
    right_by_page = []
    figures_tables = []

    # The context manager closes the document once its pages are read; an
    # unreadable PDF is reported and skipped instead of aborting the batch.
    try:
        with fitz.open(pdf_path) as doc:
            for page in doc:
                blocks = page.get_text("blocks")
                left_col = []
                right_col = []

                for block in blocks:
                    x0, y0, _x1, _y1, text, *extra = block
                    # Block tuples end with (block_no, block_type); a non-zero
                    # block_type marks a non-text (image) block whose "text"
                    # is only a placeholder description.
                    if len(extra) >= 2 and extra[1] != 0:
                        continue
                    text = text.strip()
                    if not text or is_repeated_footer_or_header(text):
                        continue
                    if is_figure_or_table_caption(text):
                        figures_tables.append(text)
                        continue
                    if x0 < COLUMN_SPLIT_X:
                        left_col.append((y0, text))
                    else:
                        right_col.append((y0, text))

                # Within each column, read top to bottom.
                left_by_page.append(sorted(left_col, key=lambda x: x[0]))
                right_by_page.append(sorted(right_col, key=lambda x: x[0]))
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        continue

    # Order: left_0, right_0, left_1, right_1, ...
    final_blocks = []
    for lpage, rpage in zip(left_by_page, right_by_page):
        final_blocks.extend(lpage)
        final_blocks.extend(rpage)

    # Format text
    ordered_text = []
    for _, text in final_blocks:
        upper_text = text.upper()
        header = next((h for h in section_headers if upper_text.startswith(h)), None)

        # Truncate at references. This is decided here, before any formula
        # fencing, because a fenced block no longer starts with the header
        # markup and would not be recognised afterwards.
        if header == "REFERENCES":
            break

        body = text if header is None else text[len(header):].strip()
        # Fence only the body so a section header never ends up inside a
        # code fence.
        if is_formula(body):
            body = f"\n```\n{body}\n```\n"
        ordered_text.append(body if header is None else f"\n### {header}\n{body}")

    # Append figures and tables
    if figures_tables:
        ordered_text.append("\n### FIGURES AND TABLES\n" + "\n\n".join(figures_tables))

    # Save to output folder
    base_name = os.path.splitext(filename)[0]
    output_path = os.path.join(output_folder, f"{base_name}.txt")
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(ordered_text))