In [5]:
import re
import fitz

In [84]:
# Destination file for the cleaned plain-text output.
output_path = "clean_text_output.txt"
# PDF to clean. Alternative input used previously: "1709.00284v2.pdf"
input_path = "A subgraph matching algorithm based on subgraph index for knowledge graph.pdf"

In [85]:
import re
import fitz  # PyMuPDF

# Section titles that are dropped when they make up a whole line.
_COMMON_HEADINGS = frozenset({
    "abstract", "introduction", "background", "related work",
    "methods", "materials", "methodology", "experiments",
    "results", "discussion", "conclusion", "conclusions",
    "acknowledgments", "acknowledgements",
})
# Leading section numbering such as "1 ", "1. " or "2.1 ".
_SECTION_NUMBER_RE = re.compile(r'^\d+(\.\d+)*\.?\s+')
# "Abstract" as a whole word (so "Abstraction" does not match), together with
# any separator characters that follow the label.
_ABSTRACT_LABEL_RE = re.compile(r'abstract\b[\s:.\-–—]*', re.IGNORECASE)
# Lines that start a figure/table caption.
_CAPTION_RE = re.compile(r'^(figure|fig|table)\b', re.IGNORECASE)
# Characters whose presence marks a sentence as a formula rather than prose.
_MATH_SYMBOL_RE = re.compile(r'[=<>±×÷∑√≤≥φρΨσ𝐿∈→≠~⟨⟩]')


def _read_page_lines(pdf_path):
    """Return one list of raw text lines per page of the PDF at ``pdf_path``."""
    pdf = fitz.open(pdf_path)
    try:
        return [
            (pdf[page_index].get_text("text") or "").splitlines()
            for page_index in range(len(pdf))
        ]
    finally:
        # Release the document even if text extraction fails part-way.
        pdf.close()


def _repeated_edge_lines(all_pages_lines):
    """Return the first/last page lines that repeat often enough to be headers or footers."""
    first_counts, last_counts = {}, {}
    for lines in all_pages_lines:
        if not lines:
            continue
        for counts, edge in ((first_counts, lines[0].strip()),
                             (last_counts, lines[-1].strip())):
            if edge:
                counts[edge] = counts.get(edge, 0) + 1

    # A line must appear on at least half of the pages AND at least twice;
    # without the floor of 2, every first/last line of a 1-2 page document
    # would be classified as a running header/footer and deleted.
    min_repeats = max(2, 0.5 * len(all_pages_lines))
    return {
        line
        for counts in (first_counts, last_counts)
        for line, count in counts.items()
        if count >= min_repeats
    }


def _clean_lines(all_pages_lines, boilerplate):
    """Filter page lines down to body text, stopping at the references section."""
    cleaned = []
    for lines in all_pages_lines:
        for line in lines:
            text = line.strip()
            low = text.lower()

            # Everything from the references/bibliography onwards is discarded.
            if low.startswith("references") or low.startswith("bibliography"):
                return cleaned
            # Running headers/footers and bare page numbers (e.g. "12").
            if text in boilerplate or re.fullmatch(r'\d+', text):
                continue
            # Only the "Keywords: ..." line itself is dropped.
            if low.startswith("keywords"):
                continue

            label = _ABSTRACT_LABEL_RE.match(text)
            if label:
                # Whatever preceded the abstract label is discarded.
                cleaned = []
                text = text[label.end():]
                if not text:
                    continue  # the line held nothing but the label

            # Bare or numbered section headings ("Introduction", "1. Introduction").
            unnumbered = _SECTION_NUMBER_RE.sub('', text)
            if (text.lower().rstrip(":") in _COMMON_HEADINGS
                    or unnumbered.lower() in _COMMON_HEADINGS):
                continue
            # Assumes ordinary sentences rarely start with "Figure"/"Fig"/"Table".
            if _CAPTION_RE.match(text):
                continue

            text = re.sub(r'\[[0-9,\s]+\]', '', text)                  # [1], [2, 5]
            text = re.sub(r'\([A-Za-z]+ et al\.?, \d{4}\)', '', text)  # (Name et al., 2020)
            text = re.sub(r'\([0-9]{4}\)', '', text)                   # (2020)
            text = re.sub(r'\s{2,}', ' ', text)

            # Empty strings are kept: they mark paragraph breaks.
            cleaned.append(text)
    return cleaned


def _merge_hyphenated(lines):
    """Re-join words that were hyphen-split across consecutive lines."""
    merged = []
    for line in lines:
        if merged and merged[-1].endswith('-') and line and line[0].islower():
            merged[-1] = merged[-1][:-1] + line.lstrip()
        else:
            merged.append(line)
    return merged


def _join_paragraphs(lines):
    """Join lines with single spaces, emitting one newline per run of blank lines."""
    paragraphs = []
    current = []
    for line in lines:
        if line == "":
            if current:
                paragraphs.append(" ".join(current))
                current = []
        else:
            current.append(line.strip())
    if current:
        paragraphs.append(" ".join(current))
    return "\n".join(paragraphs)


def _is_prose(sentence):
    """Return True when ``sentence`` looks like text rather than a formula or fragment."""
    letters = sum(ch.isalpha() for ch in sentence)
    if letters == 0:
        return False  # empty, an equation number, or pure symbols
    if len(sentence) < 5 and sentence[-1] not in ".?!":
        return False  # short unterminated fragment
    digits = sum(ch.isdigit() for ch in sentence)
    others = sum((not ch.isalnum() and not ch.isspace()) for ch in sentence)
    if letters / (letters + digits + others) < 0.5:
        return False  # dominated by digits/symbols
    return not _MATH_SYMBOL_RE.search(sentence)


def extract_clean_text(pdf_path, output_txt_path):
    """Extract the body text of a PDF and write it, one sentence per line, to a file.

    The PDF is read with PyMuPDF (``fitz``). Repeated headers/footers, page
    numbers, the keywords line, section headings, figure/table captions,
    citation markers, formula-like sentences, and everything from the
    references section onwards are removed. Text before the "Abstract" label
    is discarded.

    Args:
        pdf_path: Path of the PDF to read.
        output_txt_path: Path of the UTF-8 text file to write (overwritten).

    Returns:
        None. The result is written to ``output_txt_path``.
    """
    all_pages_lines = _read_page_lines(pdf_path)
    boilerplate = _repeated_edge_lines(all_pages_lines)
    body_lines = _merge_hyphenated(_clean_lines(all_pages_lines, boilerplate))
    output_text = _join_paragraphs(body_lines)

    # Drop abbreviations whose periods would be mistaken for sentence ends.
    # "etc" is matched as a whole word so words such as "sketch" or "fetch"
    # are left intact.
    output_text = output_text.replace("i.e.", "").replace("e.g.", "")
    output_text = re.sub(r'\betc\b', '', output_text)
    output_text = output_text.replace("Eq.", "").replace("Refs.", "")

    sentences = (part.strip() for part in output_text.replace(". ", ".\n").split("\n"))
    output_text = "\n".join(sentence for sentence in sentences if _is_prose(sentence))

    # Strip remaining bracketed groups first, so that the space they leave
    # behind before punctuation is removed by the following substitution.
    output_text = re.sub(r'\s*\[[^\]]*\]\s*', ' ', output_text)
    output_text = re.sub(r"\s+([.,;:!?])", r"\1", output_text)
    output_text = re.sub(r" {2,}", " ", output_text)

    with open(output_txt_path, 'w', encoding='utf-8') as out_f:
        out_f.write(output_text)

# Clean the configured input PDF and write the result to output_path
extract_clean_text(input_path, output_path)

In [67]:
import re

# Scratch check of the bracket-stripping regex on a small sample string.
text = "This is an example [abc123]! And another [xyz@#]."

# Each [...] group, together with the whitespace around it, becomes one space;
# whitespace runs are then squeezed and the ends trimmed.
bracket_group = re.compile(r'\s*\[[^\]]*\]\s*')
whitespace_run = re.compile(r'\s{2,}')
cleaned = whitespace_run.sub(' ', bracket_group.sub(' ', text)).strip()

print(cleaned)

This is an example ! And another .


In [None]:
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered

import gc

# Absolute path of the PDF handed to the converter.
pdf_path = "/content/1709.00284v2.pdf"

# Converter built from the dictionary returned by create_model_dict().
converter = PdfConverter(artifact_dict=create_model_dict())

# Run a garbage-collection pass before starting the conversion.
gc.collect()

# Convert the PDF, keep only the first element of the returned triple, and print it.
rendered = converter(pdf_path)
text, _, _ = text_from_rendered(rendered)
print(text)