In [8]:
import pdfplumber
import pandas as pd
from collections import Counter
import os
import re
import pandas as pd

In [15]:
# footnotes_assassin_v2.4.0 #

def extract_text_without_footnotes(
    pdf_path
    , threshold_ratio=0.90 # Updated to 0.90 to capture most footnotes;
    , header_threshold=50  # e.g., 50 means: remove anything within top 50 units of page
):
    """
    Extract the text of a PDF while dropping footnotes and (optionally) headers.

    Each page is processed independently:
    - Words whose `top` coordinate is not strictly greater than
      `header_threshold` are discarded (header removal), unless
      `header_threshold` is None.
    - Among the remaining words, the most frequent word height is taken as the
      body-text height. Words shorter than `threshold_ratio` times that height
      are discarded (footnote removal).
    - Surviving words are ordered by (`top`, `x0`) and joined with single spaces.

    Pages with no words, or with no words left after header removal, contribute
    nothing to the result.

    Args:
        pdf_path (str): Path to the PDF file.
        threshold_ratio (float): Words with a height smaller than this ratio of
                                 the page's most common height are treated as
                                 footnotes and removed.
        header_threshold (float or None): Distance from the top of the page, in
                                          the units of pdfplumber's `top`
                                          coordinate. If None, headers are not
                                          filtered.

    Returns:
        str: The text of the kept pages, joined by a blank line ("\\n\\n").
    """
    # Heights are compared after rounding to this many decimals so that
    # floating-point jitter (10.5 vs 10.500000001) cannot split the count of
    # body-text words and let a smaller, exactly repeated height win the vote.
    height_decimals = 2

    all_pages_text = []

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            words = page.extract_words()
            if not words:
                continue

            df = pd.DataFrame(words)

            # Remove headers if header_threshold is set
            if header_threshold is not None:
                df = df[df['top'] > header_threshold]

            # Nothing left on this page once the header band is removed
            if df.empty:
                continue

            # Most common (rounded) height among the remaining words; ties are
            # resolved in favour of the height encountered first.
            rounded_heights = df['height'].round(height_decimals)
            most_common_height = Counter(rounded_heights).most_common(1)[0][0]

            # Define height threshold
            height_threshold = most_common_height * threshold_ratio

            # Filter out footnotes by height
            filtered_words = df[rounded_heights >= height_threshold]

            # Reconstruct text in reading order.
            # NOTE(review): the sort is on exact `top` values, so words of one
            # visual line whose tops differ slightly are not kept together.
            filtered_words = filtered_words.sort_values(by=['top', 'x0'])
            page_text = " ".join(filtered_words['text'].tolist())

            all_pages_text.append(page_text)

    return "\n\n".join(all_pages_text)

In [None]:
# ---------------- Example usage ---------------- #

# Source document and the folder that will receive the exported text
pdf_path = "./input/611s6_George, Covert Action.pdf"
output_dir = './output/'

# Make sure the export folder exists (no error if it is already there)
os.makedirs(output_dir, exist_ok=True)

# header_threshold is tuned per PDF (here: 75 units from the top of the page);
# pass None to skip header removal.
cleaned_text = extract_text_without_footnotes(
    pdf_path,
    threshold_ratio=0.9,
    header_threshold=75,
)

cleaned_text

In [14]:
import pdfplumber
import pandas as pd

# Load the word boxes of the first page into a DataFrame. The file is only
# needed while extracting, so the preview is built after the `with` block
# (the original evaluated the same preview inside the block and discarded it).
with pdfplumber.open(pdf_path) as pdf:
    page = pdf.pages[0]
    df = pd.DataFrame(page.extract_words())

# Ten words with the smallest `top` coordinate, shown as the cell output
df.sort_values('top').head(10)

Unnamed: 0,text,x0,x1,top,doctop,bottom,upright,height,width,direction
0,9,90.0,104.016,102.272,102.272,126.272,True,24.0,14.016,ltr
1,Covert,90.0,168.06,149.16,149.16,169.16,True,20.0,78.06,ltr
2,Action,177.56,248.16,149.16,149.16,169.16,True,20.0,70.6,ltr
3,as,258.16,283.18,149.16,149.16,169.16,True,20.0,25.02,ltr
4,Policy,90.0,160.6,179.16,179.16,199.16,True,20.0,70.6,ltr
5,Support,168.6,259.42,179.16,179.16,199.16,True,20.0,90.82,ltr
15,it,419.7165,426.0165,276.011,276.011,286.511,True,10.5,6.3,ltr
14,distasteful;,368.7075,416.8815,276.011,276.011,286.511,True,10.5,48.174,ltr
13,is,358.512,365.8725,276.011,276.011,286.511,True,10.5,7.3605,ltr
12,politics,322.938,355.677,276.011,276.011,286.511,True,10.5,32.739,ltr


In [20]:
# For cleanup of stubborn text artifacts (e.g., watermarks, chapter headers, etc.)

def clean_text_artifacts(text, patterns_to_remove=None):
    """
    Post-processes extracted text to remove unwanted recurring artifacts.

    Every match of every pattern is deleted (replaced by the empty string),
    then runs of three or more newlines are collapsed to exactly two.

    Args:
        text (str): The text to clean
        patterns_to_remove (list of str or compiled regex, or None):
            Patterns to remove. Strings are interpreted as regular expressions
            and matched case-insensitively. Compiled patterns are applied with
            the flags they were compiled with. A single str or compiled
            pattern is accepted and treated as a one-element list.

    Returns:
        str: Cleaned text
    """
    if patterns_to_remove is None:
        patterns_to_remove = []
    elif isinstance(patterns_to_remove, (str, re.Pattern)):
        # A bare string would otherwise be iterated character by character,
        # each character being used as its own regex.
        patterns_to_remove = [patterns_to_remove]

    cleaned_text = text
    for pattern in patterns_to_remove:
        if isinstance(pattern, re.Pattern):
            # re.sub() raises ValueError when `flags` is combined with an
            # already-compiled pattern, so use the pattern's own method.
            cleaned_text = pattern.sub('', cleaned_text)
        else:
            # Use regex sub with re.IGNORECASE for flexibility
            cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.IGNORECASE)

    # Collapse multiple newlines to tidy formatting
    cleaned_text = re.sub(r'\n{3,}', '\n\n', cleaned_text)

    return cleaned_text

In [29]:
# Define recurring junk patterns you want to remove
junk_patterns = [
    "iryp" # This tends to appear at bottom and/or top of academic papers... why?
#     r'iryp\s*',           # removes 'iryp' followed by optional whitespace/newlines
#     r'Page\s+\d+',        # example: removes "Page 34"
#     r'\[\d+\]',           # example: removes [1], [2] etc.
]

# Apply cleanup to the text extracted from the main script
text_without_junk = clean_text_artifacts(cleaned_text, patterns_to_remove=junk_patterns)

# The double skipline (page separator) was recurrent in the sample I used, but can adjust.
# It is replaced by a single space instead of being deleted: deleting it glued
# the last word of one page to the first word of the next.
final_text = re.sub(r"\n\n", " ", text_without_junk)

# Build the export path in this cell: on a fresh kernel no earlier cell
# defines `output_path`, so writing the file would fail with a NameError.
base_name = os.path.splitext(os.path.basename(pdf_path))[0]
output_path = os.path.join(output_dir, f"{base_name}_CLEANED_TEXT_v002.txt")

# (Optional) overwrite the file or save to a new one
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(final_text)

print("Post-cleanup complete")

Post-cleanup complete


In [None]:
# Display final_text (the result of the junk-pattern cleanup) as the cell output
final_text

In [None]:
# Name the export after the source PDF (directory and extension stripped)
pdf_file_name = os.path.basename(pdf_path)
base_name = os.path.splitext(pdf_file_name)[0]
output_filename = f"{base_name}_CLEANED_TEXT_v002.txt"
output_path = os.path.join(output_dir, output_filename)

# Dump the whole cleaned document into that one UTF-8 text file
with open(output_path, 'w', encoding='utf-8') as export_file:
    export_file.write(final_text)

print(f"Cleaned text exported to: {output_path}")