In [1]:
import pdfplumber
import csv
import re

def clean_paragraph(text):
    """
    Strip disallowed symbols from a paragraph and collapse its whitespace.

    Everything except word characters, whitespace and the punctuation
    . ! ? ; : ' - is deleted; the remaining words are then re-joined with
    single spaces, which also trims the ends.

    :param text: Paragraph text to clean.
    :return: The cleaned, single-spaced paragraph.
    """
    # Delete every character outside the allowed set.
    kept = re.sub(r'[^\w\s.!?;:\'\-]', '', text)
    # split() with no argument drops runs of any whitespace, including newlines.
    tokens = kept.split()
    return ' '.join(tokens)

def remove_header_footer(text):
    """
    Drop lines that look like page furniture rather than body text.

    Each line is stripped and then tested against a list of patterns
    (page numbers, chapter titles, decorative rules, numbered section
    headings, lines starting with "0 ", "learning to ..." subheaders, and
    Table/Figure/Theorem captions). Lines matching any pattern are
    discarded; every other line, including empty ones, is kept in order.

    :param text: Raw text of one page.
    :return: The surviving stripped lines joined with newlines.
    """
    # (pattern, flags) pairs; a line matching any one of them is discarded.
    noise_rules = (
        # Page numbers (e.g., "Page 1", "1", "1/10")
        (r'^(Page\s*\d+|\d+(/\d+)?\s*)$', re.IGNORECASE),
        # Chapter titles
        (r'^(Chapter\s+\d+|CHAPTER\s+\w+)', re.IGNORECASE),
        # Decorative rules (e.g., "-----", "====")
        (r'^[-=_]{3,}$', 0),
        # Section numbering like "13.8 Topic Name"
        (r'^\d+\.\d+\s+.+$', 0),
        # Subsection numbering like "1.2.2 Subsection Name" or deeper
        (r'^\d+(\.\d+)+\s+.+$', 0),
        # Lines starting with "0 "
        (r'^0\s+', 0),
        # Repeated "learning to ..." subheaders
        (r'^(learning to\s+.+)', re.IGNORECASE),
        # Table / Figure / Theorem captions such as "Figure 1.2"
        (r'^(Table|Figure|Theorem)\s+\d+\.\d+', re.IGNORECASE),
    )

    def is_noise(candidate):
        return any(re.match(rule, candidate, flags) for rule, flags in noise_rules)

    stripped_lines = (raw.strip() for raw in text.splitlines())
    return "\n".join(entry for entry in stripped_lines if not is_noise(entry))

def extract_paragraphs_with_continuation(pdf_path, output_csv_path):
    """
    Pull paragraphs out of a PDF and write them to a one-column CSV.

    Every page's text is passed through remove_header_footer, then its
    non-empty lines are accumulated into the paragraph in progress. A line
    ending in '.', '!' or '?' closes the paragraph. The accumulator is not
    reset between pages, so a paragraph can continue across a page break.
    Any exception is caught and reported with print(); nothing is raised.

    :param pdf_path: Path to the PDF file.
    :param output_csv_path: Path to the CSV file to save the extracted paragraphs.
    """
    sentence_enders = ('.', '!', '?')
    collected = []
    pending = []  # line fragments of the paragraph currently being built

    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue  # nothing extractable on this page

                for raw in remove_header_footer(page_text).splitlines():
                    fragment = raw.strip()
                    if not fragment:
                        continue

                    pending.append(fragment)

                    # A sentence-ending line closes the paragraph.
                    if fragment.endswith(sentence_enders):
                        collected.append(clean_paragraph(" ".join(pending)))
                        pending = []

            # Flush whatever is left after the last page.
            if pending:
                collected.append(clean_paragraph(" ".join(pending)))

        # Header row first, then one paragraph per row.
        with open(output_csv_path, mode='w', encoding='utf-8', newline='') as csv_file:
            out = csv.writer(csv_file)
            out.writerow(["Paragraph"])
            out.writerows([entry] for entry in collected)

        print(f"Paragraphs extracted and saved to {output_csv_path}")

    except Exception as e:
        print(f"Error: {e}")

# Usage: source PDF in, one extracted paragraph per CSV row out.
# NOTE: both locations are machine-specific absolute paths.
pdf_file, output_csv = (
    r"C:\Users\eanam\OneDrive\Desktop\AI3\AI_Russell_Norvig_removed.pdf",
    r"C:\Users\eanam\OneDrive\Desktop\AI3\extracted_paragraphs.csv",
)

extract_paragraphs_with_continuation(pdf_path=pdf_file, output_csv_path=output_csv)


Paragraphs extracted and saved to C:\Users\eanam\OneDrive\Desktop\AI3\extracted_paragraphs.csv


In [2]:
import pdfplumber
import csv
import re

def clean_paragraph(text):
    """
    Remove unwanted characters from a paragraph and normalize its spacing.

    Characters other than word characters, whitespace and . ! ? ; : ' -
    are dropped. Runs of whitespace are then reduced to single spaces and
    leading/trailing whitespace is removed.

    :param text: Paragraph text to clean.
    :return: The cleaned paragraph.
    """
    disallowed = re.compile(r'[^\w\s.!?;:\'\-]')
    words = disallowed.sub('', text).split()
    return ' '.join(words)

def remove_header_footer(text):
    """
    Remove header/footer-like lines from one page of extracted text.

    Each line is stripped and dropped when it matches any of the skip
    patterns below: page numbers, numbered headings, running "Page N ..."
    footers, chapter titles, decorative rules, dotted section numbers,
    Table/Figure/Theorem captions and lines ending in a four-digit number.
    All other lines (including empty ones) are kept in their original order.

    :param text: Raw text of a single page.
    :return: The remaining stripped lines joined with newlines.
    """
    ci = re.IGNORECASE
    skip_patterns = (
        # Bare page numbers ("Page 1", "12", "3/10"). The numbered-heading
        # and "Page N ..." rules below both need text after the number, so
        # a page number standing alone on its line needs its own rule.
        re.compile(r'^(Page\s*\d+|\d+(/\d+)?\s*)$', ci),
        # Numbered headings like "1 Why Machine Learning Strategy".
        # This also covers lines starting with "0 ".
        re.compile(r'^\d+\s+.+$'),
        # Running footers like "Page 6 Machine Learning Yearning-Draft Andrew Ng".
        re.compile(r'^Page\s+\d+.+$', ci),
        # Chapter titles.
        re.compile(r'^(Chapter\s+\d+|CHAPTER\s+\w+)', ci),
        # Decorative rules (e.g., "-----", "====").
        re.compile(r'^[-=_]{3,}$'),
        # Dotted section numbers at any depth: "13.8 Topic", "1.2.4.1.2 Topic".
        re.compile(r'^\d+(\.\d+)+\s+.+$'),
        # Lines starting with "Topic Name" followed by an integer.
        re.compile(r'^Topic Name\s+\d+', ci),
        # Table / Figure / Theorem captions such as "Figure 1.2".
        re.compile(r'^(Table|Figure|Theorem)\s+\d+\.\d+', ci),
        # Author/title footers and any line ending in four digits.
        # NOTE(review): the last alternative also drops body lines that
        # happen to end in a year without a period - confirm this is intended.
        re.compile(r'^(Author Name|Book Title|.*\s*\d{4})$', ci),
    )

    kept_lines = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if any(pattern.match(line) for pattern in skip_patterns):
            continue
        kept_lines.append(line)

    return "\n".join(kept_lines)

def extract_paragraphs_with_continuation(pdf_path, output_csv_path):
    """
    Read a PDF page by page, rebuild its paragraphs and save them as CSV.

    Page text is filtered with remove_header_footer; the remaining
    non-empty lines are appended to a running buffer. When a line ends
    with '.', '!' or '?', the buffer is cleaned with clean_paragraph and
    stored. The buffer survives page boundaries, so text continuing onto
    the next page stays in the same paragraph. Exceptions are caught and
    printed rather than propagated.

    :param pdf_path: Path to the PDF file.
    :param output_csv_path: Path to the CSV file to save the extracted paragraphs.
    """
    finished = []
    buffer = ""

    try:
        with pdfplumber.open(pdf_path) as document:
            for page in document.pages:
                raw_text = page.extract_text()
                # Pages with no extractable text contribute no lines.
                page_lines = remove_header_footer(raw_text).splitlines() if raw_text else []

                for piece in (entry.strip() for entry in page_lines):
                    if not piece:
                        continue

                    # Join onto the open paragraph with a single space.
                    buffer = f"{buffer} {piece}" if buffer else piece

                    if piece.endswith(('.', '!', '?')):
                        finished.append(clean_paragraph(buffer))
                        buffer = ""

            # Keep a trailing paragraph that never hit sentence-ending punctuation.
            if buffer:
                finished.append(clean_paragraph(buffer))

        with open(output_csv_path, mode='w', encoding='utf-8', newline='') as csv_file:
            rows = [["Paragraph"]] + [[entry] for entry in finished]
            csv.writer(csv_file).writerows(rows)

        print(f"Paragraphs extracted and saved to {output_csv_path}")

    except Exception as e:
        print(f"Error: {e}")

# Usage: run the revised filter over the same PDF.
# NOTE: both locations are machine-specific absolute paths.
pdf_file, output_csv = (
    r"C:\Users\eanam\OneDrive\Desktop\AI3\AI_Russell_Norvig_removed.pdf",
    r"C:\Users\eanam\OneDrive\Desktop\AI3\extracted_paragraphs.csv",
)

extract_paragraphs_with_continuation(pdf_path=pdf_file, output_csv_path=output_csv)


Paragraphs extracted and saved to C:\Users\eanam\OneDrive\Desktop\AI3\extracted_paragraphs.csv


In [11]:
import pdfplumber
import csv
import re

def restore_spaces(text):
    """
    Re-insert spaces that were lost when words were glued together.

    Only boundaries that can be recognised from the characters themselves
    are split:

    * lowercase followed by uppercase   ("wordAnd"   -> "word And")
    * letter followed by digit          ("word123"   -> "word 123")
    * digit followed by letter          ("123word"   -> "123 word")
    * , ; ! ? directly before a letter  ("word,word" -> "word, word")
    * a period between a lowercase and an uppercase letter
      ("end.Next" -> "end. Next"); "e.g." or "file.txt" are left alone

    Two all-lowercase words that were concatenated ("wordword") cannot be
    separated by a regex, so lowercase runs are deliberately left untouched
    instead of being split between every pair of letters.

    :param text: A line of extracted text.
    :return: The text with spaces restored and whitespace collapsed to
             single spaces, with no leading or trailing whitespace.
    """
    # camelCase-style boundary between two words.
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    # Boundaries between letters and digits, in both directions.
    text = re.sub(r'([a-zA-Z])([0-9])', r'\1 \2', text)
    text = re.sub(r'([0-9])([a-zA-Z])', r'\1 \2', text)

    # The space belongs AFTER the punctuation mark, and only when a letter
    # follows immediately (already-spaced text is left as is).
    text = re.sub(r'([,;!?])(?=[a-zA-Z])', r'\1 ', text)
    # Restrict the period rule to sentence boundaries so abbreviations and
    # dotted names are not broken apart.
    text = re.sub(r'(?<=[a-z])\.(?=[A-Z])', '. ', text)

    # Normalize multiple spaces to a single space.
    return re.sub(r'\s+', ' ', text).strip()

def extract_raw_text_to_csv(pdf_path, output_csv_path):
    """
    Dump the text lines of a PDF to a one-column CSV.

    Each page's extracted text is split into lines; blank lines are
    skipped and every remaining line is stripped and passed through
    restore_spaces before being stored. The CSV gets a "Line" header
    followed by one row per line. Exceptions are caught and printed
    rather than propagated.

    :param pdf_path: Path to the PDF file.
    :param output_csv_path: Path to the CSV file to save the extracted text.
    """
    try:
        rows = []

        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if not page_text:
                    continue  # nothing extractable on this page

                for raw in page_text.splitlines():
                    trimmed = raw.strip()
                    if trimmed:
                        rows.append(restore_spaces(trimmed))

        # Header row first, then one extracted line per row.
        with open(output_csv_path, mode='w', encoding='utf-8', newline='') as csv_file:
            out = csv.writer(csv_file)
            out.writerow(["Line"])
            out.writerows([entry] for entry in rows)

        print(f"Raw text extracted and saved to {output_csv_path}")

    except Exception as e:
        print(f"Error: {e}")

# Usage: write every extracted line of the PDF to its own CSV row.
# NOTE: both locations are machine-specific absolute paths.
pdf_file, output_csv = (
    r"C:\Users\eanam\OneDrive\Desktop\AI3\AI_Russell_Norvig_removed.pdf",
    r"C:\Users\eanam\OneDrive\Desktop\AI3\extracted_raw_text.csv",
)

extract_raw_text_to_csv(pdf_path=pdf_file, output_csv_path=output_csv)


Raw text extracted and saved to C:\Users\eanam\OneDrive\Desktop\AI3\extracted_raw_text.csv
