In [None]:
import os
"""
PDF File Processor

This script:
1. Scans a specified directory (ww_private/vDaten/.test/) for PDF files
2. Collects all PDF file paths into a list
3. Creates an output directory (md) for markdown files
4. Prints the list of found PDF files

Note: Currently sets up the file processing pipeline but doesn't 
      perform actual PDF-to-markdown conversion yet.
"""

datarel = "ww_private/vDaten/.test/"
# print files and dir in datarel directory
# Get the list of files and directories in the datarel directory
# datarel is a relative path

contents = os.listdir(os.path.join(os.getcwd(), datarel))

# Print each item in the contents
#for item in contents:
#    print(item)


# alright, put all pdf's in a list
# convert all pdf's to markdown
# save all markdown's in a new directory
# Create a list to store PDF file paths
pdf_files = []

# Iterate through the contents of the data directory
for item in contents:
    # Check if the file has a .pdf extension
    if item.lower().endswith('.pdf'):
        # Add the full path of the PDF file to the list
        pdf_files.append(os.path.join(os.getcwd(), datarel, item))

# Print the list of PDF files found
print(f"Found {len(pdf_files)} PDF files:")
for pdf_file in pdf_files:
    print(pdf_file)


# Create a new directory to store markdown files, relative to the data dir
markdown_dir = os.path.join(os.getcwd(), datarel, "md")
#markdown_dir = "markdown_output"
os.makedirs(markdown_dir, exist_ok=True)


Found 2 PDF files:
/Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_02002_AAB_EV.pdf
/Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_01002_AAB_EV.pdf


In [5]:

extractor = "pypandoc"


# Import necessary libraries for PDF to Markdown conversion
import pypandoc
from PyPDF2 import PdfReader

# Function to convert PDF to Markdown
def pdf_to_markdown_pypandoc(pdf_path, output_dir):
    # Extract the filename without extension
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_path = os.path.join(output_dir, f"{base_name}.{extractor}.md")
    
    # Read PDF content
    with open(pdf_path, 'rb') as file:
        pdf = PdfReader(file)
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    
    # Convert text to Markdown using pypandoc
    markdown = pypandoc.convert_text(text, 'md', format='markdown')
    
    # Write Markdown content to file
    with open(output_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown)
    
    return output_path

# Convert each PDF to Markdown
for pdf_file in pdf_files:
    markdown_file = pdf_to_markdown_pypandoc(pdf_file, markdown_dir)
    print(f"Converted {pdf_file} to {markdown_file}")

print(f"All PDF files have been converted to Markdown and saved in the '{markdown_dir}' directory.")


Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_02002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_02002_AAB_EV.pypandoc.md
Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_01002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_01002_AAB_EV.pypandoc.md
All PDF files have been converted to Markdown and saved in the '/Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md' directory.


In [7]:
import fitz  # PyMuPDF
import markdownify
import re

extractor = "fitz"


def clean_text(text):
    # Directly remove unnecessary escape sequences
    text = text.replace(r'\.', '.')
    text = text.replace(r'\-', '-')
    text = text.replace(r'\+', '+')
    
    # Replace hyphenated line breaks with proper words
    text = re.sub(r'\-\n', '', text)
    
    # Replace multiple newlines with two newlines (Markdown paragraph separator)
    text = re.sub(r'\n{2,}', '\n\n', text)
    
    # Remove excessive whitespace
    text = re.sub(r' +', ' ', text)
    
    return text

def convert_links(text):
    # Convert URLs to markdown links
    text = re.sub(r'(https?://[^\s]+)', r'[\1](\1)', text)
    
    # Convert emails to mailto links
    text = re.sub(r'(\S+@\S+\.\S+)', r'[\1](mailto:\1)', text)
    
    return text


def pdf_to_markdown_fitz(pdf_path, output_dir):
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    markdown_path = os.path.join(output_dir, f"{base_name}.{extractor}.md")

    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Initialize an empty string to store the text
    pdf_text = ""

    # Iterate through each page and extract text
    for page_num in range(pdf_document.page_count):
        page = pdf_document.load_page(page_num)
        pdf_text += page.get_text("text")
    
    # Clean the extracted text

    # Convert extracted text to markdown
    markdown_text = markdownify.markdownify(pdf_text, heading_style="ATX")
    # Convert URLs and emails to markdown links
    markdown_text = convert_links(markdown_text)    
    markdown_text = clean_text(markdown_text)
    # Write the markdown text to a file
    with open(markdown_path, "w") as markdown_file:
        markdown_file.write(markdown_text)

    return markdown_path

# Example usage

# Convert each PDF to Markdown
for pdf_file in pdf_files:
    markdown_file = pdf_to_markdown_fitz(pdf_file, markdown_dir)
    print(f"Converted {pdf_file} to {markdown_file}")

print(f"All PDF files have been converted to Markdown and saved in the '{markdown_dir}' directory.")


Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_02002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_02002_AAB_EV.fitz.md
Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_01002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_01002_AAB_EV.fitz.md
All PDF files have been converted to Markdown and saved in the '/Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md' directory.


In [9]:
import re
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextContainer, LTChar

extractor = "pdfminer"

def extract_text_with_style(pdf_path):
    text_elements = []
    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if isinstance(element, LTTextContainer):
                for text_line in element:
                    line_text = ""
                    line_font_size = None
                    for character in text_line:
                        if isinstance(character, LTChar):
                            line_text += character.get_text()
                            if line_font_size is None:
                                line_font_size = round(character.size)
                    text_elements.append((line_text.strip(), line_font_size))
    return text_elements

def determine_heading_level(font_size, base_font_size):
    if font_size >= base_font_size * 1.5:
        return 1
    elif font_size >= base_font_size * 1.3:
        return 2
    elif font_size >= base_font_size * 1.1:
        return 3
    return 0

def convert_to_markdown(text_elements):
    markdown_text = ""
    base_font_size = min(element[1] for element in text_elements if element[1] is not None)
    
    for text, font_size in text_elements:
        if not text:
            continue
        
        heading_level = determine_heading_level(font_size, base_font_size)
        
        if heading_level > 0:
            markdown_text += f"{'#' * heading_level} {text}\n\n"
        else:
            markdown_text += f"{text}\n\n"
    
    return markdown_text

def clean_markdown(markdown_text):
    # Remove extra newlines
    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
    
    # Add proper line breaks for list items
    markdown_text = re.sub(r'(\n[â€¢\-*] .+?)(\n{2,})', r'\1\n', markdown_text)
    
    return markdown_text.strip()

def pdf_to_markdown_pdfminer(pdf_path, output_dir):
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    markdown_path = os.path.join(output_dir, f"{base_name}.{extractor}.md")

    text_elements = extract_text_with_style(pdf_path)
    markdown_text = convert_to_markdown(text_elements)
#    cleaned_markdown = markdown_text
    cleaned_markdown = clean_markdown(markdown_text)
    
    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write(cleaned_markdown)

    return markdown_path
# Usage

# Convert each PDF to Markdown
for pdf_file in pdf_files:
    markdown_file = pdf_to_markdown_pdfminer(pdf_file, markdown_dir)
    print(f"Converted {pdf_file} to {markdown_file}")

print(f"All PDF files have been converted to Markdown and saved in the '{markdown_dir}' directory.")

TypeError: 'LTChar' object is not iterable

In [10]:
import os
import re
from pdfminer.high_level import extract_text
from pdfminer.layout import LAParams

# Main script
#pdf_dir = 'path/to/your/pdf/directory'
# Create the output directory if it doesn't exist
extractor = "pdfminer2"


def pdf_to_markdown(pdf_path, mode='clean'):
    # Extract text from PDF
    laparams = LAParams(line_margin=0.5)
    text = extract_text(pdf_path, laparams=laparams)
    
    # Split text into lines
    lines = text.split('\n')
    
    markdown = ""
    current_heading_level = 0
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        if mode == 'headingstructure':
            # Check if line is a potential heading
            if re.match(r'^[IVX]+\.', line):  # Roman numeral headings
                current_heading_level = 1
                markdown += f"# {line}\n\n"
            elif re.match(r'^[A-Z]\.', line):  # Letter headings
                current_heading_level = 2
                markdown += f"## {line}\n\n"
            elif re.match(r'^\d+\.', line):  # Numbered headings
                current_heading_level = 3
                markdown += f"### {line}\n\n"
            elif line.isupper():  # All caps lines as subheadings
                current_heading_level = 4
                markdown += f"#### {line}\n\n"
            else:
                # Regular text
                if current_heading_level > 0:
                    markdown += "\n"  # Add extra newline after headings
                    current_heading_level = 0
                markdown += f"{line}\n"
        else:  # 'clean' mode
            # Remove any special characters or formatting
            clean_line = re.sub(r'[^\w\s]', '', line)
            markdown += f"{clean_line}\n"
    
    return markdown

def pdf_to_markdown_pdfminer(pdf_path, output_dir, mode='clean'):
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    markdown_path = os.path.join(output_dir, f"{base_name}.{extractor}.md")
    
    markdown_text = pdf_to_markdown(pdf_path, mode)
    
    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write(markdown_text)
    
    return markdown_path


# Convert each PDF to Markdown
for pdf_file in pdf_files:
#    pdf_path = os.path.join(pdf_dir, pdf_file)
    markdown_file = pdf_to_markdown_pdfminer(pdf_file, markdown_dir,mode='headingstructure')
    print(f"Converted {pdf_file} to {markdown_file}")

print(f"All PDF files have been converted to Markdown and saved in the '{markdown_dir}' directory.")

Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_02002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_02002_AAB_EV.pdfminer2.md
Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_01002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_01002_AAB_EV.pdfminer2.md
All PDF files have been converted to Markdown and saved in the '/Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md' directory.


In [11]:
import pdfplumber
import markdownify
import os
import re

extractor = "pdfplumber"


def clean_text(text):
    # Directly remove unnecessary escape sequences
    text = text.replace(r'\.', '.')
    text = text.replace(r'\-', '-')
    text = text.replace(r'\+', '+')
    
    # Replace hyphenated line breaks with proper words
    text = re.sub(r'\-\n', '', text)
    
    # Replace multiple newlines with two newlines (Markdown paragraph separator)
    text = re.sub(r'\n{2,}', '\n\n', text)
    
    # Remove excessive whitespace
    text = re.sub(r' +', ' ', text)
    
    return text

def pdf_to_text_with_pdfplumber(pdf_path):
    extracted_text = ""
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Extract text from each page
            page_text = page.extract_text()
            if page_text:
                extracted_text += page_text + "\n\n"
    
    return extracted_text

def save_to_markdown(text, base_name, output_dir):
    # Convert extracted text to markdown

    markdown_text = markdownify.markdownify(text, heading_style="ATX")
    markdown_text = clean_text(markdown_text)
    markdown_path = os.path.join(output_dir, f"{base_name}.{extractor}.md")

    # Save the markdown text to a file
    with open(markdown_path, "w") as md_file:
        md_file.write(markdown_text)
    
    return markdown_path

for pdf_file in pdf_files:
    base_name = os.path.splitext(os.path.basename(pdf_file))[0]
    extracted_text = pdf_to_text_with_pdfplumber(pdf_file)  # Extract text from the PDF
    markdown_file = save_to_markdown(extracted_text, base_name, markdown_dir)  # Save text as Markdown
    
    print(f"Converted {pdf_file} to {markdown_file}")

print(f"All PDF files have been converted to Markdown and saved in the '{markdown_dir}' directory.")

Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_02002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_02002_AAB_EV.pdfplumber.md
Converted /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/2023_01002_AAB_EV.pdf to /Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md/2023_01002_AAB_EV.pdfplumber.md
All PDF files have been converted to Markdown and saved in the '/Users/johannwaldherr/code/ww/ww_private/vDaten/.test/md' directory.
