# First Step PDF Text to Markdown Extraction

In [9]:
# Import required libraries
import pdfplumber
import re
import os
from pathlib import Path

In [10]:

def extract_text_with_formatting(pdf_path):
    """
    Extract text from PDF while preserving original formatting and line spaces
    Returns a list of pages with text content
    """
    pages = []
    
    # Open PDF with pdfplumber
    with pdfplumber.open(pdf_path) as pdf:
        # Process each page
        for page in pdf.pages:
            # Extract text with original positioning
            text = page.extract_text(x_tolerance=3)  # Adjust x_tolerance to better detect columns
            
            # Split into lines while preserving empty lines
            lines = text.splitlines()
            formatted_lines = []
            
            # Track indentation and line spacing
            prev_line_empty = False
            
            for line in lines:
                # Calculate leading spaces for indentation
                leading_spaces = len(line) - len(line.lstrip())
                preserved_spaces = ' ' * leading_spaces
                
                # Preserve empty lines
                if not line.strip():
                    formatted_lines.append('')
                    prev_line_empty = True
                    continue
                
                # Handle different types of lines
                line_content = line.strip()
                
                # Check for list markers
                list_match = re.match(r'^(\s*[•●○\-*]|\s*\d+[\.\)])\s+(.+)$', line)
                if list_match:
                    # Preserve original list marker and indentation
                    formatted_lines.append(f"{preserved_spaces}{line_content}")
                else:
                    # Check for potential headers (all caps or numbered sections)
                    if (line_content.isupper() and len(line_content) > 1) or \
                       re.match(r'^\d+\.\d*\s+[A-Z]', line_content):
                        formatted_lines.append(f"{preserved_spaces}## {line_content}")
                    else:
                        formatted_lines.append(f"{preserved_spaces}{line_content}")
                
                prev_line_empty = False
            
            # Join lines back together, preserving empty lines
            formatted_page = '\n'.join(formatted_lines)
            
            # Preserve paragraph breaks
            formatted_page = re.sub(r'\n{3,}', '\n\n', formatted_page)
            
            # Add extra newline at the end of each page
            pages.append(formatted_page + '\n')
    
    return pages

def save_markdown_chunks(pages, output_dir, pages_per_file=100):
    """
    Save pages to markdown files, with each file containing the specified number of pages
    """
    # Create output directory if it doesn't exist
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Calculate number of files needed
    num_files = (len(pages) + pages_per_file - 1) // pages_per_file
    
    # Save pages to files
    for i in range(num_files):
        start_page = i * pages_per_file
        end_page = min((i + 1) * pages_per_file, len(pages))
        
        # Join pages with clear page separators
        chunk_content = '\n\n' + '\n\n' + ('=' * 80) + '\n\nPage {}\n\n'.format(i + 1) + ('=' * 80) + '\n\n'.join(pages[start_page:end_page])
        
        # Create filename with page range
        filename = f'pages_{start_page + 1}-{end_page}.md'
        filepath = output_dir / filename
        
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(chunk_content)
            
        print(f'Saved {filename}')

def convert_pdf_to_markdown(pdf_path, output_dir):
    """
    Main function to convert PDF to markdown files
    """
    try:
        print(f"Processing PDF: {pdf_path}")
        pages = extract_text_with_formatting(pdf_path)
        print(f"Extracted {len(pages)} pages")
        
        save_markdown_chunks(pages, output_dir)
        print("Conversion completed successfully!")
        
    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
        raise  # Re-raise the exception to see the full error trace

# Example usage
if __name__ == "__main__":
    # Replace these paths with your actual PDF path and desired output directory
    pdf_path = "input.pdf"
    output_dir = "markdown_output"
    
    convert_pdf_to_markdown(pdf_path, output_dir)

Processing PDF: input.pdf
Extracted 1 pages
Saved pages_1-1.md
Conversion completed successfully!


In [12]:
# Example usage
if __name__ == "__main__":
    # Replace these paths with your actual PDF path and desired output directory
    pdf_path = "input-stock.pdf"
    output_dir = "markdown_output"
    
    convert_pdf_to_markdown(pdf_path, output_dir)

Processing PDF: input-stock.pdf
Extracted 363 pages
Saved pages_1-100.md
Saved pages_101-200.md
Saved pages_201-300.md
Saved pages_301-363.md
Conversion completed successfully!


# Second Step Translation of Markdown Text

In [20]:
#%pip install deep-translator

In [21]:
from deep_translator import GoogleTranslator
from pathlib import Path
import re
import time

In [24]:

class MarkdownTranslator:
    def __init__(self):
        self.translator = GoogleTranslator(source='zh-CN', target='en')
        # Regex patterns to identify markdown elements
        self.patterns = {
            'headers': r'^(#{1,6})\s+(.+)$',
            'code_blocks': r'```[\s\S]*?```',
            'inline_code': r'`[^`]+`',
            'links': r'\[([^\]]+)\]\(([^\)]+)\)',
            'images': r'!\[([^\]]*)\]\(([^\)]+)\)'
        }

    def preserve_markdown_elements(self, text):
        """
        Preserve markdown elements by replacing them with placeholders
        Returns the modified text and a dictionary of replacements
        """
        preserved = {}
        count = 0

        # Preserve code blocks first (they might contain other markdown elements)
        for match in re.finditer(self.patterns['code_blocks'], text, re.MULTILINE):
            placeholder = f'CODE_BLOCK_{count}'
            preserved[placeholder] = match.group(0)
            text = text.replace(match.group(0), placeholder)
            count += 1

        # Preserve inline code
        for match in re.finditer(self.patterns['inline_code'], text):
            placeholder = f'INLINE_CODE_{count}'
            preserved[placeholder] = match.group(0)
            text = text.replace(match.group(0), placeholder)
            count += 1

        # Preserve headers with their markdown symbols
        for match in re.finditer(self.patterns['headers'], text, re.MULTILINE):
            placeholder = f'HEADER_{count}'
            preserved[placeholder] = match.group(1)  # Save the # symbols
            text = text.replace(match.group(1), placeholder)
            count += 1

        # Preserve links and images
        for pattern_name in ['links', 'images']:
            for match in re.finditer(self.patterns[pattern_name], text):
                placeholder = f'{pattern_name.upper()}_{count}'
                preserved[placeholder] = (match.group(1), match.group(2))
                text = text.replace(match.group(0), 
                                  f"{'!' if pattern_name == 'images' else ''}[{placeholder}]({match.group(2)})")
                count += 1

        return text, preserved

    def restore_markdown_elements(self, text, preserved):
        """Restore markdown elements from their placeholders"""
        for placeholder, original in preserved.items():
            if isinstance(original, tuple):  # Links and images
                text = text.replace(f'[{placeholder}]({original[1]})',
                                  f'[{original[0]}]({original[1]})')
            else:  # Other elements
                text = text.replace(placeholder, original)
        return text

    def translate_text(self, text):
        """Translate text from Chinese to English with retry mechanism"""
        max_retries = 3
        delay = 1  # Initial delay in seconds
        
        # Split text into smaller chunks (max 5000 characters per chunk)
        chunks = [text[i:i+4900] for i in range(0, len(text), 4900)]
        translated_chunks = []
        
        for chunk in chunks:
            for attempt in range(max_retries):
                try:
                    translated_chunk = self.translator.translate(text=chunk)
                    translated_chunks.append(translated_chunk)
                    time.sleep(1)  # Add delay between chunks to avoid rate limiting
                    break  # Success, move to next chunk
                except Exception as e:
                    if attempt == max_retries - 1:  # Last attempt
                        raise Exception(f"Translation failed after {max_retries} attempts: {str(e)}")
                    time.sleep(delay)
                    delay *= 2  # Exponential backoff
        
        return ' '.join(translated_chunks)

    def translate_markdown_file(self, input_path, output_path):
        """Translate a markdown file from Chinese to English while preserving formatting"""
        try:
            # Read the input file
            with open(input_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Preserve markdown elements
            content, preserved = self.preserve_markdown_elements(content)

            # Split content into paragraphs for better translation
            paragraphs = content.split('\n\n')
            translated_paragraphs = []

            # Translate each paragraph
            for i, paragraph in enumerate(paragraphs):
                if paragraph.strip():  # Only translate non-empty paragraphs
                    try:
                        translated_paragraph = self.translate_text(paragraph)
                        translated_paragraphs.append(translated_paragraph)
                        print(f"Translated paragraph {i+1}/{len(paragraphs)}")
                    except Exception as e:
                        print(f"Error translating paragraph {i+1}: {str(e)}")
                        translated_paragraphs.append(paragraph)  # Keep original if translation fails
                else:
                    translated_paragraphs.append(paragraph)  # Keep empty paragraphs as-is

            # Join paragraphs back together
            translated_content = '\n\n'.join(translated_paragraphs)

            # Restore markdown elements
            final_content = self.restore_markdown_elements(translated_content, preserved)

            # Write the translated content to the output file
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(final_content)

            return True

        except Exception as e:
            print(f"Error processing {input_path}: {str(e)}")
            return False

def translate_markdown_directory(input_dir, output_dir):
    """
    Translate all markdown files in a directory from Chinese to English
    """
    # Create output directory if it doesn't exist
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Initialize translator
    translator = MarkdownTranslator()
    
    # Process all markdown files
    input_dir = Path(input_dir)
    markdown_files = list(input_dir.glob('*.md'))
    
    print(f"Found {len(markdown_files)} markdown files to translate")
    
    for input_file in markdown_files:
        print(f"\nTranslating {input_file.name}...")
        output_file = output_dir / f"{input_file.stem}_english.md"
        
        if translator.translate_markdown_file(input_file, output_file):
            print(f"Successfully translated {input_file.name} to {output_file.name}")
        else:
            print(f"Failed to translate {input_file.name}")


In [25]:
# Example usage
if __name__ == "__main__":
    input_directory = "markdown_output"  # Directory containing your Chinese markdown files
    output_directory = "markdown_translated"  # Directory where translated files will be saved
    
    translate_markdown_directory(input_directory, output_directory)

Found 5 markdown files to translate

Translating pages_201-300.md...
Translated paragraph 3/104
Translated paragraph 4/104
Translated paragraph 5/104
Translated paragraph 6/104
Translated paragraph 7/104
Translated paragraph 8/104
Translated paragraph 9/104
Translated paragraph 10/104
Translated paragraph 11/104
Translated paragraph 12/104
Translated paragraph 13/104
Translated paragraph 14/104
Translated paragraph 15/104
Translated paragraph 16/104
Translated paragraph 17/104
Translated paragraph 18/104
Translated paragraph 19/104
Translated paragraph 20/104
Translated paragraph 21/104
Translated paragraph 22/104
Translated paragraph 23/104
Translated paragraph 24/104
Translated paragraph 25/104
Translated paragraph 26/104
Translated paragraph 27/104
Translated paragraph 28/104
Translated paragraph 29/104
Translated paragraph 30/104
Translated paragraph 31/104
Translated paragraph 32/104
Translated paragraph 33/104
Translated paragraph 34/104
Translated paragraph 35/104
Translated par

# Third Step translated text to PDF conversion

In [27]:
#%pip install markdown weasyprint PyPDF2

In [28]:
import os
from pathlib import Path
import markdown
from weasyprint import HTML, CSS
from PyPDF2 import PdfMerger
import re

In [33]:

class MarkdownPDFConverter:
    def __init__(self):
        self.css = CSS(string='''
            @page {
                size: letter;
                margin: 1in 1.25in;
                @bottom-center {
                    content: counter(page);
                    font-family: Arial, sans-serif;
                    font-size: 10pt;
                }
            }
            
            body {
                font-family: 'Times New Roman', serif;
                font-size: 11pt;
                line-height: 1.5;
                color: #333333;
            }
            
            /* Headings */
            h1, h2, h3, h4, h5, h6 {
                font-family: Arial, sans-serif;
                color: #222222;
                margin-top: 1.2em;
                margin-bottom: 0.5em;
                line-height: 1.2;
                page-break-after: avoid;
            }
            
            h1 { font-size: 18pt; }
            h2 { font-size: 16pt; }
            h3 { font-size: 14pt; }
            h4 { font-size: 12pt; }
            h5 { font-size: 11pt; font-weight: bold; }
            h6 { font-size: 11pt; font-style: italic; }
            
            /* Paragraphs */
            p {
                margin: 0.8em 0;
                text-align: justify;
                hyphens: auto;
            }
            
            /* Lists */
            ul, ol {
                margin: 0.5em 0 0.5em 0;
                padding-left: 2em;
            }
            
            li {
                margin: 0.3em 0;
                line-height: 1.4;
                text-align: justify;
            }
            
            ul li {
                list-style-type: disc;
            }
            
            ul li li {
                list-style-type: circle;
            }
            
            /* Tables */
            table {
                width: 100%;
                border-collapse: collapse;
                margin: 1em 0;
                page-break-inside: avoid;
            }
            
            th {
                background-color: #f5f5f5;
                border: 1pt solid #dddddd;
                padding: 8pt;
                text-align: left;
            }
            
            td {
                border: 1pt solid #dddddd;
                padding: 8pt;
                vertical-align: top;
            }
            
            /* Remove page break markers */
            .page-break {
                display: none;
            }
            
            /* Hide old page numbers */
            .old-page-number {
                display: none;
            }
        ''')

    def clean_markdown_content(self, content):
        """Clean and prepare markdown content before conversion"""
        # Remove page break markers and old page numbers
        content = re.sub(r'={20,}[\s\S]*?={20,}', '', content)
        content = re.sub(r'Page \d+', '', content)
        
        # Split content into lines
        lines = content.split('\n')
        cleaned_lines = []
        
        # Track if we're in a list
        in_list = False
        
        for line in lines:
            # Skip empty lines at the start of the document
            if not cleaned_lines and not line.strip():
                continue
                
            # Remove extra header markers from regular text
            if re.match(r'^#+\s+.*$', line):
                # Check if this is really a title (shorter than 100 chars and not ending with period)
                text = re.sub(r'^#+\s+', '', line).strip()
                if len(text) > 100 or text.endswith('.'):
                    # This is probably not a title, remove the '#' markers
                    line = text
            
            # Preserve list formatting
            list_match = re.match(r'^(\s*)[•\-\*]\s', line)
            if list_match:
                in_list = True
                indent = len(list_match.group(1))
                # Ensure proper list formatting
                line = ' ' * indent + '* ' + line.lstrip()[1:].strip()
            elif in_list and line.strip():
                # Check if this line should be part of the previous list item
                if line.startswith('    '):
                    cleaned_lines[-1] += ' ' + line.strip()
                    continue
                else:
                    in_list = False
            
            cleaned_lines.append(line)
        
        return '\n'.join(cleaned_lines)

    def extract_page_number(self, filename):
        """Extract page number from filename for sorting"""
        match = re.search(r'pages_(\d+)-', filename)
        return int(match.group(1)) if match else 0

    def markdown_to_pdf(self, input_file, output_file):
        """Convert a single markdown file to PDF"""
        try:
            # Read markdown content
            with open(input_file, 'r', encoding='utf-8') as f:
                md_content = f.read()
            
            # Clean and prepare the content
            cleaned_content = self.clean_markdown_content(md_content)
            
            # Convert markdown to HTML
            html_content = markdown.markdown(
                cleaned_content,
                extensions=[
                    'extra',
                    'codehilite',
                    'tables',
                    'sane_lists',
                    'attr_list'
                ]
            )

            # Wrap HTML content
            full_html = f'''
                <!DOCTYPE html>
                <html>
                <head>
                    <meta charset="UTF-8">
                    <meta name="viewport" content="width=device-width, initial-scale=1.0">
                </head>
                <body>
                    {html_content}
                </body>
                </html>
            '''

            # Convert HTML to PDF
            HTML(string=full_html).write_pdf(
                output_file,
                stylesheets=[self.css]
            )
            return True

        except Exception as e:
            print(f"Error converting {input_file} to PDF: {str(e)}")
            return False

def convert_and_merge_files(input_dir, output_pdf):
    """
    Convert all markdown files to PDFs and merge them into a single file
    """
    try:
        # Create temporary directory for individual PDFs
        temp_dir = Path('temp_pdfs')
        temp_dir.mkdir(exist_ok=True)
        
        # Initialize converter
        converter = MarkdownPDFConverter()
        
        # Get all markdown files and sort them by page number
        input_dir = Path(input_dir)
        markdown_files = list(input_dir.glob('*_english.md'))
        markdown_files.sort(key=lambda x: converter.extract_page_number(x.name))
        
        print(f"Found {len(markdown_files)} markdown files to convert")
        
        # Convert each markdown file to PDF
        pdf_files = []
        for md_file in markdown_files:
            print(f"Converting {md_file.name} to PDF...")
            pdf_file = temp_dir / f"{md_file.stem}.pdf"
            
            if converter.markdown_to_pdf(md_file, pdf_file):
                pdf_files.append(pdf_file)
                print(f"Successfully converted {md_file.name}")
            else:
                print(f"Failed to convert {md_file.name}")
        
        # Merge PDFs
        if pdf_files:
            print("\nMerging PDFs...")
            merger = PdfMerger()
            
            for pdf_file in pdf_files:
                merger.append(str(pdf_file))
            
            # Write the merged PDF
            merger.write(output_pdf)
            merger.close()
            
            print(f"Successfully created merged PDF: {output_pdf}")
        
        # Cleanup temporary files
        for pdf_file in pdf_files:
            pdf_file.unlink()
        temp_dir.rmdir()
        
    except Exception as e:
        print(f"Error during conversion/merging: {str(e)}")
        raise


In [34]:
# Example usage
if __name__ == "__main__":
    input_directory = "markdown_translated"  # Directory containing translated markdown files
    output_pdf = "final_translated_document.pdf"  # Name of the final merged PDF
    
    convert_and_merge_files(input_directory, output_pdf)

Found 4 markdown files to convert
Converting pages_1-100_english.md to PDF...
Successfully converted pages_1-100_english.md
Converting pages_101-200_english.md to PDF...
Successfully converted pages_101-200_english.md
Converting pages_201-300_english.md to PDF...
Successfully converted pages_201-300_english.md
Converting pages_301-363_english.md to PDF...
Successfully converted pages_301-363_english.md

Merging PDFs...
Successfully created merged PDF: final_translated_document.pdf
