## Join the markdown files into one

In [2]:
import os

first_page = 429
last_page = 473
source_dir = 'markdown_ua'
dest_dir = 'output'


def join_md_files(first_page, last_page, source_dir, dest_dir):
    # Ensure the destination directory exists
    os.makedirs(dest_dir, exist_ok=True)

    # Define the output file name based on the first and last page numbers
    output_file_name = f'pages_{first_page}-{last_page}_ua.md'
    output_file_path = os.path.join(dest_dir, output_file_name)

    # Open the output file in write mode
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        # Loop through the range of pages
        for page in range(first_page, last_page + 1):
            # Format the page number with leading zeros
            page_str = str(page).zfill(3)
            # Construct the input file path
            input_file_path = os.path.join(
                source_dir, f'page_{page_str}_ua.md')

            # Check if the input file exists before trying to read it
            if os.path.exists(input_file_path):
                # Open the input file and read its content
                with open(input_file_path, 'r', encoding='utf-8') as input_file:
                    content = input_file.read()
                    # Write the content to the output file
                    output_file.write(content)
                    # Optionally, add a newline between files' content if needed
                    output_file.write('\n')
            else:
                print(f'Warning: File {input_file_path} does not exist and was skipped.')

    print(f'All files successfully joined into {output_file_path}')
    return output_file_path


markdown_file_path = join_md_files(first_page, last_page, source_dir, dest_dir)

All files successfully joined into output/pages_429-473_ua.md


## Approach 1: Markdown LateX to PDF (fails for complex formulas)

Install pandoc (Linux)

In [None]:
!apt-get update
!apt-get install pandoc texlive-latex-base


Install pandoc (Macos)

In [20]:
!brew install pandoc

[34m==>[0m [1mSearching for similarly named formulae and casks...[0m
[31mError:[0m No formulae or casks found for xelatex.


To parse LateX on Mac, you would also need to install https://www.tug.org/mactex/ 

In [2]:
!pandoc --from=markdown --to=pdf --output=./output/pages_419-427_ua.pdf ./output/pages_419-427_ua.md

Error producing PDF.
! LaTeX Error: Unicode character Т (U+0422)
               not set up for use with LaTeX.

See the LaTeX manual or LaTeX Companion for explanation.
Type  H <return>  for immediate help.
 ...                                              
                                                  
l.68 Т



In [7]:
import subprocess
import os

source_file = './output/pages_419-427_ua.md'

def convert_md_to_pdf(source=markdown_file_path):
    # Construct the Pandoc command
    destination = os.path.splitext(source)[0] + '.pdf'
    command = ['pandoc', source, '-o', destination]

    try:
        # Execute the command
        subprocess.run(command, check=True)
        print(f'Successfully converted {source} to {destination}')
    except subprocess.CalledProcessError as e:
        # Handle errors in the conversion process
        print(f'Error during conversion: {e}')


convert_md_to_pdf(source_file)

FileNotFoundError: [Errno 2] No such file or directory: 'pandoc'

## Approach 2: LateX to images

In [3]:
%pip install matplotlib -q

Note: you may need to restart the kernel to use updated packages.


In [12]:
import re
import os
import matplotlib.pyplot as plt

# Config

first_page = 429
last_page = 473
source_dir = 'output'
dest_dir = 'output_with_img'

markdown_file = f'./output/pages_{first_page}-{last_page}_ua.md'
img_output_dir = f'./{dest_dir}/img'
output_markdown_file = f'./{dest_dir}/pages_{first_page}-{last_page}_ua.md'
failed_formulas_file = f'./{dest_dir}/failed_formulas_{first_page}-{last_page}.md'

# List to store failed LaTeX formulas
failed_formulas = []


def generate_latex_image(latex_string, output_path):
    """
    Generate an image from a LaTeX string.

    Parameters:
    latex_string (str): The LaTeX string to render.
    output_path (str): The file path where the image will be saved.
    """
    # Create a figure with minimal padding
    fig = plt.figure(figsize=(0.1, 0.1), dpi=300)
    ax = fig.add_subplot(111)

    # Hide the axis
    ax.axis('off')

    # Render the LaTeX string
    ax.text(0.5, 0.5, f'${latex_string}$',
            fontsize=12, ha='center', va='center')

    # Adjust the margins
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

    try:
        # Save the image with minimal padding
        plt.savefig(output_path, bbox_inches='tight',
                    pad_inches=0, transparent=True)
        plt.close(fig)
    except ValueError as e:
        print(f"Error generating image for LaTeX string: {latex_string}")
        print(e)
        plt.close(fig)
        # Append the failed formula to the list
        failed_formulas.append(latex_string)


def convert_latex_to_images(markdown_file, img_output_dir, output_markdown_file):
    """
    Convert LaTeX strings in a markdown file to images and replace them with image links.

    Parameters:
    markdown_file (str): Path to the input markdown file.
    img_output_dir (str): Directory to save the generated images.
    output_markdown_file (str): Path to the output markdown file with replaced LaTeX strings.
    """
    if not os.path.exists(img_output_dir):
        os.makedirs(img_output_dir)

    # Read the markdown file
    with open(markdown_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Find all block LaTeX strings in the markdown file
    block_latex_patterns = re.findall(r'\$\$(.*?)\$\$', content, re.DOTALL)

    for i, latex_string in enumerate(block_latex_patterns):
        latex_string = latex_string.strip()

        if '\\tag{' in latex_string:
            print(f"Skipping LaTeX string with unsupported command: {
                  latex_string}")
            failed_formulas.append(latex_string)
            continue

        image_name = f'latex_image_block_{i+1}.png'
        image_path = os.path.join(img_output_dir, image_name)

        # Generate image from LaTeX string
        generate_latex_image(latex_string, image_path)

        # Replace LaTeX string with image link in markdown content using regex
        image_link = f'\n\n![LaTeX]({os.path.join("img", image_name)})\n\n'
        content = re.sub(rf'\$\$\s*{re.escape(latex_string)}\s*\$\$',
                         image_link, content, count=1, flags=re.DOTALL)

    # Find all inline LaTeX strings in the markdown file
    inline_latex_patterns = re.findall(r'\$(.*?)\$', content)

    for i, latex_string in enumerate(inline_latex_patterns):
        latex_string = latex_string.strip()

        if '\\tag{' in latex_string:
            print(f"Skipping LaTeX string with unsupported command: {
                  latex_string}")
            failed_formulas.append(latex_string)
            continue

        image_name = f'latex_image_inline_{i+1}.png'
        image_path = os.path.join(img_output_dir, image_name)

        # Generate image from LaTeX string
        generate_latex_image(latex_string, image_path)

        # Replace LaTeX string with image link in markdown content using regex
        image_link = f' ![LaTeX]({os.path.join("img", image_name)}) '
        content = re.sub(
            rf'\$\s*{re.escape(latex_string)}\s*\$', image_link, content, count=1)

    # Save the modified markdown file
    with open(output_markdown_file, 'w', encoding='utf-8') as file:
        file.write(content)

    # Save the failed LaTeX formulas to a markdown file
    with open(failed_formulas_file, 'w', encoding='utf-8') as file:
        for formula in failed_formulas:
            file.write(f'$$ {formula} $$\n\n')


convert_latex_to_images(markdown_file, img_output_dir, output_markdown_file)

Skipping LaTeX string with unsupported command: E[f_j] = r_j \cdot Q_i \tag{12.29}
Skipping LaTeX string with unsupported command: \text{Objective}(S, \mathcal{Y}) = \sum_{X_i \in S, X_i \overset{\leftarrow}{Y_j}} \text{dist}(X_i, Y_j). \tag{12.31}
Skipping LaTeX string with unsupported command: n = \frac{R^2 \cdot \ln(1/\delta)}{2 \epsilon^2}
\tag{12.32}
Skipping LaTeX string with unsupported command: id_i = \log(n/n_i). \tag{13.1}
Skipping LaTeX string with unsupported command: h(x_i) = f(x_i) id_i. \tag{13.2}
Skipping LaTeX string with unsupported command: \cos(X, Y) = \frac{\sum_{i=1}^{d} h(x_i) h(y_i)}{\sqrt{\sum_{i=1}^{d} h(x_i)^2} \sqrt{\sum_{i=1}^{d} h(y_i)^2}} \tag{13.3}
Skipping LaTeX string with unsupported command: J(X, Y) = \frac{\sum_{i=1}^{d} h(x_i) h(y_i)}{\sum_{i=1}^{d} h(x_i)^2 + \sum_{i=1}^{d} h(y_i)^2 - \sum_{i=1}^{d} h(x_i) h(y_i)} \tag{13.4}
Skipping LaTeX string with unsupported command: P(w_j|G_m) = \frac{\sum_X P(G_m|X) \cdot I(X, w_j)}{\sum_X P(G_m|X)} \tag{13

In [18]:
import re
import os
import matplotlib.pyplot as plt

# Config

first_page = 429
last_page = 473
source_dir = 'output'
dest_dir = 'output_with_img'

markdown_file = f'./output/pages_{first_page}-{last_page}_ua.md'
img_output_dir = f'./{dest_dir}/img'
output_markdown_file = f'./{dest_dir}/pages_{first_page}-{last_page}_ua.md'
failed_formulas_file = f'./{dest_dir}/failed_formulas_{first_page}-{last_page}.md'

# List to store failed LaTeX formulas
failed_formulas = []


def generate_latex_image(latex_string, output_path):
    """
    Generate an image from a LaTeX string.

    Parameters:
    latex_string (str): The LaTeX string to render.
    output_path (str): The file path where the image will be saved.
    """
    # Create a figure with minimal padding
    fig = plt.figure(figsize=(0.1, 0.1), dpi=300)
    ax = fig.add_subplot(111)

    # Hide the axis
    ax.axis('off')

    # Render the LaTeX string
    ax.text(0.5, 0.5, f'${latex_string}$',
            fontsize=12, ha='center', va='center')

    # Adjust the margins
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

    try:
        # Save the image with minimal padding
        plt.savefig(output_path, bbox_inches='tight',
                    pad_inches=0, transparent=True)
        plt.close(fig)
    except ValueError as e:
        print(f"Error generating image for LaTeX string: {latex_string}")
        print(e)
        plt.close(fig)
        # Append the failed formula to the list
        failed_formulas.append(latex_string)


def process_tag(latex_string):
    """
    Remove \tag{} from LaTeX string and return the tag separately.

    Parameters:
    latex_string (str): The LaTeX string containing \tag{}.

    Returns:
    tuple: (cleaned_latex_string, tag)
    """
    tag_match = re.search(r'\\tag\{(.*?)\}', latex_string)
    if tag_match:
        tag = tag_match.group(1)
        latex_string = re.sub(r'\\tag\{.*?\}', '', latex_string)
        return latex_string.strip(), tag
    return latex_string, None


def convert_latex_to_images(markdown_file, img_output_dir, output_markdown_file):
    """
    Convert LaTeX strings in a markdown file to images and replace them with image links.

    Parameters:
    markdown_file (str): Path to the input markdown file.
    img_output_dir (str): Directory to save the generated images.
    output_markdown_file (str): Path to the output markdown file with replaced LaTeX strings.
    """
    if not os.path.exists(img_output_dir):
        os.makedirs(img_output_dir)

    # Read the markdown file
    with open(markdown_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Find all block LaTeX strings in the markdown file
    block_latex_patterns = re.findall(r'\$\$(.*?)\$\$', content, re.DOTALL)

    for i, latex_string in enumerate(block_latex_patterns):
        latex_string, tag = process_tag(latex_string)
        image_name = f'latex_image_block_{i+1}.png'
        image_path = os.path.join(img_output_dir, image_name)

        # Generate image from LaTeX string
        generate_latex_image(latex_string, image_path)

        # Replace LaTeX string with image link in markdown content using regex
        if tag:
            image_link = (
                f'<figure>\n'
                f'    <img src="{os.path.join("img", image_name)}" alt="{
                    tag}">\n'
                f'    <figcaption>({tag})</figcaption>\n'
                f'</figure>\n'
            )
        else:
            image_link = f'\n\n![LaTeX]({os.path.join("img", image_name)})\n\n'

        content = re.sub(rf'\$\$\s*{re.escape(latex_string)}\s*\$\$',
                         image_link, content, count=1, flags=re.DOTALL)

    # Find all inline LaTeX strings in the markdown file
    inline_latex_patterns = re.findall(r'\$(.*?)\$', content)

    for i, latex_string in enumerate(inline_latex_patterns):
        latex_string, tag = process_tag(latex_string)
        image_name = f'latex_image_inline_{i+1}.png'
        image_path = os.path.join(img_output_dir, image_name)

        # Generate image from LaTeX string
        generate_latex_image(latex_string, image_path)

        # Replace LaTeX string with image link in markdown content using regex
        if tag:
            image_link = (
                f'<figure>\n'
                f'    <img src="{os.path.join("img", image_name)}" alt="{
                    tag}">\n'
                f'    <figcaption>({tag})</figcaption>\n'
                f'</figure>\n'
            )
        else:
            image_link = f' ![LaTeX]({os.path.join("img", image_name)}) '

        content = re.sub(
            rf'\$\s*{re.escape(latex_string)}\s*\$', image_link, content, count=1)

    # Save the modified markdown file
    with open(output_markdown_file, 'w', encoding='utf-8') as file:
        file.write(content)

    # Save the failed LaTeX formulas to a markdown file
    with open(failed_formulas_file, 'w', encoding='utf-8') as file:
        for formula in failed_formulas:
            file.write(f'$$ {formula} $$\n\n')


convert_latex_to_images(markdown_file, img_output_dir, output_markdown_file)

Error generating image for LaTeX string: 

$$
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)
Error generating image for LaTeX string: 

$$
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)
Error generating image for LaTeX string: 

$$
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)
Error generating image for LaTeX string: 

$$
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)
Error generating image for LaTeX string: 

$$
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)
Error generating image for LaTeX string: 

$$
^
ParseException: Expected end of text, found '$'  (at char 0), (line:1, col:1)
Error generating image for LaTeX string: E[e^{t \cdot Z}] \le e^{t^2 \cdot (b-a)^2/8}

E[e^{t \cdot Z}] \le e^{t^2 \cdot (b-a)^2/8}
                 ^
ParseFatalException: Unknown symbol: \le, found '\'  (at char 17), (line:1, col:18)
Error gen

In [20]:
import re
import os
import matplotlib.pyplot as plt

# Config

first_page = 429
last_page = 473
source_dir = 'output'
dest_dir = 'output_with_img'

markdown_file = f'./output/pages_{first_page}-{last_page}_ua.md'
img_output_dir = f'./{dest_dir}/img'
output_markdown_file = f'./{dest_dir}/pages_{first_page}-{last_page}_ua.md'
failed_formulas_file = f'./{dest_dir}/failed_formulas_{first_page}-{last_page}.md'

# List to store failed LaTeX formulas
failed_formulas = []


def replace_problematic_latex_symbols(latex_string):
    """
    Replace problematic LaTeX commands with Unicode equivalents.

    Parameters:
    latex_string (str): The original LaTeX string.

    Returns:
    str: The modified LaTeX string with problematic symbols replaced.
    """
    replacements = {
        r'\le': '≤',
        # Add more replacements if needed
    }

    for latex_cmd, unicode_symbol in replacements.items():
        latex_string = latex_string.replace(latex_cmd, unicode_symbol)

    return latex_string


def generate_latex_image(latex_string, output_path):
    """
    Generate an image from a LaTeX string.

    Parameters:
    latex_string (str): The LaTeX string to render.
    output_path (str): The file path where the image will be saved.
    """
    # Replace problematic LaTeX symbols
    latex_string = replace_problematic_latex_symbols(latex_string)

    # Create a figure with minimal padding
    fig = plt.figure(figsize=(0.1, 0.1), dpi=300)
    ax = fig.add_subplot(111)

    # Hide the axis
    ax.axis('off')

    # Render the LaTeX string
    ax.text(0.5, 0.5, f'${latex_string}$',
            fontsize=12, ha='center', va='center')

    # Adjust the margins
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)

    try:
        # Save the image with minimal padding
        plt.savefig(output_path, bbox_inches='tight',
                    pad_inches=0, transparent=True)
        plt.close(fig)
    except ValueError as e:
        print(f"Error generating image for LaTeX string: {latex_string}")
        print(e)
        plt.close(fig)
        # Append the failed formula to the list
        failed_formulas.append(latex_string)


def process_tag(latex_string):
    """
    Remove \tag{} from LaTeX string and return the tag separately.

    Parameters:
    latex_string (str): The LaTeX string containing \tag{}.

    Returns:
    tuple: (cleaned_latex_string, tag)
    """
    tag_match = re.search(r'\\tag\{(.*?)\}', latex_string)
    if tag_match:
        tag = tag_match.group(1)
        latex_string = re.sub(r'\\tag\{.*?\}', '', latex_string)
        return latex_string.strip(), tag
    return latex_string, None


def convert_latex_to_images(markdown_file, img_output_dir, output_markdown_file):
    """
    Convert LaTeX strings in a markdown file to images and replace them with image links.

    Parameters:
    markdown_file (str): Path to the input markdown file.
    img_output_dir (str): Directory to save the generated images.
    output_markdown_file (str): Path to the output markdown file with replaced LaTeX strings.
    """
    if not os.path.exists(img_output_dir):
        os.makedirs(img_output_dir)

    # Read the markdown file
    with open(markdown_file, 'r', encoding='utf-8') as file:
        content = file.read()

    # Find all block LaTeX strings in the markdown file
    block_latex_patterns = re.findall(r'\$\$(.*?)\$\$', content, re.DOTALL)

    for i, latex_string in enumerate(block_latex_patterns):
        latex_string, tag = process_tag(latex_string)
        image_name = f'latex_image_block_{i+1}.png'
        image_path = os.path.join(img_output_dir, image_name)

        # Generate image from LaTeX string
        generate_latex_image(latex_string, image_path)

        # Replace LaTeX string with image link in markdown content using regex
        if tag:
            image_link = (
                f'<figure>\n'
                f'    <img src="{os.path.join("img", image_name)}" alt="{
                    tag}">\n'
                f'    <figcaption>({tag})</figcaption>\n'
                f'</figure>\n'
            )
        else:
            image_link = f'\n\n![LaTeX]({os.path.join("img", image_name)})\n\n'

        content = re.sub(rf'\$\$\s*{re.escape(latex_string)}\s*\$\$',
                         image_link, content, count=1, flags=re.DOTALL)

    # Find all inline LaTeX strings in the markdown file
    inline_latex_patterns = re.findall(r'\$(.*?)\$', content)

    for i, latex_string in enumerate(inline_latex_patterns):
        latex_string, tag = process_tag(latex_string)
        image_name = f'latex_image_inline_{i+1}.png'
        image_path = os.path.join(img_output_dir, image_name)

        # Generate image from LaTeX string
        generate_latex_image(latex_string, image_path)

        # Replace LaTeX string with image link in markdown content using regex
        if tag:
            image_link = (
                f'<figure>\n'
                f'    <img src="{os.path.join("img", image_name)}" alt="{
                    tag}">\n'
                f'    <figcaption>({tag})</figcaption>\n'
                f'</figure>\n'
            )
        else:
            image_link = f' ![LaTeX]({os.path.join("img", image_name)}) '

        content = re.sub(
            rf'\$\s*{re.escape(latex_string)}\s*\$', image_link, content, count=1)

    # Save the modified markdown file
    with open(output_markdown_file, 'w', encoding='utf-8') as file:
        file.write(content)

    # Save the failed LaTeX formulas to a markdown file
    with open(failed_formulas_file, 'w', encoding='utf-8') as file:
        for formula in failed_formulas:
            file.write(f'$$ {formula} $$\n\n')


convert_latex_to_images(markdown_file, img_output_dir, output_markdown_file)

Error generating image for LaTeX string: \xi = \max_{u_1, \ldots, u_n} ≤ft\{ \frac{\sum_{i=1}^n u_i}{n} - \frac{1}{n} \sum_{i=1}^n u_i y_i W \cdot X_i \right\}.

\xi = \max_{u_1, \ldots, u_n} ≤ft\{ \frac{\sum_{i=1}^n u_i}{n} - \frac{1}{n} \sum_{i=1}^n u_i y_i W \cdot X_i \right\}.
                                                                                                              ^
ParseFatalException: Unknown symbol: \right, found '\'  (at char 110), (line:1, col:111)
Error generating image for LaTeX string: \xi = \frac{1}{n} \sum_{i=1}^n \max_{u_i} u_i ≤ft\{ \frac{1}{n} - \frac{1}{n} y_i W \cdot X_i \right\}.

\xi = \frac{1}{n} \sum_{i=1}^n \max_{u_i} u_i ≤ft\{ \frac{1}{n} - \frac{1}{n} y_i W \cdot X_i \right\}.
                                                                                              ^
ParseFatalException: Unknown symbol: \right, found '\'  (at char 94), (line:1, col:95)
Error generating image for LaTeX string: \xi = \frac{1}{n} \sum_{i=1}^n \max ≤ft\{ 0

In [7]:
%pip install sympy -q


Note: you may need to restart the kernel to use updated packages.


In [29]:
import subprocess
import os


def convert_md_to_docx(input_md, output_docx):
    """
    Convert a markdown file to a DOCX file using Pandoc.

    Parameters:
    input_md (str): Path to the input markdown file.
    output_docx (str): Path to the output DOCX file.
    """
    # Ensure the output directory exists
    output_dir = os.path.dirname(output_docx)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Construct the pandoc command
    command = [
        'pandoc',
        input_md,
        '-o', output_docx,
        '--resource-path', os.path.dirname(input_md)
    ]

    try:
        # Run the pandoc command
        subprocess.run(command, check=True)
        print(f'Successfully converted {input_md} to {output_docx}')
    except subprocess.CalledProcessError as e:
        print(f'Error during conversion: {e}')


# Example usage
input_md = './output_with_img/pages_419-427_ua.md'
output_docx = './output_docx/pages_419-427_ua.docx'
convert_md_to_docx(input_md, output_docx)

Successfully converted ./output_with_img/pages_419-427_ua.md to ./output_docx/pages_419-427_ua.docx
