In [1]:
import docx
from pylatex import Document, Section, Subsection, Command
from pylatex.utils import NoEscape
from pathlib import Path

In [2]:
def find_word_documents(root_folder):
    """
    Recursively finds all .docx files in the given root_folder and its subfolders.

    Word creates hidden owner/lock files named ``~$<name>.docx`` next to any
    document that is currently open. They match the ``*.docx`` pattern but are
    not valid Word packages, so they are skipped here, as are directories whose
    name merely ends in ``.docx``.

    Args:
        root_folder (str or Path): The root directory to start searching from.

    Returns:
        list: A sorted list of ``Path`` objects for the .docx files found
        (empty if none are found).
    """
    root_path = Path(root_folder)
    word_documents = [
        path
        for path in root_path.rglob("*.docx")
        if path.is_file() and not path.name.startswith("~$")
    ]
    # Sorting makes the result independent of filesystem enumeration order.
    return sorted(word_documents)

In [3]:
# Collect every .docx under the specifications folder.
# NOTE: this is a hard-coded absolute Windows path; adjust it when running elsewhere.
list_of_word_docs = find_word_documents("D:\\MB Projects\\NS Standard Specs")

In [4]:
from markdownify import markdownify as md


def _heading_level(style_name):
    """Return the Markdown heading depth (1-6) for a Word style name, or None if it is not a numbered heading."""
    if not style_name or not style_name.startswith("Heading"):
        return None
    try:
        level = int(style_name.split()[-1])
    except ValueError:
        # e.g. a style called just "Heading": no level can be derived from it.
        return None
    # Word allows "Heading 1".."Heading 9"; Markdown only defines 1-6 leading '#'.
    return min(max(level, 1), 6)


def _run_to_markdown(run):
    """Return a run's text wrapped in the Markdown markers for its bold/italic/underline flags."""
    text = run.text
    core = text.strip()
    if not core:
        # Empty or whitespace-only runs would otherwise produce stray "****".
        return text

    # Emphasis markers must hug the text ("**bold** ", not "**bold **"),
    # so surrounding whitespace is re-attached outside the markers.
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]

    if run.bold:
        core = f"**{core}**"
    if run.italic:
        core = f"*{core}*"
    if run.underline:
        # Markdown has no underline syntax and "__text__" renders as bold,
        # so inline HTML is used instead.
        core = f"<u>{core}</u>"
    return f"{leading}{core}{trailing}"


def word_to_markdown(input_file_path, output_file_path):
    """
    Converts a Word document to a Markdown file, keeping headings and basic inline formatting.

    Paragraphs whose style is a numbered "Heading N" become Markdown headings
    (depth clamped to 6). Every other paragraph is rebuilt run by run so that
    bold, italic and underline are preserved. Only ``doc.paragraphs`` is read;
    hyperlink targets are not converted.

    Args:
    input_file_path (str or Path): The path to the input .docx file.
    output_file_path (str or Path): The path to the output .md file.
    """
    # Load the Word document
    doc = docx.Document(input_file_path)

    # Initialize a list to hold Markdown lines
    md_content = []

    for para in doc.paragraphs:
        style_name = para.style.name if para.style is not None else ""
        level = _heading_level(style_name)

        if level is not None:
            # Convert Word headings to Markdown headings
            md_content.append(f"{'#' * level} {para.text}")
        elif para.runs:
            # Process the paragraph for inline formatting
            md_content.append("".join(_run_to_markdown(run) for run in para.runs))
        else:
            # Paragraph without runs: fall back to its plain text
            md_content.append(para.text)

        # Add a blank line after each paragraph to preserve structure
        md_content.append("")

    # Join all the collected Markdown lines
    final_md = "\n".join(md_content)

    # Write the Markdown content to the output file
    with open(output_file_path, "w", encoding="utf-8") as md_file:
        md_file.write(final_md)
    print(f"Markdown document saved to: {output_file_path}")

In [5]:
# Write each Markdown file next to its source document, swapping ".docx" for ".md".
for document in list_of_word_docs:
    word_to_markdown(document, document.with_name(f"{document.stem}.md"))

Markdown document saved to: D:\MB Projects\NS Standard Specs\Culvert Std Spec DRAFT 08092023.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\Paint Specs_Updated_11022022.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\APPENDICES\CROSSHOLE SONIC LOGGING TESTING PROCEDURES.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\DIVISION 1 - GENERAL\NS SPECS-FRONT END.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\DIVISION 1 - GENERAL\VOID\NS SPECS-FRONT END-OLD.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\DIVISION 2 - SITE WORK\020000 - TRAFFIC CONTROL AND PROTECTION.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\DIVISION 2 - SITE WORK\020005 - TEMPORARY PAVEMENT MARKINGS.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\DIVISION 2 - SITE WORK\020010 - PERMANENT PAVEMENT MARKINGS.md
Markdown document saved to: D:\MB Projects\NS Standard Specs\DIVISION 2 - SITE WORK\020015 - SITE CLE

In [8]:
import os
import tkinter as tk
from tkinter import filedialog
import nbformat as nbf


class MarkdownToNotebookApp:
    """Tkinter window that merges selected Markdown files into one Jupyter notebook.

    The user picks a directory, the Markdown files found beneath it are listed,
    and the selected ones are written, one Markdown cell per file, into a
    single ``.ipynb`` file.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, root):
        """Build the widgets inside ``root`` (a ``tk.Tk`` or other Tk container)."""
        self.root = root
        self.root.title("Markdown to Jupyter Notebook Converter")

        # Currently selected directory; bound to the entry widget below.
        self.directory = tk.StringVar()

        # Entry for directory selection
        self.directory_entry = tk.Entry(root, textvariable=self.directory, width=50)
        self.directory_entry.pack(padx=10, pady=10, side=tk.LEFT)

        self.browse_button = tk.Button(
            root, text="Browse", command=self.browse_directory
        )
        self.browse_button.pack(padx=10, pady=10, side=tk.LEFT)

        # Creating a frame for the listbox with a scrollbar
        self.listbox_frame = tk.Frame(root)
        self.listbox_frame.pack(padx=10, pady=10, fill="both", expand=True)

        self.listbox = tk.Listbox(
            self.listbox_frame, selectmode=tk.EXTENDED, width=80, height=20
        )
        self.listbox.pack(side="left", fill="both", expand=True)

        self.scrollbar = tk.Scrollbar(self.listbox_frame, orient="vertical")
        self.scrollbar.config(command=self.listbox.yview)
        self.scrollbar.pack(side="right", fill="y")

        self.listbox.config(yscrollcommand=self.scrollbar.set)

        # Entry for output file name
        self.filename_frame = tk.Frame(root)
        self.filename_label = tk.Label(self.filename_frame, text="Filename:")
        self.filename_label.pack(padx=10, pady=10, side=tk.LEFT)

        self.filename_entry = tk.Entry(self.filename_frame)
        self.filename_entry.pack(padx=10, pady=10, side=tk.LEFT)

        self.filename_frame.pack(fill=tk.X)

        # Convert button
        self.convert_button = tk.Button(
            root, text="Convert", command=self.convert_files
        )
        self.convert_button.pack(padx=10, pady=10)

        # Maps the path shown in the listbox (relative to the chosen directory)
        # to the path used to open the file. A dict from the start, so it has
        # the same type before and after load_markdown_files() runs.
        self.markdown_files = {}

    def browse_directory(self):
        """Ask the user for a directory and list the Markdown files under it."""
        directory = filedialog.askdirectory()
        if not directory:
            # Dialog was cancelled: keep the current directory and listing.
            return
        self.directory.set(directory)
        self.load_markdown_files()

    def load_markdown_files(self):
        """Refill the listbox with every Markdown file under the selected directory."""
        # Clear any existing listbox items
        self.listbox.delete(0, tk.END)
        self.markdown_files = {}

        base_dir = self.directory.get()
        if not base_dir:
            return

        # Recursively load markdown files from the selected directory
        for current_dir, subdirs, files in os.walk(base_dir):
            # Sorting in place makes os.walk descend in a stable order.
            subdirs.sort()
            for file in sorted(files):
                # Case-insensitive so "README.MD" is listed too.
                if not file.lower().endswith(".md"):
                    continue
                absolute_path = os.path.join(current_dir, file)
                # Relative to base_dir, so top-level files show as "name.md"
                # rather than "./name.md".
                display_path = os.path.relpath(absolute_path, base_dir)
                self.markdown_files[display_path] = absolute_path
                self.listbox.insert(tk.END, display_path)

    def convert_files(self):
        """Write the selected Markdown files into a notebook, one Markdown cell per file.

        The output name comes from the filename entry (default ``output.ipynb``);
        ``.ipynb`` is appended when missing. A relative name is resolved against
        the process's current working directory, not the browsed directory.
        """
        selected_files = [
            self.markdown_files[self.listbox.get(i)]
            for i in self.listbox.curselection()
        ]

        if not selected_files:
            print("No files selected.")
            return

        notebook_cells = []
        for file in selected_files:
            with open(file, "r", encoding="utf-8") as md_file:
                notebook_cells.append(nbf.v4.new_markdown_cell(md_file.read()))

        notebook = nbf.v4.new_notebook(cells=notebook_cells)

        # Strip stray whitespace so "  " falls back to the default name.
        output_filename = self.filename_entry.get().strip() or "output.ipynb"
        if not output_filename.lower().endswith(".ipynb"):
            output_filename += ".ipynb"

        with open(output_filename, "w", encoding="utf-8") as nb_file:
            nbf.write(notebook, nb_file)

        print(f"Notebook saved as {output_filename}")

In [10]:
# Create the Tk root window, build the converter UI in it, and start Tk's event loop.
root = tk.Tk()
app = MarkdownToNotebookApp(root)
root.mainloop()  # blocks this cell until the window is closed

No files selected.
No files selected.
Notebook saved as D:\MB Projects\NS Standard Specs\test_file.ipynb
