In [5]:
import pymupdf4llm

# Extract book7.pdf as Markdown text and save it as book7.md.
md_text = pymupdf4llm.to_markdown("book7.pdf")

with open("book7.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(md_text)

print("book7.pdf converted to book7.md")

Processing book7.pdf...
book7.pdf converted to book7.md


In [6]:
import re

def process_markdown(filepath):
    """
    Inserts END_OF_STORY / START_OF_STORY marker lines above numbered story headings.

    A story heading is a line that begins with "### " followed by a number
    spelled out in lowercase ("one" through "twenty one"). The two marker
    lines are inserted directly above every such heading (including the first
    one) and the file is rewritten in place.

    The match is anchored to the start of a line and must end on a word
    boundary, so "#### one" or "### onerous" are left untouched. Headings that
    already have the markers directly above them are skipped, which makes
    re-running the cell safe.

    Errors are reported with print() instead of being raised.

    Args:
        filepath: The path to the markdown file.
    """
    marker = "END_OF_STORY\nSTART_OF_STORY\n"

    # Longest alternative first so "twenty one" is captured whole instead of
    # stopping at "twenty".
    numbers = (
        "twenty one|one|two|three|four|five|six|seven|eight|nine|ten|eleven|"
        "twelve|thirteen|fourteen|fifteen|sixteen|seventeen|eighteen|nineteen|"
        "twenty"
    )
    pattern = re.compile(
        # Negative lookbehind: skip headings that were already marked.
        r"(?<!START_OF_STORY\n)"
        # "^" + MULTILINE: only a heading at the start of a line counts.
        r"^### (?:" + numbers + r")\b",
        flags=re.MULTILINE,
    )

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        processed_content = pattern.sub(
            lambda match: marker + match.group(0), content
        )

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(processed_content)

        print(f"Processed {filepath} successfully.")

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Insert the story markers into book7.md (the file is rewritten in place).
process_markdown("book7.md")

Processed book7.md successfully.


In [7]:
import re

def clean_markdown(filepath):
    """
    Cleans a Markdown file in place, preserving word spaces and underscores.

    The file content is reduced to ASCII letters, digits and underscores
    separated by single spaces: every other character (punctuation, Markdown
    markup such as "####" or "----", non-ASCII letters) is removed, every run
    of whitespace (including newlines) becomes one space, and leading/trailing
    whitespace is trimmed.

    Underscores are preserved so that END_OF_STORY / START_OF_STORY markers
    inserted earlier survive the cleaning step.

    Errors are reported with print() instead of being raised.

    Args:
        filepath: The path to the Markdown file.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        # Remove non-alphanumeric characters except whitespace and underscores.
        # "#" and "-" fall into this class, so "####" and "----" runs need no
        # separate pass.
        content = re.sub(r"[^a-zA-Z0-9\s_]", "", content)

        # Collapse whitespace runs to single spaces, then trim both ends.
        content = re.sub(r"\s+", " ", content).strip()

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"Cleaned {filepath} successfully.")

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Clean book7.md in place (the file is overwritten with the cleaned text).
clean_markdown("book7.md")

Cleaned book7.md successfully.


In [8]:
import pymupdf4llm

# Extract book8.pdf as Markdown text and save it as book8.md.
md_text = pymupdf4llm.to_markdown("book8.pdf")

with open("book8.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(md_text)

print("book8.pdf converted to book8.md")

Processing book8.pdf...
book8.pdf converted to book8.md


In [9]:
import pymupdf4llm

# Extract book10.pdf as Markdown text and save it as book10.md.
md_text = pymupdf4llm.to_markdown("book10.pdf")

with open("book10.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(md_text)

print("book10.pdf converted to book10.md")

Processing book10.pdf...
book10.pdf converted to book10.md


In [10]:
# NOTE(review): the output recorded under this cell reports book8.md, so the
# cell source appears to have been edited after it last ran — re-run the cell
# to refresh its output.
clean_markdown("book10.md")

Cleaned book8.md successfully.


In [11]:
# Clean book10.md in place (the file is overwritten with the cleaned text).
clean_markdown("book10.md")

Cleaned book10.md successfully.


In [13]:
import pymupdf4llm

# Extract book8.pdf as Markdown text again, overwriting any existing book8.md.
# NOTE(review): same conversion as the earlier book8 cell; presumably re-run to
# restore an unprocessed book8.md — confirm and drop one of the two cells.
md_text = pymupdf4llm.to_markdown("book8.pdf")

with open("book8.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(md_text)

print("book8.pdf converted to book8.md")

Processing book8.pdf...
book8.pdf converted to book8.md


In [12]:
import pymupdf4llm

# Extract book9.pdf as Markdown text and save it as book9.md.
md_text = pymupdf4llm.to_markdown("book9.pdf")

with open("book9.md", mode="w", encoding="utf-8") as md_file:
    md_file.write(md_text)

print("book9.pdf converted to book9.md")

Processing book9.pdf...
book9.pdf converted to book9.md


In [14]:
import re

def process_stories(filepath):
    """
    Inserts END_OF_STORY / START_OF_STORY marker lines above known story headings.

    Every line that consists of one of the story titles listed below (as a
    level-1 "# " heading, optionally followed by trailing spaces or tabs) gets
    the two marker lines inserted directly above it. This includes the first
    story, so the output starts its first story with an END_OF_STORY line.
    The file is rewritten in place.

    Headings that already have the markers directly above them are skipped,
    which makes re-running the cell safe.

    Errors are reported with print() instead of being raised.

    Args:
        filepath: The path to the Markdown file.
    """
    # Story headings to look for.
    headings = [
        "# They Eat Meat!",
        "# Sons",
        "# November Is the Month of Migrations",
        "# Getting Even",
        "# Eating with the Enemy",
        "# Blue Baby",
        "# Baso-jhi",
        "# Desire, Divination, Death",
        "# Merely a Whore",
        "# The Adivasi Will Not Dance",
    ]
    marker = "END_OF_STORY\nSTART_OF_STORY\n"

    # One alternation over all (escaped) headings, so the text is scanned once
    # and the pattern is built once instead of per heading.
    heading_alternatives = "|".join(re.escape(heading) for heading in headings)
    pattern = re.compile(
        # Negative lookbehind: skip headings that were already marked.
        r"(?<!START_OF_STORY\n)"
        # Whole-line match; trailing blanks left by extraction are tolerated.
        r"^(?:" + heading_alternatives + r")[ \t]*$",
        flags=re.MULTILINE,
    )

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        content = pattern.sub(lambda match: marker + match.group(0), content)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"Processed {filepath} successfully.")

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Insert the story markers into book9.md (the file is rewritten in place).
process_stories("book9.md")

Processed book9.md successfully.


In [15]:
import re

def clean_markdown(filepath):
    """
    Strips a Markdown file down to plain words, rewriting the file in place.

    Three regex passes run in order: "####" / "----" runs are deleted, every
    character that is not an ASCII letter, digit, whitespace or underscore is
    deleted, and each run of whitespace is replaced by a single space. The
    result is trimmed at both ends before it is written back.

    Problems are reported with print() instead of being raised.

    Args:
        filepath: The path to the Markdown file.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    passes = (
        (r"####|----", ""),           # heading / rule markup
        (r"[^a-zA-Z0-9\s_]", ""),     # keep letters, digits, whitespace, "_"
        (r"\s+", " "),                # whitespace runs -> one space
    )

    try:
        with open(filepath, "r", encoding="utf-8") as source:
            text = source.read()

        for regex, replacement in passes:
            text = re.sub(regex, replacement, text)
        text = text.strip()

        with open(filepath, "w", encoding="utf-8") as target:
            target.write(text)

        print(f"Cleaned {filepath} successfully.")

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Clean book9.md in place (the file is overwritten with the cleaned text).
clean_markdown("book9.md")

Cleaned book9.md successfully.


In [16]:
import re

def process_stories_with_hashes(filepath):
    """
    Inserts END_OF_STORY / START_OF_STORY marker lines above known "###" story headings.

    Every line that consists of one of the story titles listed below (as a
    level-3 "### " heading, optionally followed by trailing spaces or tabs)
    gets the two marker lines inserted directly above it. This includes the
    first story. The file is rewritten in place.

    A straight apostrophe in a listed title also matches the typographic
    apostrophe (U+2019), since extracted PDF text may contain either form.
    Headings that already have the markers directly above them are skipped,
    which makes re-running the cell safe.

    Errors are reported with print() instead of being raised.

    Args:
        filepath: The path to the Markdown file.
    """
    # Story headings to look for (with ###).
    headings = [
        "### Sum Total",
        "### A Tale of a Summer Vacation",
        "### A Brief Lesson in Trust",
        "### Feet of Clay",
        "### My Lawfully Wedded Husband",
        "### Number 63",
        "### On the Night Train",
        "### Hourie",
        "### Silent Fear",
        "### St George and the Dragon",
        "### The Crusader",
        "### The Howling Waves of Tranquebar",
        "### A New Home for Bhainsa",
        "### One Night's Work",
    ]
    marker = "END_OF_STORY\nSTART_OF_STORY\n"

    # Escape each heading piecewise so that every "'" becomes a character
    # class accepting both the straight and the typographic apostrophe.
    heading_alternatives = "|".join(
        "['\u2019]".join(re.escape(part) for part in heading.split("'"))
        for heading in headings
    )
    pattern = re.compile(
        # Negative lookbehind: skip headings that were already marked.
        r"(?<!START_OF_STORY\n)"
        # Whole-line match; trailing blanks left by extraction are tolerated.
        r"^(?:" + heading_alternatives + r")[ \t]*$",
        flags=re.MULTILINE,
    )

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            content = f.read()

        content = pattern.sub(lambda match: marker + match.group(0), content)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)

        print(f"Processed {filepath} successfully.")

    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Insert the story markers into book8.md (the file is rewritten in place).
process_stories_with_hashes("book8.md")

Processed book8.md successfully.


In [17]:
# Clean book8.md in place (the file is overwritten with the cleaned text).
clean_markdown("book8.md")

Cleaned book8.md successfully.


In [18]:
def combine_md_files(input_files, output_file):
    """
    Concatenates several Markdown files, in the given order, into one file.

    Each input's text is followed by a single newline. The output file is
    only written after every input has been read, so a missing input leaves
    the output untouched. Problems are reported with print() instead of being
    raised.

    Args:
        input_files: A list of input file paths (e.g., ["book7.md", "book8.md"]).
        output_file: The path to the output combined file.
    """
    try:
        pieces = []
        for path in input_files:
            with open(path, "r", encoding="utf-8") as source:
                pieces.append(source.read())
            pieces.append("\n")  # separator after each file

        merged = "".join(pieces)

        with open(output_file, "w", encoding="utf-8") as target:
            target.write(merged)

        print(f"Combined {input_files} into {output_file} successfully.")

    except FileNotFoundError as e:
        print(f"Error: One or more input files not found: {e}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Merge the four per-book files, in this order, into combined_books.md.
input_files = ["book7.md", "book8.md", "book9.md", "book10.md"]
output_file = "combined_books.md"
combine_md_files(input_files, output_file)

Combined ['book7.md', 'book8.md', 'book9.md', 'book10.md'] into combined_books.md successfully.


In [None]:
# NOTE(review): this cell has not been executed. It writes to the same
# combined_books.md as the previous cell, so running it replaces the four-book
# file with the content of preprocessed.md, which no cell shown in this
# notebook creates — confirm this is intended before running.
input_files = ["preprocessed.md",]
output_file = "combined_books.md"
combine_md_files(input_files, output_file)