In [2]:
import re

def _divide_into_sections(text):
    # Regex pattern to match Markdown headings (e.g., # Heading, ##Heading, etc.)
    # This pattern allows for optional spaces after the # characters
    pattern = r'^#+\s?.*$'

    # Use re.finditer to find all matches of the pattern
    matches = list(re.finditer(pattern, text, re.MULTILINE))

    # Initialize the list to hold the formatted sections
    formatted_sections = []

    # Iterate over the matches and slice the text accordingly
    for i, match in enumerate(matches):
        start = match.start()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        heading = match.group()
        content = text[start:end].strip()
        formatted_section = f"{heading}\n{content}"
        formatted_sections.append(formatted_section)

    return formatted_sections

# Sample Markdown document mixing well-formed and malformed headings
text = """
# Heading 1
This is the content for heading 1.

## Heading 2
This is the content for heading 2.

###Heading 3 (missing space)

###### Heading 4
This is the content for heading 4.

####### Heading 5 (too many #)
This is invalid because Markdown only allows up to 6 # characters.

##     Heading 6 (extra space)
This is valid because extra spaces after ## are allowed.

#
This is a heading with no text.

##
##
"""

# Split the sample into per-heading sections
sections = _divide_into_sections(text)

# Show every section, each followed by a separator line
for section in sections:
    print(section, "---", sep="\n")

# Heading 1
# Heading 1
This is the content for heading 1.
---
## Heading 2
## Heading 2
This is the content for heading 2.
---
###Heading 3 (missing space)
###Heading 3 (missing space)
---
###### Heading 4
###### Heading 4
This is the content for heading 4.
---
####### Heading 5 (too many #)
####### Heading 5 (too many #)
This is invalid because Markdown only allows up to 6 # characters.
---
##     Heading 6 (extra space)
##     Heading 6 (extra space)
This is valid because extra spaces after ## are allowed.
---
#
This is a heading with no text.
#
This is a heading with no text.
---
##
##
##
##
---


In [3]:
import re

def _divide_into_sections(text: str) -> list:
    """
    Divides the input text into sections based on Markdown headings marked with `#` signs.

    Args:
        text (str): The input text to be divided into sections.

    Returns:
        list: A list of sections, where each section includes the heading and its
              related content (if any).
    """
    # Regex to identify sections based on Markdown headings marked with `#` signs
    # This regex allows for optional leading whitespace before the `#` characters
    section_regex = r"(^\s*#+\s?.*$)"  # Matches Markdown headings (e.g., # Heading, ## Heading, etc.)

    # Split the text into sections using the regex
    sections = re.split(section_regex, text, flags=re.MULTILINE)

    # Combine headings with their directly related content
    combined_sections = []
    for i in range(1, len(sections), 2):  # Start from 1 to skip the first empty element
        heading = sections[i].strip()  # Remove leading/trailing whitespace from the heading
        content = sections[i + 1].strip()  # Remove leading/trailing whitespace from the content

        # If there's no content, just keep the heading
        if not content:
            combined_sections.append(heading)
        else:
            combined_sections.append(f"{heading}\n{content}")

    return combined_sections

# Sample Markdown document (indented) mixing well-formed and malformed headings
text = """
    # Heading 1
    This is the content for heading 1.

    ## Heading 2
    This is the content for heading 2.

    ###Heading 3 (missing space)

    ###### Heading 4
    This is the content for heading 4.

    ####### Heading 5 (too many #)
    This is invalid because Markdown only allows up to 6 # characters.

    ##     Heading 6 (extra space)
    This is valid because extra spaces after ## are allowed.

    #
    This is a heading with no text.

    ##
    ##
"""

# Split the sample into per-heading sections
sections = _divide_into_sections(text)

# Show every section, each followed by a separator line
for section in sections:
    print(section, "---", sep="\n")

# Heading 1
This is the content for heading 1.
---
## Heading 2
This is the content for heading 2.
---
###Heading 3 (missing space)
---
###### Heading 4
This is the content for heading 4.
---
####### Heading 5 (too many #)
This is invalid because Markdown only allows up to 6 # characters.
---
##     Heading 6 (extra space)
This is valid because extra spaces after ## are allowed.
---
#
    This is a heading with no text.
---
##
    ##
---
