In [None]:
import re
def recursive_split(text, max_size=2000):
    """Split ``text`` into chunks of at most ``max_size`` characters.

    The text is halved recursively at the delimiter closest to its midpoint.
    Delimiters are tried from coarsest to finest: runs of newlines, a period
    followed by whitespace, then any run of whitespace. The cut is made right
    after the delimiter, and both halves are stripped of surrounding
    whitespace before being split further. Only when no delimiter can yield
    two non-trivial halves is the text cut into fixed-size slices.

    Args:
        text: The string to split.
        max_size: Maximum number of characters allowed in a chunk. Must be
            positive.

    Returns:
        A list of chunk strings in their original order. Text that already
        fits is returned unchanged as a single-element list (so ``''`` gives
        ``['']``). Halves that become empty after stripping are dropped.

    Raises:
        ValueError: If ``max_size`` is not positive (a non-positive size can
            never make progress and would otherwise recurse forever).
    """
    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")
    if len(text) <= max_size:
        return [text]

    mid = len(text) // 2
    text_len = len(text)

    # Try delimiters from coarsest to finest: newlines, sentence ends, whitespace.
    for pattern in (r'\n+', r'\.\s+', r'\s+'):
        # Keep only matches whose end falls strictly inside the text; a
        # delimiter at the very end would leave an empty right half.
        candidates = [
            m for m in re.finditer(pattern, text) if 0 < m.end() < text_len
        ]
        if not candidates:
            continue  # fall through to the next, finer-grained delimiter

        # Pick the usable delimiter closest to the middle for balanced halves.
        best_match = min(candidates, key=lambda m: abs(m.start() - mid))
        split_index = best_match.end()  # split after the delimiter

        chunks = []
        for piece in (text[:split_index].strip(), text[split_index:].strip()):
            if piece:  # stripping may leave nothing (e.g. leading whitespace)
                chunks.extend(recursive_split(piece, max_size))
        return chunks

    # No usable delimiter of any kind: force fixed-size slices. This is done
    # iteratively so long delimiter-free input cannot hit the recursion limit.
    return [text[i:i + max_size] for i in range(0, text_len, max_size)]

In [None]:
def run_tests():
    """Run sanity checks on ``recursive_split`` and print a message on success.

    Besides checking that long inputs are split at all, each case verifies
    that no chunk exceeds the requested ``max_size``. Any failed check raises
    ``AssertionError`` with the offending result in the message.
    """
    # Test 1: Text smaller than max_size should not split.
    text1 = 'Short text.'
    result1 = recursive_split(text1, max_size=50)
    assert result1 == [text1], f'Expected [text1] but got {result1}'

    # Test 2: Text longer than max_size should split into bounded chunks.
    text2 = 'Sentence one. Sentence two with more text. Sentence three is here.'
    max_size2 = 25
    result2 = recursive_split(text2, max_size=max_size2)
    assert len(result2) > 1, f'Expected multiple chunks but got {result2}'
    assert all(len(chunk) <= max_size2 for chunk in result2), (
        f'Expected every chunk to have at most {max_size2} characters but got {result2}'
    )

    # Test 3: Text with newlines as delimiters splits into its lines.
    text3 = 'Line one\nLine two\nLine three\nLine four'
    max_size3 = 10
    result3 = recursive_split(text3, max_size=max_size3)
    assert len(result3) > 1, f'Expected multiple chunks but got {result3}'
    assert result3 == text3.split('\n'), (
        f'Expected one chunk per line but got {result3}'
    )

    # Test 4: Text forcing a split without any delimiters (force split at max_size).
    text4 = 'A' * 100
    max_size4 = 30
    result4 = recursive_split(text4, max_size=max_size4)
    assert len(result4) > 1, f'Expected multiple chunks but got {result4}'
    assert all(len(chunk) <= max_size4 for chunk in result4), (
        f'Expected every chunk to have at most {max_size4} characters but got {result4}'
    )
    # A forced split must not lose or reorder any characters.
    assert ''.join(result4) == text4, f'Expected chunks to rebuild the input but got {result4}'

    print('All tests passed!')

# Run the checks as soon as this cell executes; a failed assertion raises
# AssertionError, otherwise 'All tests passed!' is printed.
run_tests()

In [None]:
# Sample input: three paragraphs separated by blank lines, each with several sentences.
multi_paragraph_text = """This is the first sentence of the first paragraph. Here's the second sentence. And the third sentence.

Now this is the beginning of the second paragraph. It also has multiple sentences. Finally, the last sentence is here.

The final paragraph starts here. It too, has more than one sentence. Indeed, another sentence follows."""

# Print the chunks produced for a range of size limits so they can be compared.
for size in (10, 25, 50, 100, 150):
    chunks = recursive_split(multi_paragraph_text, max_size=size)
    print(
        f"Max size: {size}",
        "Chunks:",
        chunks,
        "-------------------------",
        sep="\n",
    )