# utils

> Text processing utilities for segmentation: word counting, position mapping, and statistics

In [None]:
#| default_exp utils

In [None]:
#| export
from typing import List, Dict, Any, TYPE_CHECKING

if TYPE_CHECKING:
    from cjm_transcript_segmentation.models import TextSegment

## Word Operations

In [None]:
#| export
def count_words(
    text: str  # Text to count words in
) -> int:  # Word count
    """Return how many whitespace-separated words `text` contains (0 for empty text)."""
    # Falsy text short-circuits to 0; otherwise split on runs of whitespace.
    return len(text.split()) if text else 0

## Position Mapping

In [None]:
#| export
def word_index_to_char_position(
    text: str,  # Full text
    word_index: int  # Word index (0-based, split happens before this word)
) -> int:  # Character position for split
    """Convert a word index to the character position where a split should occur.

    Words are delimited the same way `str.split()` delimits them: by runs of
    any whitespace, including tabs and newlines, with leading whitespace
    ignored. The result is the offset of the first character of the word at
    `word_index`, so `text[pos:]` starts with that word. Returns 0 when
    `word_index <= 0` and `len(text)` when `word_index` is at or past the
    number of words in `text`.
    """
    if word_index <= 0:
        return 0

    # Peel off at most `word_index` leading words. If more words remain, the
    # last element is the unmodified tail of `text` starting at the target
    # word (whitespace before it is skipped, whitespace after it is kept).
    parts = text.split(None, word_index)
    if len(parts) <= word_index:
        # The text has no word at `word_index`; split at the very end.
        return len(text)

    # The tail is a suffix of `text`, so its length pins down the start offset.
    return len(text) - len(parts[-1])

## Segment Statistics

In [None]:
#| export
def calculate_segment_stats(
    segments: List["TextSegment"]  # List of segments to analyze
) -> Dict[str, Any]:  # Statistics dictionary with total_words, total_segments
    """Summarize a list of segments as a total word count and a segment count."""
    # Accumulate the per-segment word counts of each segment's `text`.
    word_total = 0
    for segment in segments:
        word_total += count_words(segment.text)

    return {"total_words": word_total, "total_segments": len(segments)}

## Tests

In [None]:
# Empty, single-word, and multi-word inputs checked in one comparison.
assert [count_words(sample) for sample in ("", "hello", "The art of war")] == [0, 1, 4]
print("count_words tests passed")

In [None]:
text = "The art of war is vital"

# Out-of-range indices clamp to the start and end of the text.
assert word_index_to_char_position(text, 0) == 0
assert word_index_to_char_position(text, -1) == 0
assert word_index_to_char_position(text, 100) == len(text)
# Interior indices map to the first character of the word at that index.
assert word_index_to_char_position(text, 1) == 4   # "art"
assert word_index_to_char_position(text, 3) == 11  # "war"
assert word_index_to_char_position(text, 5) == 18  # "vital"
# An index equal to the word count also splits at the end.
assert word_index_to_char_position(text, 6) == len(text)
print("word_index_to_char_position tests passed")

In [None]:
from cjm_transcript_segmentation.models import TextSegment

# Three segments containing 4 + 4 + 3 = 11 words in total.
test_segments = [
    TextSegment(index=position, text=content)
    for position, content in enumerate(
        ("The art of war", "is of vital importance", "to the state")
    )
]

stats = calculate_segment_stats(test_segments)
assert stats["total_segments"] == 3
assert stats["total_words"] == 11
print("calculate_segment_stats tests passed")

In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()