# segmentation

> Segmentation service for text decomposition via NLTK plugin

In [None]:
#| default_exp services.segmentation

In [None]:
#| export
from typing import List, Dict, Any, Optional
import asyncio

from cjm_plugin_system.core.manager import PluginManager
from cjm_source_provider.models import SourceBlock

from cjm_transcript_segmentation.models import TextSegment

## SegmentationService

This service wraps the NLTK text processing plugin to provide sentence-splitting functionality. It converts raw text into `TextSegment` objects for further refinement in the UI.

In [None]:
#| export
class SegmentationService:
    """Service for text segmentation via NLTK plugin.

    Holds a plugin manager and a plugin name. Sentence splitting is delegated to
    the plugin's `split_sentences` action and the returned spans are converted
    into `TextSegment` objects.
    """

    def __init__(
        self,
        plugin_manager: PluginManager,  # Plugin manager for accessing text plugin
        plugin_name: str = "cjm-text-plugin-nltk"  # Name of the text processing plugin
    ):
        """Initialize the segmentation service."""
        self._manager = plugin_manager
        self._plugin_name = plugin_name

    def is_available(self) -> bool:  # True if plugin is loaded and ready
        """Check if the text processing plugin is available."""
        return self._manager.get_plugin(self._plugin_name) is not None

    def ensure_loaded(
        self,
        config: Optional[Dict[str, Any]] = None  # Optional plugin configuration
    ) -> bool:  # True if successfully loaded
        """Ensure the text processing plugin is loaded.

        Returns immediately when the plugin is already available. Otherwise the
        plugin is looked up among the discovered manifests and loaded; a falsy
        `config` (`None` or an empty dict) falls back to `{"language": "english"}`.
        Returns False when the plugin was never discovered.
        """
        if self.is_available():
            return True

        # Try to find and load the plugin
        meta = self._manager.get_discovered_meta(self._plugin_name)
        if meta:
            return self._manager.load_plugin(meta, config or {"language": "english"})
        return False

    async def split_sentences_async(
        self,
        text: str,  # Text to split into sentences
        source_id: Optional[str] = None,  # Source block ID for traceability
        source_provider_id: Optional[str] = None  # Source provider identifier for traceability
    ) -> List[TextSegment]:  # List of TextSegment objects
        """Split text into sentences asynchronously.

        Raises `RuntimeError` when the plugin is not loaded. Segment indices
        follow the order of the spans returned by the plugin, starting at 0.
        """
        if not self.is_available():
            raise RuntimeError(f"Plugin {self._plugin_name} not loaded")

        # Execute plugin
        result = await self._manager.execute_plugin_async(
            self._plugin_name,
            action="split_sentences",
            text=text
        )

        # Convert spans to TextSegments; every span must carry 'text',
        # while the character offsets are optional
        return [
            TextSegment(
                index=idx,
                text=span['text'],
                source_id=source_id,
                source_provider_id=source_provider_id,
                start_char=span.get('start_char'),
                end_char=span.get('end_char')
            )
            for idx, span in enumerate(result.get('spans', []))
        ]

    def split_sentences(
        self,
        text: str,  # Text to split into sentences
        source_id: Optional[str] = None,  # Source block ID for traceability
        source_provider_id: Optional[str] = None  # Source provider identifier for traceability
    ) -> List[TextSegment]:  # List of TextSegment objects
        """Split text into sentences synchronously.

        Blocking wrapper around `split_sentences_async`. Raises `RuntimeError`
        when called while an event loop is already running in this thread
        (e.g. inside a notebook cell or a coroutine); await
        `split_sentences_async` there instead.
        """
        # A running loop cannot be blocked on; fail before the coroutine is
        # created so no "coroutine was never awaited" warning is left behind.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            pass  # No loop is running in this thread, so blocking is safe
        else:
            raise RuntimeError(
                "split_sentences() cannot be called while an event loop is running; "
                "await split_sentences_async() instead"
            )

        # Keep using the thread's current loop across calls, but create one when
        # none is set (after asyncio.run(), in worker threads, on newer Pythons)
        # or when the current one has been closed.
        try:
            loop = asyncio.get_event_loop()
        except RuntimeError:
            loop = None
        if loop is None or loop.is_closed():
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)

        return loop.run_until_complete(
            self.split_sentences_async(text, source_id, source_provider_id)
        )

    async def split_combined_sources_async(
        self,
        source_blocks: List[SourceBlock]  # Ordered list of source blocks
    ) -> List[TextSegment]:  # Combined list of TextSegments with proper traceability
        """Split multiple source blocks into segments with proper source tracking.

        Blocks are processed one after another in the given order. Each segment
        keeps the id and provider id of the block it came from, and indices are
        renumbered so they run sequentially across all blocks.
        """
        all_segments = []

        for block in source_blocks:
            segments = await self.split_sentences_async(
                text=block.text,
                source_id=block.id,
                source_provider_id=block.provider_id
            )

            # Update indices to be globally sequential: the next index is
            # always the number of segments collected so far
            for seg in segments:
                seg.index = len(all_segments)
                all_segments.append(seg)

        return all_segments

## Segment Manipulation Helpers

These functions support the UI operations for splitting, merging, and reordering segments.

In [None]:
#| export
def split_segment_at_position(
    segment: TextSegment,  # Segment to split
    char_position: int  # Character position to split at (relative to segment text)
) -> tuple[TextSegment, TextSegment]:  # Two new segments
    """Cut a segment in two at `char_position` and return the left and right halves.

    Raises `ValueError` unless the position lies strictly inside the text.
    Both halves inherit the source identifiers. Their texts are stripped of
    surrounding whitespace, while the character offsets are derived from the
    raw cut position and are not adjusted for the stripped characters.
    """
    if not 0 < char_position < len(segment.text):
        raise ValueError("Split position must be within segment text")

    # The cut point in source coordinates is only known when the segment
    # carries a start offset; it ends the left half and starts the right half
    origin = segment.start_char
    if origin is None:
        boundary = None
    else:
        boundary = origin + char_position

    left_text = segment.text[:char_position].strip()
    right_text = segment.text[char_position:].strip()

    left = TextSegment(
        index=segment.index,
        text=left_text,
        source_id=segment.source_id,
        source_provider_id=segment.source_provider_id,
        start_char=origin,
        end_char=boundary
    )
    right = TextSegment(
        index=segment.index + 1,  # Provisional: callers are expected to reindex
        text=right_text,
        source_id=segment.source_id,
        source_provider_id=segment.source_provider_id,
        start_char=boundary,
        end_char=segment.end_char
    )
    return left, right

In [None]:
#| export
def merge_text_segments(
    first: TextSegment,  # First segment (earlier in sequence)
    second: TextSegment,  # Second segment (later in sequence)
    separator: str = " "  # Text separator between segments
) -> TextSegment:  # Merged segment
    """Combine two adjacent segments into one that takes the first segment's index.

    Source identifiers survive only when both segments agree on them. The
    character range (first start to second end) is kept only when the shared
    source id is truthy; otherwise both offsets are set to None.
    """
    combined = first.text + separator + second.text

    # Keep an identifier only if both segments carry the same one
    if first.source_id == second.source_id:
        shared_source = first.source_id
    else:
        shared_source = None

    if first.source_provider_id == second.source_provider_id:
        shared_provider = first.source_provider_id
    else:
        shared_provider = None

    # Offsets are only meaningful relative to a single known source
    if shared_source:
        span_start = first.start_char
        span_end = second.end_char
    else:
        span_start = None
        span_end = None

    return TextSegment(
        index=first.index,
        text=combined,
        source_id=shared_source,
        source_provider_id=shared_provider,
        start_char=span_start,
        end_char=span_end,
    )

In [None]:
#| export
def reindex_segments(
    segments: List[TextSegment]  # List of segments to reindex
) -> List[TextSegment]:  # Segments with corrected indices
    """Renumber segments in place so indices run 0, 1, 2, ... in list order.

    The same list object is returned for convenience; list order is untouched.
    """
    position = 0
    for item in segments:
        item.index = position
        position += 1
    return segments

## Source Block Reconstruction

In [None]:
#| export
def reconstruct_source_blocks(
    segment_dicts: List[Dict[str, Any]],  # Serialized working segments
) -> List[SourceBlock]:  # Reconstructed source blocks with combined text
    """Reconstruct source blocks by grouping segments by source_id and combining text.

    Blocks are returned in order of first appearance of each source id, and the
    texts of a group are joined with single spaces in their original order.
    The provider id of a block is taken from the first segment of its group.
    A missing or `None` source id / provider id becomes "unknown", and a
    missing or `None` text contributes an empty string.
    """
    segments_by_source: Dict[str, List[Dict[str, Any]]] = {}
    for seg_dict in segment_dicts:
        # An explicit None (e.g. a segment merged across sources) must be
        # treated like a missing key, otherwise the block would get id=None
        source_id = seg_dict.get("source_id")
        if source_id is None:
            source_id = "unknown"
        segments_by_source.setdefault(source_id, []).append(seg_dict)

    source_blocks = []
    for source_id, segs in segments_by_source.items():
        # Every group holds at least one segment, so segs[0] is always valid
        source_provider_id = segs[0].get("source_provider_id")
        if source_provider_id is None:
            source_provider_id = "unknown"
        combined_text = " ".join(s.get("text") or "" for s in segs)
        source_blocks.append(SourceBlock(
            id=source_id, provider_id=source_provider_id, text=combined_text,
        ))

    return source_blocks

## Tests

The following cells demonstrate the segmentation service and helper functions.

In [None]:
# Test split_segment_at_position
segment = TextSegment(
    index=0,
    text="The art of war is of vital importance to the state.",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=0,
    end_char=51
)

# Split at position 18 (after "The art of war is ")
first, second = split_segment_at_position(segment, 18)
print(f"Original: '{segment.text}'")
print(f"First:    '{first.text}' (chars {first.start_char}-{first.end_char})")
print(f"Second:   '{second.text}' (chars {second.start_char}-{second.end_char})")

# Verify the halves instead of relying on visual inspection of the output:
# texts are stripped, offsets meet at the cut point, indices are consecutive
assert first.text == "The art of war is"
assert second.text == "of vital importance to the state."
assert (first.start_char, first.end_char) == (0, 18)
assert (second.start_char, second.end_char) == (18, 51)
assert (first.index, second.index) == (0, 1)

Original: 'The art of war is of vital importance to the state.'
First:    'The art of war is' (chars 0-18)
Second:   'of vital importance to the state.' (chars 18-51)


In [None]:
# Test merge_text_segments
seg1 = TextSegment(
    index=0,
    text="The art of war",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=0,
    end_char=14,
)

seg2 = TextSegment(
    index=1,
    text="is of vital importance to the state.",
    source_id="job_123",
    source_provider_id="test-plugin",
    start_char=15,
    end_char=51,
)

merged = merge_text_segments(seg1, seg2)
print(f"Segment 1: '{seg1.text}'")
print(f"Segment 2: '{seg2.text}'")
print(f"Merged:    '{merged.text}'")
print(f"Char range: {merged.start_char} - {merged.end_char}")

# Verify the merge instead of relying on visual inspection of the output:
# same-source segments keep their source id and span the combined range
assert merged.text == "The art of war is of vital importance to the state."
assert (merged.start_char, merged.end_char) == (0, 51)
assert merged.index == 0
assert merged.source_id == "job_123"

Segment 1: 'The art of war'
Segment 2: 'is of vital importance to the state.'
Merged:    'The art of war is of vital importance to the state.'
Char range: 0 - 51


In [None]:
# Test reindex_segments
segments = [
    TextSegment(index=5, text="First"),
    TextSegment(index=10, text="Second"),
    TextSegment(index=3, text="Third")
]

print("Before reindex:")
for s in segments:
    print(f"  index={s.index}: '{s.text}'")

reindex_segments(segments)

print("\nAfter reindex:")
for s in segments:
    print(f"  index={s.index}: '{s.text}'")

# Verify the result instead of relying on visual inspection of the output:
# indices follow list order and the order itself is unchanged
assert [s.index for s in segments] == [0, 1, 2]
assert [s.text for s in segments] == ["First", "Second", "Third"]

Before reindex:
  index=5: 'First'
  index=10: 'Second'
  index=3: 'Third'

After reindex:
  index=0: 'First'
  index=1: 'Second'
  index=2: 'Third'


In [None]:
# Test reconstruct_source_blocks
seg_dicts = [
    {"text": "First sentence.", "source_id": "job_1", "source_provider_id": "provider_a"},
    {"text": "Second sentence.", "source_id": "job_1", "source_provider_id": "provider_a"},
    {"text": "Third sentence.", "source_id": "job_2", "source_provider_id": "provider_b"},
]

# Two sources in, two blocks out, in order of first appearance
blocks = reconstruct_source_blocks(seg_dicts)
assert [b.id for b in blocks] == ["job_1", "job_2"]
assert blocks[0].provider_id == "provider_a"
assert [b.text for b in blocks] == [
    "First sentence. Second sentence.",
    "Third sentence.",
]

# No segments means no blocks
assert reconstruct_source_blocks([]) == []

# A segment without a source_id lands in the "unknown" block
blocks = reconstruct_source_blocks([{"text": "orphan"}])
assert blocks[0].id == "unknown"

print("reconstruct_source_blocks tests passed")

reconstruct_source_blocks tests passed


### SegmentationService with NLTK Plugin

These tests require the NLTK plugin to be installed and discoverable.

In [None]:
#| eval: false
# Test SegmentationService with NLTK plugin
from pathlib import Path
from cjm_plugin_system.core.manager import PluginManager

# The notebook runs from nbs/services/, so the project root is two levels up
project_root = Path.cwd().parent.parent
manifests_dir = project_root.joinpath(".cjm", "manifests")

# Point the plugin manager at the project's manifests and scan them
manager = PluginManager(search_paths=[manifests_dir])
manager.discover_manifests()

print(f"Discovered {len(manager.discovered)} plugins from {manifests_dir}")

# Later cells only run their checks when the NLTK plugin manifest was found
nltk_meta = manager.get_discovered_meta("cjm-text-plugin-nltk")
if not nltk_meta:
    print("NLTK plugin not found - install via plugins.yaml")
else:
    print(f"Found plugin: {nltk_meta.name} v{nltk_meta.version}")

[PluginManager] Discovered manifest: cjm-text-plugin-nltk from /mnt/SN850X_8TB_EXT4/Projects/GitHub/cj-mills/cjm-transcript-segmentation/.cjm/manifests/cjm-text-plugin-nltk.json


Discovered 1 plugins from /mnt/SN850X_8TB_EXT4/Projects/GitHub/cj-mills/cjm-transcript-segmentation/.cjm/manifests
Found plugin: cjm-text-plugin-nltk v0.0.2


In [None]:
#| eval: false
# Initialize and test SegmentationService
# Uses `manager` and `nltk_meta` from the discovery cell; skipped entirely when
# the NLTK plugin manifest was not found.
if nltk_meta:
    # Load the plugin
    manager.load_plugin(nltk_meta, {"language": "english"})
    
    # Default plugin_name matches the plugin loaded above
    seg_service = SegmentationService(manager)
    print(f"Plugin available: {seg_service.is_available()}")
    
    # Test sentence splitting (use await directly - Jupyter supports top-level await)
    # Three sentences, joined by single spaces through implicit string concatenation
    test_text = (
        "The art of war is of vital importance to the state. "
        "It is a matter of life and death, a road either to safety or to ruin. "
        "Hence it is a subject of inquiry which can on no account be neglected."
    )
    
    segments = await seg_service.split_sentences_async(
        text=test_text,
        source_id="test_job",
        source_provider_id="test"
    )
    
    # Show at most the first 40 characters of each segment; the trailing
    # "..." is printed unconditionally, even for shorter texts
    print(f"\nSplit into {len(segments)} segments:")
    for seg in segments:
        print(f"  [{seg.index}] chars {seg.start_char}-{seg.end_char}: '{seg.text[:40]}...'")

[PluginManager] Launching worker for cjm-text-plugin-nltk...


[cjm-text-plugin-nltk] Starting worker on port 33111...
[cjm-text-plugin-nltk] Logs: /home/innom-dt/.cjm/logs/cjm-text-plugin-nltk.log


[PluginManager] HTTP Request: GET http://127.0.0.1:33111/health "HTTP/1.1 200 OK"


[cjm-text-plugin-nltk] Worker ready.


[PluginManager] HTTP Request: POST http://127.0.0.1:33111/initialize "HTTP/1.1 200 OK"
[PluginManager] Loaded plugin: cjm-text-plugin-nltk
[PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"


Plugin available: True

Split into 3 segments:
  [0] chars 0-51: 'The art of war is of vital importance to...'
  [1] chars 52-121: 'It is a matter of life and death, a road...'
  [2] chars 122-192: 'Hence it is a subject of inquiry which c...'


In [None]:
#| eval: false
# Test split_combined_sources_async with multiple source blocks
# Uses `nltk_meta` and `seg_service` from the previous cells; the `and`
# short-circuits, so `seg_service` is only touched when the plugin was found.
from cjm_source_provider.models import SourceBlock

if nltk_meta and seg_service.is_available():
    # Create test source blocks
    # Two blocks from different providers, with two and three sentences
    blocks = [
        SourceBlock(
            id="job_1",
            provider_id="provider_a",
            text="Sun Tzu said the art of war is vital. It determines victory or defeat."
        ),
        SourceBlock(
            id="job_2",
            provider_id="provider_b",
            text="Know your enemy. Know yourself. A hundred battles, a hundred victories."
        )
    ]
    
    # Use await directly (Jupyter supports top-level await)
    all_segments = await seg_service.split_combined_sources_async(blocks)
    
    # Indices should run continuously across both blocks while each segment
    # keeps the id of the block it came from
    print(f"Combined {len(blocks)} blocks into {len(all_segments)} segments:")
    for seg in all_segments:
        print(f"  [{seg.index}] source={seg.source_id}: '{seg.text[:35]}...'")

[PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"
[PluginManager] HTTP Request: POST http://127.0.0.1:33111/execute "HTTP/1.1 200 OK"


Combined 2 blocks into 5 segments:
  [0] source=job_1: 'Sun Tzu said the art of war is vita...'
  [1] source=job_1: 'It determines victory or defeat....'
  [2] source=job_2: 'Know your enemy....'
  [3] source=job_2: 'Know yourself....'
  [4] source=job_2: 'A hundred battles, a hundred victor...'


In [None]:
#| eval: false
# Cleanup
# Only needed when the plugin was found (and therefore loaded) earlier;
# judging by the recorded log, unloading sends a cleanup request to the worker.
if nltk_meta:
    manager.unload_all()
    print("Plugins unloaded")

[PluginManager] HTTP Request: POST http://127.0.0.1:33111/cleanup "HTTP/1.1 200 OK"
[PluginManager] Unloaded plugin: cjm-text-plugin-nltk


Plugins unloaded


In [None]:
#| hide
# Export the `#| export` cells of this notebook to the library module
import nbdev

nbdev.nbdev_export()