# Text Processing Plugin Interface

> Domain-specific plugin interface for text processing operations

In [None]:
#| default_exp plugin_interface

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from abc import abstractmethod
from typing import Dict, Any

from cjm_plugin_system.core.interface import PluginInterface

from cjm_text_plugin_system.core import TextProcessResult

In [None]:
#| export
class TextProcessingPlugin(PluginInterface):
    """
    Base interface for plugins that run NLP operations on text.

    Adds two abstract requirements on top of `PluginInterface`:
    - `execute`: single entry point that dispatches on an `action` name
    - `split_sentences`: sentence segmentation returning character-indexed spans
    """

    # Legacy entry-point group name; retained as metadata only
    entry_point_group = "text.plugins"

    @abstractmethod
    def execute(
        self,
        action: str = "split_sentences",  # Name of the operation to run, e.g. 'split_sentences' or 'tokenize'
        **kwargs  # Keyword arguments for the selected operation
    ) -> Dict[str, Any]:  # JSON-serializable result
        """Run the text processing operation selected by `action`."""

    @abstractmethod
    def split_sentences(
        self,
        text: str,  # Text to segment into sentences
        **kwargs  # Implementation-specific options
    ) -> TextProcessResult:  # Result whose TextSpan objects carry character indices
        """Segment `text` into sentence spans with accurate character positions."""

## How It Works

```
Host Process                              Worker Process (Isolated Env)
┌─────────────────────┐                  ┌─────────────────────────────┐
│                     │                  │                             │
│ plugin.execute(     │   HTTP/JSON      │  TextProcessingPlugin       │
│   action="split_    │ ─────────────────▶    .execute(                │
│     sentences",     │                  │       action="split_        │
│   text="Hello..."   │                  │         sentences",         │
│ )                   │                  │       text="Hello..."       │
│                     │                  │    )                        │
│                     │  ◀───────────────│                             │
│ # Receives JSON     │   JSON response  │  # Returns TextProcessResult│
│ # with spans        │                  │  # serialized to JSON       │
└─────────────────────┘                  └─────────────────────────────┘
```

The `execute()` method acts as a dispatcher that routes to specific operations like `split_sentences()`.

## Example Implementation

A minimal text processing plugin that demonstrates the interface:

In [None]:
import re
from typing import Optional, List
from cjm_text_plugin_system.core import TextSpan, TextProcessResult

class ExampleTextPlugin(TextProcessingPlugin):
    """Example implementation showing how to create a text processing plugin.

    Sentences are found with a deliberately simple regex, so abbreviations and
    decimal numbers (e.g. "3.14") are split at their periods. Use this class as
    a template for the interface, not as a production sentence splitter.
    """

    # A sentence is a run of non-terminator characters closed by one or more
    # terminators ('.', '!', '?'), so "Wait..." stays a single sentence. The
    # second alternative only matches when no terminator follows, i.e. it picks
    # up trailing text that would otherwise be silently dropped.
    _SENTENCE_PATTERN = re.compile(r'[^.!?]*[.!?]+|[^.!?]+')

    def __init__(self):
        # Configuration supplied through initialize(); empty until then.
        self._config: Dict[str, Any] = {}

    @property
    def name(self) -> str:
        """Plugin identifier."""
        return "example-text-processor"

    @property
    def version(self) -> str:
        """Plugin version string."""
        return "1.0.0"

    def initialize(
        self,
        config: Optional[Dict[str, Any]] = None  # Configuration values; None means empty
    ) -> None:
        """Initialize with configuration."""
        self._config = config or {}

    def execute(
        self,
        action: str = "split_sentences",  # Operation to perform; only 'split_sentences' is supported
        **kwargs  # Arguments forwarded to the selected operation (e.g. `text`)
    ) -> Dict[str, Any]:  # Dict with 'spans' (list of span dicts) and 'metadata'
        """Dispatch to the appropriate text processing method.

        Raises `ValueError` when `action` is not a supported operation.
        """
        if action != "split_sentences":
            raise ValueError(f"Unknown action: {action}")

        result = self.split_sentences(**kwargs)
        return {
            "spans": [span.to_dict() for span in result.spans],
            "metadata": result.metadata
        }

    def split_sentences(
        self,
        text: str,  # Input text to split
        **kwargs  # Accepted for interface compatibility; unused here
    ) -> TextProcessResult:  # Spans satisfy text[start_char:end_char] == span.text
        """Split text into sentences using simple regex."""
        spans: List[TextSpan] = []

        for match in self._SENTENCE_PATTERN.finditer(text):
            raw = match.group()
            sentence = raw.strip()
            if not sentence:
                # Whitespace-only match (e.g. trailing spaces after the last sentence)
                continue

            # The raw match starts right after the previous terminator, so it
            # usually begins with whitespace. Shift the start past it so the
            # offsets delimit exactly the stripped sentence text.
            start = match.start() + (len(raw) - len(raw.lstrip()))
            spans.append(TextSpan(
                text=sentence,
                start_char=start,
                end_char=start + len(sentence),
                label="sentence"
            ))

        return TextProcessResult(
            spans=spans,
            metadata={"processor": self.name, "method": "regex"}
        )

    def get_config_schema(self) -> Dict[str, Any]:
        """Return JSON Schema for configuration (this plugin defines no options)."""
        return {
            "type": "object",
            "properties": {}
        }

    def get_current_config(self) -> Dict[str, Any]:
        """Return current configuration."""
        return self._config

    def cleanup(self) -> None:
        """Clean up resources (nothing to release for this plugin)."""
        pass

In [None]:
# Test the example plugin: instantiate it, initialize it with an empty config,
# and print its identity and configuration accessors.
# `plugin` is reused by the cells below.
plugin = ExampleTextPlugin()
plugin.initialize({})

print(f"Plugin: {plugin.name} v{plugin.version}")
print(f"Config schema: {plugin.get_config_schema()}")
print(f"Current config: {plugin.get_current_config()}")

Plugin: example-text-processor v1.0.0
Config schema: {'type': 'object', 'properties': {}}
Current config: {}


In [None]:
# Test split_sentences directly
# Three sentences, each ending in a different terminator (. ? !)
text = "Hello world. How are you? I am fine!"
result = plugin.split_sentences(text)

print(f"\nInput: '{text}'")
print(f"Spans found: {len(result.spans)}")
print(f"Metadata: {result.metadata}")

for i, span in enumerate(result.spans):
    print(f"  {i}: '{span.text}' [{span.start_char}:{span.end_char}]")
    # Verify mapping back to original: slicing the input with the span's
    # offsets must reproduce the span text (ignoring surrounding whitespace,
    # hence the strip()).
    assert text[span.start_char:span.end_char].strip() == span.text


Input: 'Hello world. How are you? I am fine!'
Spans found: 3
Metadata: {'processor': 'example-text-processor', 'method': 'regex'}
  0: 'Hello world.' [0:12]
  1: 'How are you?' [12:25]
  2: 'I am fine!' [25:36]


In [None]:
# Test execute() dispatcher (as Worker would call it)
# Reuses `text` from the previous cell; the result is a plain dict with
# 'spans' (one dict per span) and 'metadata'.
json_result = plugin.execute(action="split_sentences", text=text)

print(f"\nJSON result from execute():")
print(f"  spans: {len(json_result['spans'])} items")
print(f"  metadata: {json_result['metadata']}")

# Show each serialized span
for span_dict in json_result['spans']:
    print(f"    - {span_dict}")


JSON result from execute():
  spans: 3 items
  metadata: {'processor': 'example-text-processor', 'method': 'regex'}
    - {'text': 'Hello world.', 'start_char': 0, 'end_char': 12, 'label': 'sentence', 'metadata': {}}
    - {'text': 'How are you?', 'start_char': 12, 'end_char': 25, 'label': 'sentence', 'metadata': {}}
    - {'text': 'I am fine!', 'start_char': 25, 'end_char': 36, 'label': 'sentence', 'metadata': {}}


In [None]:
# Cleanup: call the plugin's cleanup hook now that the demo is finished
plugin.cleanup()

In [None]:
#| hide
# Run nbdev's export step for this notebook (presumably writes the `#| export`
# cells to the module named by `default_exp` — confirm against nbdev docs).
import nbdev; nbdev.nbdev_export()