# Core Data Structures

> DTOs for text processing with character-level span tracking

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from dataclasses import dataclass, field, asdict
from typing import List, Dict, Any

In [None]:
#| export
@dataclass
class TextSpan:
    """A piece of text together with the character offsets it occupies in the source string."""
    text: str        # Content covered by this span
    start_char: int  # Offset of the first character in the original string (0-indexed)
    end_char: int    # Offset just past the last character (0-indexed, exclusive)
    label: str = "sentence"  # Kind of span, e.g. 'sentence', 'token', 'paragraph'
    metadata: Dict[str, Any] = field(default_factory=dict)  # Free-form extra data attached to the span

    def to_dict(self) -> Dict[str, Any]:  # Mapping of field name to value
        """Return every field of the span as a plain dictionary, ready for serialization."""
        # `asdict` copies nested containers, so the result is detached from `self.metadata`.
        as_mapping = asdict(self)
        return as_mapping

In [None]:
#| export
@dataclass
class TextProcessResult:
    """Container for text processing results."""
    spans: List[TextSpan]  # List of text spans from processing
    metadata: Dict[str, Any] = field(default_factory=dict)  # Processing metadata

    def to_dict(self) -> Dict[str, Any]:  # Dictionary representation, nested spans included
        """Convert the result to a dictionary for serialization.

        Mirrors `TextSpan.to_dict`: `asdict` recurses into the dataclass
        instances held in `spans`, so each span becomes a plain dict too.
        """
        return asdict(self)

## Testing TextSpan

`TextSpan` tracks character positions so you can map processed results back to the original text.

In [None]:
# Test TextSpan creation
span = TextSpan(
    text="Hello world.",
    start_char=0,
    end_char=12,
    label="sentence"
)

# Assert the expected state instead of only printing it, so a regression
# fails the notebook run rather than silently changing the output below.
assert span.end_char - span.start_char == len(span.text)  # [start, end) is exactly as long as the text
assert span.metadata == {}  # default_factory supplies an empty dict when none is passed
assert span.to_dict() == {
    "text": "Hello world.",
    "start_char": 0,
    "end_char": 12,
    "label": "sentence",
    "metadata": {},
}

print(f"TextSpan: '{span.text}'")
print(f"Position: [{span.start_char}, {span.end_char})")
print(f"Label: {span.label}")
print(f"As dict: {span.to_dict()}")

TextSpan: 'Hello world.'
Position: [0, 12)
Label: sentence
As dict: {'text': 'Hello world.', 'start_char': 0, 'end_char': 12, 'label': 'sentence', 'metadata': {}}


In [None]:
# Test TextSpan with metadata
span_with_meta = TextSpan(
    text="This is a token.",
    start_char=13,
    end_char=30,
    label="token",
    metadata={"pos": "NOUN", "confidence": 0.98}
)

print(f"Span with metadata: {span_with_meta.to_dict()}")

Span with metadata: {'text': 'This is a token.', 'start_char': 13, 'end_char': 30, 'label': 'token', 'metadata': {'pos': 'NOUN', 'confidence': 0.98}}


## Testing TextProcessResult

`TextProcessResult` holds multiple spans from a processing operation.

In [None]:
# Test TextProcessResult with multiple spans
original_text = "Hello world. How are you?"

result = TextProcessResult(
    spans=[
        TextSpan(text="Hello world.", start_char=0, end_char=12, label="sentence"),
        TextSpan(text="How are you?", start_char=13, end_char=25, label="sentence"),
    ],
    metadata={"processor": "example", "language": "en"}
)

# Check the container itself before walking its spans.
assert len(result.spans) == 2
assert result.metadata == {"processor": "example", "language": "en"}

print(f"Number of spans: {len(result.spans)}")
print(f"Metadata: {result.metadata}")

# Dedicated loop names: reusing `span` here would rebind the notebook-level
# `span` created in the TextSpan creation cell.
for idx, sentence_span in enumerate(result.spans):
    print(f"  Span {idx}: '{sentence_span.text}' [{sentence_span.start_char}:{sentence_span.end_char}]")
    # Verify span maps back to original text
    recovered = original_text[sentence_span.start_char:sentence_span.end_char]
    assert recovered == sentence_span.text, (
        f"span {idx} maps to {recovered!r}, expected {sentence_span.text!r}"
    )

Number of spans: 2
Metadata: {'processor': 'example', 'language': 'en'}
  Span 0: 'Hello world.' [0:12]
  Span 1: 'How are you?' [13:25]


In [None]:
# Test minimal result (empty spans)
empty_result = TextProcessResult(spans=[])

# Assert the defaults instead of only printing them.
assert empty_result.spans == []
assert empty_result.metadata == {}
# default_factory builds a fresh dict per instance, so results never share metadata.
assert TextProcessResult(spans=[]).metadata is not empty_result.metadata

print(f"Empty result: {len(empty_result.spans)} spans, metadata: {empty_result.metadata}")

Empty result: 0 spans, metadata: {}


In [None]:
#| hide
# Run nbdev's export step for this project.
import nbdev

nbdev.nbdev_export()