# Content Structure

> Domain schemas for content structure (Document, Segment)

In [None]:
#| default_exp domains.structure

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from typing import Optional
from pydantic import Field

from cjm_graph_domains.core import DomainNode

## Document

Represents a logical container for content such as a book chapter, podcast episode, lecture, or transcript. Documents group related segments into a traversable unit.

In [None]:
#| export
class Document(DomainNode):
    """A logical container for structured content."""

    # Required title of the document
    title: str = Field(
        default=...,
        description="Title of the document",
    )
    # Source media type, "audio" unless given; intended values are
    # 'audio', 'video', or 'text' (the annotation itself is a plain str)
    media_type: str = Field(
        default="audio",
        description="Source media type",
    )

In [None]:
# Render the generated API documentation entry for Document
show_doc(Document)

---

[source](https://github.com/cj-mills/cjm-graph-domains/blob/main/cjm_graph_domains/domains/structure.py#L15){target="_blank" style="float:right; font-size:smaller"}

### Document

```python

def Document(
    data:Any
)->None:


```

*A logical container for structured content.*

In [None]:
# Instantiate a Document node, then display its fields and its label
doc = Document(
    title="1. Laying Plans",
    media_type="audio",
)
print(
    f"Document: {doc.title}",
    f"Media type: {doc.media_type}",
    f"Label: {doc.get_label()}",
    sep="\n",
)

Document: 1. Laying Plans
Media type: audio
Label: Document


In [None]:
# Convert to GraphNode (name auto-populated from title).
# `to_graph_node` is not defined on Document itself; it is inherited via DomainNode.
graph_node = doc.to_graph_node()
print(f"GraphNode properties: {graph_node.properties}")
# Besides the declared fields, the properties should carry a `name` equal to the title
assert graph_node.properties['name'] == "1. Laying Plans"

GraphNode properties: {'title': '1. Laying Plans', 'media_type': 'audio', 'name': '1. Laying Plans'}


## Segment

Represents an atomic unit of text within a document, typically a sentence or paragraph. Segments are linked sequentially via `NEXT` edges to form a traversable "narrative spine".

For audio/video content, optional timing fields (`start_time`, `end_time`) enable alignment with the source media.

In [None]:
#| export
class Segment(DomainNode):
    """An atomic unit of text within a document."""

    # Required text content of the segment
    text: str = Field(
        default=...,
        description="The segment text content",
    )
    # Required 0-indexed position of the segment within its sequence
    index: int = Field(
        default=...,
        description="Sequence position (0-indexed)",
    )
    # Audio/video start in seconds; None when no timing is supplied
    start_time: Optional[float] = Field(
        default=None,
        description="Start time in seconds",
    )
    # Audio/video end in seconds; None when no timing is supplied
    end_time: Optional[float] = Field(
        default=None,
        description="End time in seconds",
    )
    # Segment role, "content" unless given; intended values are
    # 'content', 'title', or 'heading' (the annotation itself is a plain str)
    role: str = Field(
        default="content",
        description="Segment role",
    )

In [None]:
# Render the generated API documentation entry for Segment
show_doc(Segment)

---

[source](https://github.com/cj-mills/cjm-graph-domains/blob/main/cjm_graph_domains/domains/structure.py#L22){target="_blank" style="float:right; font-size:smaller"}

### Segment

```python

def Segment(
    data:Any
)->None:


```

*An atomic unit of text within a document.*

In [None]:
# Build a timed content Segment, then display its position, text, and timing
segment = Segment(
    index=2,
    role="content",
    text="The art of war is of vital importance to the state.",
    start_time=5.2,
    end_time=8.7,
)
print(
    f"Segment [{segment.index}]: {segment.text}",
    f"Timing: {segment.start_time}s - {segment.end_time}s",
    sep="\n",
)

Segment [2]: The art of war is of vital importance to the state.
Timing: 5.2s - 8.7s


In [None]:
# Convert to GraphNode (name auto-populated from text, truncated to 50 chars).
# Note: this rebinds `graph_node`, which previously held the Document's node.
graph_node = segment.to_graph_node()
print(f"GraphNode name: '{graph_node.properties['name']}'")
# The segment text is 51 characters long, so this exercises the truncation
assert len(graph_node.properties['name']) <= 50

GraphNode name: 'The art of war is of vital importance to the state'


In [None]:
# Title segment (without timing): only the required fields plus an explicit role
title_segment = Segment(text="Laying Plans", index=0, role="title")
print(f"Title segment: {title_segment.text} (role={title_segment.role})")
# start_time was not passed, so it falls back to its declared default of None
assert title_segment.start_time is None

Title segment: Laying Plans (role=title)


## Example: Building a Narrative Spine

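This example demonstrates creating the `Document` and `Segment` nodes of a narrative spine from transcript content, each tagged with a source reference for provenance. Linking the segment nodes with `NEXT` edges is not shown here.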

In [None]:
from cjm_graph_plugin_system.core import SourceRef

# Provenance record pointing at the transcription job row; it is attached to
# every node created below
source = SourceRef(
    plugin_name="cjm-transcription-plugin-voxtral-hf",
    table_name="transcriptions",
    row_id="b0ceddd3-05a0-40e6-ac99-1903dd3e7170",
)

# Container node for the whole transcript
doc = Document(title="1. Laying Plans", media_type="audio")

# Transcript sentences in reading order; the first entry is the title line
sentences = [
    "Laying Plans",
    "Sun Tzu said,",
    "The art of war is of vital importance to the state.",
    "It is a matter of life and death, a road either to safety or to ruin.",
    "Hence it is a subject of inquiry which can on no account be neglected.",
]

# One Segment per sentence: position 0 gets the "title" role, the rest "content"
segments = [
    Segment(text=sentence, index=position, role="content" if position else "title")
    for position, sentence in enumerate(sentences)
]

# Export everything as GraphNodes; each call receives its own one-element sources list
doc_node = doc.to_graph_node(sources=[source])
segment_nodes = [item.to_graph_node(sources=[source]) for item in segments]

print(
    f"Document: {doc_node.properties['name']}",
    f"Segments: {len(segment_nodes)}",
    sep="\n",
)
# Preview each segment name, capped at 40 characters.
# NOTE(review): the trailing "..." is appended unconditionally, so names shorter
# than 40 characters also appear with an ellipsis.
for node in segment_nodes:
    print(f"  [{node.properties['index']}] {node.properties['name'][:40]}...")

Document: 1. Laying Plans
Segments: 5
  [0] Laying Plans...
  [1] Sun Tzu said,...
  [2] The art of war is of vital importance to...
  [3] It is a matter of life and death, a road...
  [4] Hence it is a subject of inquiry which c...


In [None]:
#| hide
# Run nbdev's export step so the `#| export` cells are written to the library module
import nbdev

nbdev.nbdev_export()