# Core Data Structures

> DTOs for Context Graph operations with FileBackedDTO support for zero-copy transfer

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import json
import tempfile
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any, Dict, List, Optional

from cjm_plugin_system.core.interface import FileBackedDTO

## SourceRef

A pointer to external data residing in another plugin's domain. Used to anchor graph nodes to ground truth (e.g., a specific transcript segment from a transcription plugin).

This enables **federation** via DuckDB - a query can join graph nodes with their source data across isolated plugin databases.

In [None]:
#| export
@dataclass
class SourceRef:
    """Reference to a piece of data that lives in a different plugin's domain."""
    plugin_name: str  # Plugin holding the data, e.g., "cjm-transcription-plugin-voxtral-hf"
    table_name: str   # Table within that plugin, e.g., "transcriptions"
    row_id: str       # Row identifier, e.g., "b0ceddd3-..." (typically a job_id)
    segment_slice: Optional[str] = None  # Optional sub-range: "char:0-500" or "timestamp:00:10-00:20"

    def to_dict(self) -> Dict[str, Any]:  # Field name -> value mapping for JSON serialization
        """Return this reference's fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping

In [None]:
# Test SourceRef creation
ref = SourceRef(
    plugin_name="cjm-transcription-plugin-voxtral-hf",
    table_name="transcriptions",
    row_id="b0ceddd3-1234-5678-9abc-def012345678",
    segment_slice="timestamp:00:10-00:30"
)

# Assert as well as print, so an automated notebook run fails on a regression
# instead of relying on someone reading the printed output
assert ref.to_dict() == {
    "plugin_name": "cjm-transcription-plugin-voxtral-hf",
    "table_name": "transcriptions",
    "row_id": "b0ceddd3-1234-5678-9abc-def012345678",
    "segment_slice": "timestamp:00:10-00:30",
}

print(f"SourceRef: {ref}")
print(f"As dict: {ref.to_dict()}")

SourceRef: SourceRef(plugin_name='cjm-transcription-plugin-voxtral-hf', table_name='transcriptions', row_id='b0ceddd3-1234-5678-9abc-def012345678', segment_slice='timestamp:00:10-00:30')
As dict: {'plugin_name': 'cjm-transcription-plugin-voxtral-hf', 'table_name': 'transcriptions', 'row_id': 'b0ceddd3-1234-5678-9abc-def012345678', 'segment_slice': 'timestamp:00:10-00:30'}


In [None]:
# Test without optional segment_slice
ref_minimal = SourceRef(
    plugin_name="cjm-transcription-plugin-whisper",
    table_name="segments",
    row_id="job-123"
)

# The omitted field must fall back to None and still appear in the serialized form
assert ref_minimal.segment_slice is None
assert ref_minimal.to_dict()["segment_slice"] is None

print(f"Minimal SourceRef: {ref_minimal}")

Minimal SourceRef: SourceRef(plugin_name='cjm-transcription-plugin-whisper', table_name='segments', row_id='job-123', segment_slice=None)


## GraphNode

Represents an entity in the Context Graph. Each node has:

- **id**: Unique identifier (UUID)
- **label**: Node type (e.g., "Person", "Concept", "Correction")
- **properties**: Arbitrary key-value data
- **sources**: Links to external plugin data for provenance tracking

In [None]:
#| export
@dataclass
class GraphNode:
    """Represents an entity in the Context Graph."""
    id: str           # UUID
    label: str        # e.g., "Person", "Concept", "Correction"
    properties: Dict[str, Any] = field(default_factory=dict)  # Arbitrary metadata
    sources: List[SourceRef] = field(default_factory=list)    # Links to external plugins

    def to_dict(self) -> Dict[str, Any]:  # Dictionary representation for JSON serialization
        """Convert to a dictionary with nested sources, detached from this node.

        Keys are `id`, `label`, `properties` and `sources` (in that order); each
        entry of `sources` is the dictionary form of the corresponding SourceRef.
        """
        # asdict recurses into the SourceRef dataclasses held in `sources` and copies
        # `properties`, so mutating the returned dict can no longer alter this node
        # (the same detached-copy behavior as the other DTOs' to_dict methods).
        return asdict(self)

In [None]:
import uuid

# Build the SourceRef that anchors the node (same values as the SourceRef test above)
ref = SourceRef(
    plugin_name="cjm-transcription-plugin-voxtral-hf",
    table_name="transcriptions",
    row_id="b0ceddd3-1234-5678-9abc-def012345678",
    segment_slice="timestamp:00:10-00:30"
)

print(f"SourceRef: {ref}")
print(f"As dict: {ref.to_dict()}")

# Test GraphNode creation with sources
node = GraphNode(
    id=str(uuid.uuid4()),
    label="Person",
    properties={"name": "Alice", "role": "speaker"},
    sources=[ref]  # Link to transcript segment
)

# Assert the serialized shape so an automated run fails on regressions
node_dict = node.to_dict()
assert node_dict["id"] == node.id
assert node_dict["label"] == "Person"
assert node_dict["properties"] == {"name": "Alice", "role": "speaker"}
assert node_dict["sources"] == [ref.to_dict()]
# The dictionary must survive a JSON round trip unchanged
assert json.loads(json.dumps(node_dict)) == node_dict

print(f"GraphNode: {node}")
print(f"\nAs dict:\n{json.dumps(node.to_dict(), indent=2)}")

SourceRef: SourceRef(plugin_name='cjm-transcription-plugin-voxtral-hf', table_name='transcriptions', row_id='b0ceddd3-1234-5678-9abc-def012345678', segment_slice='timestamp:00:10-00:30')
As dict: {'plugin_name': 'cjm-transcription-plugin-voxtral-hf', 'table_name': 'transcriptions', 'row_id': 'b0ceddd3-1234-5678-9abc-def012345678', 'segment_slice': 'timestamp:00:10-00:30'}
GraphNode: GraphNode(id='84f11516-40d1-4741-bfb5-882d6d018148', label='Person', properties={'name': 'Alice', 'role': 'speaker'}, sources=[SourceRef(plugin_name='cjm-transcription-plugin-voxtral-hf', table_name='transcriptions', row_id='b0ceddd3-1234-5678-9abc-def012345678', segment_slice='timestamp:00:10-00:30')])

As dict:
{
  "id": "84f11516-40d1-4741-bfb5-882d6d018148",
  "label": "Person",
  "properties": {
    "name": "Alice",
    "role": "speaker"
  },
  "sources": [
    {
      "plugin_name": "cjm-transcription-plugin-voxtral-hf",
      "table_name": "transcriptions",
      "row_id": "b0ceddd3-1234-5678-9abc-de

In [None]:
# Test minimal node (no properties, no sources)
concept = GraphNode(
    id=str(uuid.uuid4()),
    label="Concept"
)

# Defaults must be empty containers, and must not be shared between instances
assert concept.to_dict() == {"id": concept.id, "label": "Concept", "properties": {}, "sources": []}
assert concept.properties is not GraphNode(id="other", label="Concept").properties

print(f"Minimal node: {concept.to_dict()}")

Minimal node: {'id': '6c354a53-a620-4bb7-adf2-dc9583915d5d', 'label': 'Concept', 'properties': {}, 'sources': []}


## GraphEdge

Represents a typed relationship between two nodes. Edges are directional (source → target) and can carry properties.

In [None]:
#| export
@dataclass
class GraphEdge:
    """A typed, directed link from one graph node to another."""
    id: str            # UUID of the edge itself
    source_id: str     # UUID of the node the edge starts at
    target_id: str     # UUID of the node the edge points to
    relation_type: str # Kind of relationship, e.g., "MENTIONS", "CORRECTS", "AUTHORED_BY"
    properties: Dict[str, Any] = field(default_factory=dict)  # Free-form key-value metadata

    def to_dict(self) -> Dict[str, Any]:  # Field name -> value mapping for JSON serialization
        """Return this edge's fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping

In [None]:
# Create two nodes and an edge between them
person_id = str(uuid.uuid4())
concept_id = str(uuid.uuid4())

person_node = GraphNode(id=person_id, label="Person", properties={"name": "Bob"})
concept_node = GraphNode(id=concept_id, label="Concept", properties={"name": "Machine Learning"})

edge = GraphEdge(
    id=str(uuid.uuid4()),
    source_id=person_id,
    target_id=concept_id,
    relation_type="MENTIONS",
    properties={"confidence": 0.95, "timestamp": "00:15:30"}
)

# Assert the serialized edge so an automated run fails on regressions
assert edge.to_dict() == {
    "id": edge.id,
    "source_id": person_id,
    "target_id": concept_id,
    "relation_type": "MENTIONS",
    "properties": {"confidence": 0.95, "timestamp": "00:15:30"},
}

print(f"Edge: {person_node.properties['name']} --[{edge.relation_type}]--> {concept_node.properties['name']}")
print(f"\nAs dict: {edge.to_dict()}")

Edge: Bob --[MENTIONS]--> Machine Learning

As dict: {'id': '2b4057bd-4199-4c72-a2da-158d1eb5f886', 'source_id': '205355aa-3dae-426a-a7cd-2ec340dd94b5', 'target_id': '5217de9d-ba75-4443-bea9-18b0a2b3cdfc', 'relation_type': 'MENTIONS', 'properties': {'confidence': 0.95, 'timestamp': '00:15:30'}}


## GraphContext

Container for graph query results (a subgraph). Implements `FileBackedDTO` for zero-copy transfer of large subgraphs between Host and Worker processes.

When passed through `RemotePluginProxy`, large GraphContext objects are automatically serialized to temp files rather than sent inline via JSON.

In [None]:
#| export
@dataclass
class GraphContext:
    """Container for graph query results (a subgraph)."""
    nodes: List[GraphNode]    # Nodes in the subgraph
    edges: List[GraphEdge]    # Edges in the subgraph
    metadata: Dict[str, Any] = field(default_factory=dict)  # Query metadata, stats, etc.

    def to_temp_file(self) -> str:  # Absolute path to temporary JSON file
        """Save graph data to a temp file for zero-copy transfer.

        The file is created with `delete=False`, so it outlives this call and the
        caller is responsible for removing it. If writing the JSON fails, the
        partially written file is removed before the error is re-raised.
        """
        # Build the payload first: a failure here never leaves a file behind
        payload = self.to_dict()

        tmp = tempfile.NamedTemporaryFile(suffix=".json", delete=False, mode='w', encoding='utf-8')
        tmp_path = Path(tmp.name)
        try:
            # The context manager closes the handle even when json.dump raises
            with tmp:
                json.dump(payload, tmp)
        except BaseException:
            # Do not leave an orphaned, half-written file for a failed serialization
            if tmp_path.exists():
                tmp_path.unlink()
            raise
        return str(tmp_path.absolute())

    def to_dict(self) -> Dict[str, Any]:  # Dictionary representation for JSON serialization
        """Convert to dictionary with `nodes`, `edges` and `metadata` keys."""
        return {
            "nodes": [n.to_dict() for n in self.nodes],
            "edges": [e.to_dict() for e in self.edges],
            "metadata": self.metadata
        }

    @classmethod
    def from_file(
        cls,
        filepath: str  # Path to JSON file
    ) -> "GraphContext":  # Reconstructed GraphContext
        """Load graph context from a JSON file."""
        # Read as UTF-8 explicitly instead of depending on the platform's locale encoding
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Single reconstruction path shared with from_dict
        return cls.from_dict(data)

    @classmethod
    def from_dict(
        cls,
        data: Dict[str, Any]  # Dictionary with nodes, edges, metadata
    ) -> "GraphContext":  # Reconstructed GraphContext
        """Load graph context from a dictionary.

        Missing `nodes`, `edges` or `metadata` keys are treated as empty; each node
        entry must provide `id` and `label`, while `properties` and `sources` are optional.
        """
        nodes = [
            GraphNode(
                id=n_data['id'],
                label=n_data['label'],
                properties=n_data.get('properties', {}),
                sources=[SourceRef(**s) for s in n_data.get('sources', [])]
            )
            for n_data in data.get('nodes', [])
        ]
        edges = [GraphEdge(**e) for e in data.get('edges', [])]
        return cls(nodes=nodes, edges=edges, metadata=data.get('metadata', {}))

In [None]:
import os

# Create two nodes and an edge between them
person_id = str(uuid.uuid4())
concept_id = str(uuid.uuid4())

person_node = GraphNode(id=person_id, label="Person", properties={"name": "Bob"})
concept_node = GraphNode(id=concept_id, label="Concept", properties={"name": "Machine Learning"})

edge = GraphEdge(
    id=str(uuid.uuid4()),
    source_id=person_id,
    target_id=concept_id,
    relation_type="MENTIONS",
    properties={"confidence": 0.95, "timestamp": "00:15:30"}
)

print(f"Edge: {person_node.properties['name']} --[{edge.relation_type}]--> {concept_node.properties['name']}")
print(f"\nAs dict: {edge.to_dict()}")

# Test GraphContext creation
context = GraphContext(
    nodes=[person_node, concept_node],
    edges=[edge],
    metadata={"query": "neighbors of Bob", "depth": 1}
)

assert len(context.nodes) == 2 and len(context.edges) == 1
print(f"GraphContext: {len(context.nodes)} nodes, {len(context.edges)} edges")
print(f"Metadata: {context.metadata}")

# Test FileBackedDTO protocol
assert isinstance(context, FileBackedDTO)
print(f"Implements FileBackedDTO: {isinstance(context, FileBackedDTO)}")

# Test to_temp_file (this is what the Proxy calls)
temp_path = context.to_temp_file()
try:
    print(f"Saved to temp file: {temp_path}")

    # Verify file exists and content
    assert os.path.isabs(temp_path)
    assert os.path.exists(temp_path)
    print(f"File exists: {os.path.exists(temp_path)}")
    print(f"File size: {os.path.getsize(temp_path)} bytes")

    # Read back and verify
    with open(temp_path) as f:
        content = json.load(f)
    assert content == context.to_dict()
    print(f"\nFile content keys: {content.keys()}")
    print(f"Nodes count: {len(content['nodes'])}")
finally:
    # Clean up even when one of the checks above fails
    os.unlink(temp_path)

Edge: Bob --[MENTIONS]--> Machine Learning

As dict: {'id': 'b38050e9-340b-40cb-b5c4-c19abb974518', 'source_id': '74bd93af-afe3-4bd1-b2f8-628808e4c063', 'target_id': '91f3eaf8-2a04-4872-8d48-297cd8a0d1b3', 'relation_type': 'MENTIONS', 'properties': {'confidence': 0.95, 'timestamp': '00:15:30'}}
GraphContext: 2 nodes, 1 edges
Metadata: {'query': 'neighbors of Bob', 'depth': 1}
Implements FileBackedDTO: True
Saved to temp file: /tmp/tmpqv06eyjh.json
File exists: True
File size: 561 bytes

File content keys: dict_keys(['nodes', 'edges', 'metadata'])
Nodes count: 2


In [None]:
# Test from_file round-trip
temp_path = context.to_temp_file()
try:
    loaded = GraphContext.from_file(temp_path)
finally:
    # Remove the temp file even if loading fails
    os.unlink(temp_path)

# Dataclass equality compares every node, edge and the metadata, not just the counts
assert loaded == context

print(f"Original: {len(context.nodes)} nodes, {len(context.edges)} edges")
print(f"Loaded:   {len(loaded.nodes)} nodes, {len(loaded.edges)} edges")
print(f"Node labels match: {[n.label for n in context.nodes] == [n.label for n in loaded.nodes]}")
print(f"Edge types match: {[e.relation_type for e in context.edges] == [e.relation_type for e in loaded.edges]}")

Original: 2 nodes, 1 edges
Loaded:   2 nodes, 1 edges
Node labels match: True
Edge types match: True


In [None]:
# Test from_dict
ctx_dict = context.to_dict()
loaded_from_dict = GraphContext.from_dict(ctx_dict)

# The dictionary round trip must reproduce an equal context
assert loaded_from_dict == context
# A dictionary without any keys yields an empty context
assert GraphContext.from_dict({}) == GraphContext(nodes=[], edges=[])

print(f"From dict: {len(loaded_from_dict.nodes)} nodes, {len(loaded_from_dict.edges)} edges")

From dict: 2 nodes, 1 edges


## GraphQuery

A standardized query object that can represent:

- Raw query strings (SQL, Cypher, etc.)
- Structured search parameters

The `depth` parameter is used for neighborhood traversals.

In [None]:
#| export
@dataclass
class GraphQuery:
    """Uniform description of a query to run against the graph."""
    query: str  # The query text itself (SQL, Cypher, etc.)
    parameters: Dict[str, Any] = field(default_factory=dict)  # Values bound to the query
    limit: int = 100   # Upper bound on the number of results
    depth: int = 1     # How far to traverse for neighborhood queries

    def to_dict(self) -> Dict[str, Any]:  # Field name -> value mapping for JSON serialization
        """Return this query's fields as a plain dictionary."""
        as_mapping = asdict(self)
        return as_mapping

In [None]:
# Test GraphQuery with SQL-style query
sql_query = GraphQuery(
    query="SELECT * FROM nodes WHERE label = :label",
    parameters={"label": "Person"},
    limit=50
)

# Explicit values are kept and the omitted depth falls back to its default
assert sql_query.to_dict() == {
    "query": "SELECT * FROM nodes WHERE label = :label",
    "parameters": {"label": "Person"},
    "limit": 50,
    "depth": 1,
}

print(f"SQL Query: {sql_query}")
print(f"As dict: {sql_query.to_dict()}")

SQL Query: GraphQuery(query='SELECT * FROM nodes WHERE label = :label', parameters={'label': 'Person'}, limit=50, depth=1)
As dict: {'query': 'SELECT * FROM nodes WHERE label = :label', 'parameters': {'label': 'Person'}, 'limit': 50, 'depth': 1}


In [None]:
# Test GraphQuery for neighborhood traversal
# NOTE: relies on `person_id` defined by an earlier cell
traversal_query = GraphQuery(
    query="NEIGHBORS",
    parameters={"start_node": person_id},
    depth=2,
    limit=100
)

assert traversal_query.depth == 2
assert traversal_query.limit == 100
assert traversal_query.parameters == {"start_node": person_id}

print(f"Traversal Query: {traversal_query}")
print(f"Traversal Query: {traversal_query}")

Traversal Query: GraphQuery(query='NEIGHBORS', parameters={'start_node': '74bd93af-afe3-4bd1-b2f8-628808e4c063'}, limit=100, depth=2)


In [None]:
# Test minimal query
simple = GraphQuery(query="SELECT * FROM nodes")

# Pin the documented defaults so a change to them fails an automated run
assert simple.limit == 100
assert simple.depth == 1
assert simple.parameters == {}

print(f"Simple query defaults: limit={simple.limit}, depth={simple.depth}")

Simple query defaults: limit=100, depth=1


In [None]:
#| hide
# Export the cells marked `#| export` to the library module
import nbdev
nbdev.nbdev_export()