# Core Data Structures

> DTOs for audio transcription with FileBackedDTO support for zero-copy transfer

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional

import numpy as np
import soundfile as sf

from cjm_plugin_system.core.interface import FileBackedDTO

In [None]:
#| export
@dataclass
class AudioData:
    """
    Container for raw audio data.

    Implements FileBackedDTO (through `to_temp_file`) so the samples can be handed
    to a Worker process as a WAV file on disk instead of being serialized inline.
    """
    samples: np.ndarray  # Audio sample data as numpy array
    sample_rate: int     # Sample rate in Hz (e.g., 16000, 44100)

    def to_temp_file(self) -> str: # Absolute path to temporary WAV file
        """Save audio to a temp WAV file for transfer to the Worker process.

        The samples are converted to float32 and, when their peak magnitude
        exceeds 1.0, scaled down so the peak becomes 1.0. `self.samples` itself
        is never modified.

        The file is created with `delete=False`, so it is NOT removed
        automatically: whoever consumes the returned path must delete it.
        If writing fails, the partially created file is removed before the
        exception propagates.
        """
        # Ensure float32 format (astype returns a copy, so the original is untouched)
        audio = np.asarray(self.samples)
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Normalize if needed. Skip empty input: max() of a zero-size array raises.
        if audio.size:
            peak = np.abs(audio).max()
            if peak > 1.0:
                audio = audio / peak

        # Reserve a unique path, then close our handle right away so the writer
        # below is the only one holding the file open.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            wav_path = Path(tmp.name).absolute()

        try:
            sf.write(str(wav_path), audio, self.sample_rate)
        except Exception:
            # Do not leave a stray temp file behind when the write fails.
            if wav_path.exists():
                wav_path.unlink()
            raise

        return str(wav_path)

    def to_dict(self) -> Dict[str, Any]: # Serialized representation
        """Convert to a plain dictionary.

        Samples are expanded into a (possibly nested) Python list, so this is
        only suitable for small payloads; use `to_temp_file` for larger audio.
        """
        return {
            "samples": self.samples.tolist(),
            "sample_rate": self.sample_rate
        }

    @classmethod
    def from_file(
        cls,
        filepath: str # Path to audio file
    ) -> "AudioData": # AudioData instance
        """Load audio from a file, reading the samples as float32."""
        samples, sample_rate = sf.read(filepath, dtype='float32')
        return cls(samples=samples, sample_rate=sample_rate)

In [None]:
#| export
@dataclass
class TranscriptionResult:
    """
    Standardized output for all transcription plugins.

    Only `text` is required; every other field has a default, so
    `TranscriptionResult(text=...)` is a valid minimal result.
    """
    text: str                                        # The transcribed text
    confidence: Optional[float] = None               # Overall confidence (0.0 to 1.0); None when not reported
    segments: Optional[List[Dict[str, Any]]] = None  # Timestamped segments (one dict per segment); None when not reported
    metadata: Dict[str, Any] = field(default_factory=dict)  # Additional metadata; each instance gets its own empty dict by default

## Testing AudioData

AudioData implements the `FileBackedDTO` protocol, which means the `RemotePluginProxy` will automatically serialize it to a temp file before sending to the Worker.

In [None]:
# Test AudioData creation
audio = AudioData(
    samples=np.sin(np.linspace(0, 2*np.pi*440, 16000)),  # 1 second of 440Hz tone
    sample_rate=16000
)

print(f"AudioData: {len(audio.samples)} samples at {audio.sample_rate}Hz")
assert len(audio.samples) == 16000

# Test FileBackedDTO protocol
print(f"\nImplements FileBackedDTO: {isinstance(audio, FileBackedDTO)}")
assert isinstance(audio, FileBackedDTO)

# Test to_temp_file (this is what the Proxy calls)
temp_path = audio.to_temp_file()
temp_file = Path(temp_path)
try:
    print(f"Saved to temp file: {temp_path}")

    # Verify the file exists and can be read back
    print(f"File exists: {temp_file.exists()}")
    print(f"File size: {temp_file.stat().st_size} bytes")
    assert temp_file.is_file()
    assert temp_file.stat().st_size > 0

    # Round-trip through from_file: same length and sample rate must come back
    round_trip = AudioData.from_file(temp_path)
    assert round_trip.sample_rate == audio.sample_rate
    assert len(round_trip.samples) == len(audio.samples)
finally:
    # Clean up even when one of the checks above fails
    if temp_file.exists():
        temp_file.unlink()

AudioData: 16000 samples at 16000Hz

Implements FileBackedDTO: True
Saved to temp file: /tmp/tmpbpdzmhhx.wav
File exists: True
File size: 32044 bytes


In [None]:
# Test TranscriptionResult
result = TranscriptionResult(
    text="Hello world",
    confidence=0.95,
    segments=[
        {"start": 0.0, "end": 0.5, "text": "Hello"},
        {"start": 0.5, "end": 1.0, "text": "world"}
    ],
    metadata={"model": "whisper-large-v3", "language": "en"}
)

print(f"Text: {result.text}")
print(f"Confidence: {result.confidence}")
print(f"Segments: {result.segments}")
print(f"Metadata: {result.metadata}")

# Assert the fields so a regression fails the cell instead of only changing its output
assert result.text == "Hello world"
assert result.confidence == 0.95
assert [segment["text"] for segment in result.segments] == ["Hello", "world"]
assert result.metadata == {"model": "whisper-large-v3", "language": "en"}

Text: Hello world
Confidence: 0.95
Segments: [{'start': 0.0, 'end': 0.5, 'text': 'Hello'}, {'start': 0.5, 'end': 1.0, 'text': 'world'}]
Metadata: {'model': 'whisper-large-v3', 'language': 'en'}


In [None]:
# Test minimal result (only text required)
minimal = TranscriptionResult(text="Just the text")
print(f"Minimal result: {minimal}")

# Optional fields fall back to their declared defaults
assert minimal.confidence is None
assert minimal.segments is None
assert minimal.metadata == {}

# The default metadata dict must not be shared between instances
assert TranscriptionResult(text="another").metadata is not minimal.metadata

# Test from_file class method (if audio file available)
# audio_loaded = AudioData.from_file("path/to/audio.wav")

Minimal result: TranscriptionResult(text='Just the text', confidence=None, segments=None, metadata={})


In [None]:
#| hide
# Run nbdev's export step for this project (presumably writes the `#| export`
# cells to the module named by `default_exp` — confirm against the nbdev docs).
import nbdev; nbdev.nbdev_export()