# Voxtral HF Plugin

> Plugin implementation for Mistral Voxtral transcription through Hugging Face Transformers

In [None]:
#| default_exp plugin

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import sqlite3
import json
import time
import os
import sys
from uuid import uuid4
import logging
from pathlib import Path
from dataclasses import dataclass, field
from dataclasses import replace as dataclass_replace
from typing import Dict, Any, Optional, List, Union, Generator
import tempfile
import warnings
from threading import Thread

from fastcore.basics import patch

import numpy as np
import torch
import soundfile as sf

try:
    from transformers import VoxtralForConditionalGeneration, AutoProcessor
    from transformers import TextStreamer, TextIteratorStreamer
    VOXTRAL_AVAILABLE = True
except ImportError:
    VOXTRAL_AVAILABLE = False
    
from cjm_transcription_plugin_system.plugin_interface import TranscriptionPlugin
from cjm_transcription_plugin_system.core import AudioData, TranscriptionResult
from cjm_plugin_system.utils.validation import (
    dict_to_config, config_to_dict, validate_config, dataclass_to_jsonschema,
    SCHEMA_TITLE, SCHEMA_DESC, SCHEMA_MIN, SCHEMA_MAX, SCHEMA_ENUM
)
from cjm_transcription_plugin_voxtral_hf.meta import (
    get_plugin_metadata
)

In [None]:
#| export
@dataclass
class VoxtralHFPluginConfig:
    """Configuration for Voxtral HF transcription plugin.

    Each field carries JSON-schema metadata (title, description and, where
    given, enum/min/max) that `dataclass_to_jsonschema` turns into a schema
    for UI generation. Fields fall into three groups: load-time settings
    (model, device, dtype, quantization, compilation, caching), generation
    settings (language, token limit, sampling) and the streaming flag.
    """
    # Hugging Face repository id of the checkpoint to load.
    model_id:str = field(
        default="mistralai/Voxtral-Mini-3B-2507",
        metadata={
            SCHEMA_TITLE: "Model ID",
            SCHEMA_DESC: "Voxtral model to use. Mini is faster, Small is more accurate.",
            SCHEMA_ENUM: ["mistralai/Voxtral-Mini-3B-2507", "mistralai/Voxtral-Small-24B-2507"]
        }
    )
    # "auto" is resolved by the plugin's initialize(): CUDA when available, else CPU.
    device:str = field(
        default="auto",
        metadata={
            SCHEMA_TITLE: "Device",
            SCHEMA_DESC: "Device for inference (auto will use CUDA if available)",
            SCHEMA_ENUM: ["auto", "cpu", "cuda"]
        }
    )
    # "auto" is resolved by initialize(): bfloat16 on CUDA, float32 otherwise.
    dtype:str = field(
        default="auto",
        metadata={
            SCHEMA_TITLE: "Data Type",
            SCHEMA_DESC: "Data type for model weights (auto will use bfloat16 on GPU, float32 on CPU)",
            SCHEMA_ENUM: ["auto", "bfloat16", "float16", "float32"]
        }
    )
    # Optional: the transcription methods fall back to "en" when this is None or empty.
    language:Optional[str] = field(
        default="en",
        metadata={
            SCHEMA_TITLE: "Language",
            SCHEMA_DESC: "Language code for transcription (e.g., 'en', 'es', 'fr')"
        }
    )
    # Passed to model.generate() as max_new_tokens.
    max_new_tokens:int = field(
        default=25000,
        metadata={
            SCHEMA_TITLE: "Max New Tokens",
            SCHEMA_DESC: "Maximum number of tokens to generate",
            SCHEMA_MIN: 1,
            SCHEMA_MAX: 50000
        }
    )
    # When False, temperature and top_p below are not forwarded to generate().
    do_sample:bool = field(
        default=False,
        metadata={
            SCHEMA_TITLE: "Do Sample",
            SCHEMA_DESC: "Whether to use sampling (true) or greedy decoding (False)"
        }
    )
    # Only forwarded to generate() when do_sample is enabled.
    temperature:float = field(
        default=1.0,
        metadata={
            SCHEMA_TITLE: "Temperature",
            SCHEMA_DESC: "Temperature for sampling (only used when do_sample=true)",
            SCHEMA_MIN: 0.0,
            SCHEMA_MAX: 2.0
        }
    )
    # Only forwarded to generate() when do_sample is enabled.
    top_p:float = field(
        default=0.95,
        metadata={
            SCHEMA_TITLE: "Top P",
            SCHEMA_DESC: "Top-p (nucleus) sampling parameter (only used when do_sample=true)",
            SCHEMA_MIN: 0.0,
            SCHEMA_MAX: 1.0
        }
    )
    # NOTE(review): not read by the plugin code visible in this file; presumably
    # the caller uses it to choose between execute() and execute_stream() - confirm.
    streaming:bool = field(
        default=False,
        metadata={
            SCHEMA_TITLE: "Streaming",
            SCHEMA_DESC: "Enable streaming output (yields partial results as they're generated)"
        }
    )
    # Forwarded to both the processor and the model from_pretrained() calls.
    trust_remote_code:bool = field(
        default=False,
        metadata={
            SCHEMA_TITLE: "Trust Remote Code",
            SCHEMA_DESC: "Whether to trust remote code when loading the model"
        }
    )
    # Forwarded as cache_dir to from_pretrained(); None is passed through unchanged.
    cache_dir:Optional[str] = field(
        default=None,
        metadata={
            SCHEMA_TITLE: "Cache Directory",
            SCHEMA_DESC: "Directory to cache downloaded models"
        }
    )
    # Applied once, right after the model is loaded, and only if torch.compile exists.
    compile_model:bool = field(
        default=False,
        metadata={
            SCHEMA_TITLE: "Compile Model",
            SCHEMA_DESC: "Use torch.compile for potential speedup (requires PyTorch 2.0+)"
        }
    )
    # Takes precedence over load_in_4bit when both are set; when either is set
    # the configured dtype is not passed to from_pretrained().
    load_in_8bit:bool = field(
        default=False,
        metadata={
            SCHEMA_TITLE: "Load in 8-bit",
            SCHEMA_DESC: "Load model in 8-bit quantization (requires bitsandbytes)"
        }
    )
    # Ignored when load_in_8bit is also set.
    load_in_4bit:bool = field(
        default=False,
        metadata={
            SCHEMA_TITLE: "Load in 4-bit",
            SCHEMA_DESC: "Load model in 4-bit quantization (requires bitsandbytes)"
        }
    )


class VoxtralHFPlugin(TranscriptionPlugin):
    """Mistral Voxtral transcription plugin via Hugging Face Transformers."""
    
    config_class = VoxtralHFPluginConfig
    
    def __init__(self) -> None:
        """Create the plugin in an unconfigured, unloaded state.

        Only the logger is set up here. `config`, `device` and `dtype` stay
        `None` until `initialize` runs, and `model` / `processor` stay `None`
        until the first call that triggers `_load_model`.
        """
        # Logger is named "<module>.<ClassName>" so subclasses log under their own name.
        self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
        self.config: Optional[VoxtralHFPluginConfig] = None
        self.model = None
        self.processor = None
        self.device = None
        self.dtype = None
    
    @property
    def name(self) -> str: # Plugin name identifier
        """Get the plugin name identifier.

        Always the fixed string "voxtral_hf"; it does not depend on configuration.
        """
        return "voxtral_hf"
    
    @property
    def version(self) -> str: # Plugin version string
        """Get the plugin version string.

        Hard-coded here; it is not derived from package metadata.
        """
        return "1.0.0"
    
    @property
    def supported_formats(self) -> List[str]: # List of supported audio formats
        """Get the list of supported audio file formats.

        Extensions are given without a leading dot. The list includes video
        container extensions (mp4, avi, mov) alongside audio formats.
        A fresh list is built on every access, so callers may mutate it.
        """
        # NOTE(review): nothing in this file validates inputs against this list;
        # presumably the host plugin system uses it for filtering - confirm.
        return ["wav", "mp3", "flac", "m4a", "ogg", "webm", "mp4", "avi", "mov"]

    def get_current_config(self) -> Dict[str, Any]: # Current configuration as dictionary
        """Return the active configuration as a plain dictionary.

        An empty dict is returned while no configuration is set, i.e. before
        `initialize` has been called.
        """
        return config_to_dict(self.config) if self.config else {}

    def get_config_schema(self) -> Dict[str, Any]: # JSON Schema for configuration
        """Return JSON Schema for UI generation.

        The schema is derived from `VoxtralHFPluginConfig` (the class, not the
        currently active instance), so it is available before `initialize`.
        """
        return dataclass_to_jsonschema(VoxtralHFPluginConfig)

    @staticmethod
    def get_config_dataclass() -> VoxtralHFPluginConfig: # Configuration dataclass
        """Return dataclass describing the plugin's configuration options.

        The value returned is the `VoxtralHFPluginConfig` class itself, not an
        instance of it.
        """
        # NOTE(review): the return annotation names the class although the class
        # object is returned; kept as-is to match the existing interface.
        return VoxtralHFPluginConfig
    
    def initialize(
        self,
        config: Optional[Any] = None # Configuration dataclass, dict, or None
    ) -> None:
        """Initialize or re-configure the plugin (idempotent).

        Parses `config` into a `VoxtralHFPluginConfig`, unloads an already
        loaded model when any setting that is baked in at load time changed,
        then resolves the effective device and torch dtype. The model itself
        is loaded lazily on first use by `_load_model`.

        Raises `KeyError` if `dtype` is neither "auto" nor one of the known
        dtype names (only reachable when the config was not schema-validated).
        """
        # Parse new config
        new_config = dict_to_config(VoxtralHFPluginConfig, config or {})

        # Check for changes if already running
        if self.config:
            reload_needed = False

            # Settings that take effect only inside _load_model; a change to any
            # of them makes the currently loaded model stale.
            tracked_settings = (
                ("model_id", "Model"),
                ("device", "Device"),
                ("dtype", "Dtype"),
                ("compile_model", "Compile model"),
                ("trust_remote_code", "Trust remote code"),
            )
            for attr, label in tracked_settings:
                old_value = getattr(self.config, attr)
                new_value = getattr(new_config, attr)
                if old_value != new_value:
                    self.logger.info(f"Config change: {label} {old_value} -> {new_value}")
                    reload_needed = True

            if (self.config.load_in_8bit != new_config.load_in_8bit or
                self.config.load_in_4bit != new_config.load_in_4bit):
                self.logger.info("Config change: Quantization settings changed")
                reload_needed = True

            # Unload once, and before self.device is overwritten below:
            # _unload_model uses the *old* device to decide on CUDA cleanup.
            if reload_needed:
                self._unload_model()

        # Apply new config
        self.config = new_config

        # Set device
        if self.config.device == "auto":
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = self.config.device

        # Set dtype
        if self.config.dtype == "auto":
            self.dtype = torch.bfloat16 if self.device == "cuda" else torch.float32
        else:
            dtype_map = {
                "bfloat16": torch.bfloat16,
                "float16": torch.float16,
                "float32": torch.float32
            }
            self.dtype = dtype_map[self.config.dtype]

        self.logger.info(f"Initialized Voxtral HF plugin with model '{self.config.model_id}' on device '{self.device}' with dtype '{self.dtype}'")
    
    def _unload_model(self) -> None:
        """Release the loaded model and processor so they can be reloaded.

        Does nothing when neither is loaded. Otherwise the model is moved to
        the CPU first (when the plugin device is CUDA), both references are
        dropped, garbage collection is forced and the CUDA cache is emptied.
        Errors are logged rather than raised, and both references end up
        cleared either way.
        """
        nothing_loaded = self.model is None and self.processor is None
        if nothing_loaded:
            return

        self.logger.info("Unloading Voxtral model for reconfiguration")

        try:
            uses_cuda = self.device == "cuda"

            # Shift the weights off the GPU before the reference is dropped
            if uses_cuda and self.model is not None:
                try:
                    self.model = self.model.to('cpu')
                except Exception as move_error:
                    self.logger.warning(f"Could not move model to CPU: {move_error}")

            # Drop the processor first, then the model
            self.processor = None
            self.model = None

            # Collect now so the freed tensors are gone before the cache is emptied
            import gc
            gc.collect()

            if uses_cuda and torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

        except Exception as unload_error:
            self.logger.error(f"Error during model unload: {unload_error}")
            self.model = None
            self.processor = None
    
    def _load_model(self) -> None:
        """Load the Voxtral model and processor (lazy loading).

        A no-op when both are already loaded. Otherwise the processor and the
        model are loaded from `config.model_id` onto `self.device`; the model
        is optionally quantized via bitsandbytes (8-bit wins over 4-bit) and
        optionally wrapped with `torch.compile`.

        Raises `RuntimeError` when the transformers Voxtral classes could not
        be imported, when `initialize` has not been called, or when loading
        fails (the underlying error is chained as `__cause__`).
        """
        if self.model is not None and self.processor is not None:
            return

        # Fail with an actionable message instead of a wrapped NameError/AttributeError
        if not VOXTRAL_AVAILABLE:
            raise RuntimeError(
                "Failed to load Voxtral model: transformers with Voxtral support is not installed"
            )
        if self.config is None:
            raise RuntimeError(
                "Failed to load Voxtral model: plugin is not initialized, call initialize() first"
            )

        try:
            self.logger.info(f"Loading Voxtral model: {self.config.model_id}")

            # Load processor
            self.processor = AutoProcessor.from_pretrained(
                self.config.model_id,
                cache_dir=self.config.cache_dir,
                trust_remote_code=self.config.trust_remote_code
            )

            # Model loading kwargs
            model_kwargs = {
                "cache_dir": self.config.cache_dir,
                "trust_remote_code": self.config.trust_remote_code,
                "device_map": self.device,
            }

            # Quantization goes through BitsAndBytesConfig; the bare
            # load_in_8bit/load_in_4bit kwargs of from_pretrained are deprecated.
            # As before, the explicit dtype is only passed for unquantized loads.
            if self.config.load_in_8bit or self.config.load_in_4bit:
                from transformers import BitsAndBytesConfig
                if self.config.load_in_8bit:
                    quantization = BitsAndBytesConfig(load_in_8bit=True)
                else:
                    quantization = BitsAndBytesConfig(load_in_4bit=True)
                model_kwargs["quantization_config"] = quantization
            else:
                model_kwargs["dtype"] = self.dtype

            # Load model
            self.model = VoxtralForConditionalGeneration.from_pretrained(
                self.config.model_id,
                **model_kwargs
            )

            # Optionally compile the model (PyTorch 2.0+)
            if self.config.compile_model and hasattr(torch, 'compile'):
                self.model = torch.compile(self.model)
                self.logger.info("Model compiled with torch.compile")

            self.logger.info("Voxtral model loaded successfully")
        except Exception as e:
            raise RuntimeError(f"Failed to load Voxtral model: {e}") from e
    
    def _prepare_audio(
        self,
        audio: Union[AudioData, str, Path] # Audio data, file path, or Path object to prepare
    ) -> str: # Path to the prepared audio file
        """Prepare audio for Voxtral processing.

        Paths are returned unchanged (as `str`). `AudioData` is down-mixed to
        mono, converted to float32, peak-normalized when it exceeds [-1, 1] and
        written to a temporary ".wav" file whose path is returned; the caller
        owns that file and must delete it.

        Raises `ValueError` for any other input type.
        """
        if isinstance(audio, (str, Path)):
            # Already a file path
            return str(audio)

        if not isinstance(audio, AudioData):
            raise ValueError(f"Unsupported audio input type: {type(audio)}")

        audio_array = np.asarray(audio.samples)

        # If multi-channel, convert to mono
        # NOTE(review): averaging over axis 1 assumes a (frames, channels) layout - confirm.
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)

        if audio_array.dtype != np.float32:
            audio_array = audio_array.astype(np.float32)

        # Measure the peak by magnitude so overshoot on the negative side is
        # caught too; an empty array has no peak and is left untouched.
        peak = float(np.abs(audio_array).max()) if audio_array.size else 0.0
        if peak > 1.0:
            audio_array = audio_array / peak

        # Reserve a file name, then close the handle before soundfile opens the
        # path itself (an open NamedTemporaryFile cannot be reopened on Windows).
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_path = tmp_file.name

        try:
            sf.write(tmp_path, audio_array, audio.sample_rate)
        except Exception:
            # delete=False means nobody else removes the file if writing fails
            try:
                Path(tmp_path).unlink()
            except OSError:
                pass
            raise
        return tmp_path

    def _init_db(self) -> None:
        """Ensure the `transcriptions` table and its `job_id` index exist.

        The database location comes from `get_plugin_metadata()["db_path"]`.
        Both statements use IF NOT EXISTS, so repeated calls are harmless.
        """
        from contextlib import closing

        db_path = get_plugin_metadata()["db_path"]
        # A sqlite3 connection used as a context manager only commits or rolls
        # back; closing() is what actually releases the connection afterwards.
        with closing(sqlite3.connect(db_path)) as con:
            with con:
                con.execute("""
                CREATE TABLE IF NOT EXISTS transcriptions (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    job_id TEXT,
                    audio_path TEXT,
                    text TEXT,
                    segments JSON,
                    metadata JSON,
                    created_at REAL
                )
            """)
                con.execute("CREATE INDEX IF NOT EXISTS idx_job_id ON transcriptions(job_id)")

    def _save_to_db(self, result: TranscriptionResult, audio_path: str, **kwargs) -> None:
        """Save a transcription result to SQLite (best effort).

        Inserts one row into `transcriptions` with the text, JSON-encoded
        segments/metadata and the current timestamp. `kwargs["job_id"]` is used
        as the job id when present and non-empty; otherwise a random UUID is
        generated. Any failure is logged and swallowed so that persistence
        problems never fail the transcription itself.
        """
        from contextlib import closing

        try:
            self._init_db()
            db_path = get_plugin_metadata()["db_path"]

            # `or` also covers an explicit job_id=None, which would otherwise be stored as NULL
            job_id = kwargs.get("job_id") or str(uuid4())

            # Serialize complex objects
            segments_json = json.dumps(result.segments) if result.segments else None
            metadata_json = json.dumps(result.metadata)

            # The inner `with con` commits (or rolls back); closing() releases
            # the connection, which the sqlite3 context manager does not do.
            with closing(sqlite3.connect(db_path)) as con:
                with con:
                    con.execute(
                        """
                    INSERT INTO transcriptions 
                    (job_id, audio_path, text, segments, metadata, created_at)
                    VALUES (?, ?, ?, ?, ?, ?)
                    """,
                        (job_id, str(audio_path), result.text, segments_json, metadata_json, time.time())
                    )
            # Logged only after the transaction has been committed
            self.logger.info(f"Saved result to DB (Job: {job_id})")

        except Exception as e:
            self.logger.error(f"Failed to save to DB: {e}")
    
    def execute(
        self,
        audio: Union[AudioData, str, Path], # Audio data or path to audio file to transcribe
        **kwargs # Additional arguments to override config
    ) -> TranscriptionResult: # Transcription result with text and metadata
        """Transcribe audio using Voxtral.

        Loads the model on first use, builds a transcription request for the
        audio, generates the full transcript in one call, stores the result in
        the plugin database (best effort) and returns it.

        Recognized `kwargs`: `model_id`, `language`, `max_new_tokens`,
        `do_sample`, `temperature`, `top_p` (override the config for this call),
        `source_start_time` / `source_end_time` (copied into the result
        metadata) and `job_id` (used when saving to the database).
        """
        # Load model if not already loaded
        self._load_model()

        # Prepare audio file; anything that is not a path was written to a temp file
        audio_path = self._prepare_audio(audio)
        temp_file_created = not isinstance(audio, (str, Path))

        try:
            # Get config values, allowing kwargs overrides
            # NOTE(review): a model_id override only changes the request and the
            # reported metadata; the loaded weights still come from the config.
            model_id = kwargs.get("model_id", self.config.model_id)
            language = kwargs.get("language", self.config.language)
            max_new_tokens = kwargs.get("max_new_tokens", self.config.max_new_tokens)
            do_sample = kwargs.get("do_sample", self.config.do_sample)
            temperature = kwargs.get("temperature", self.config.temperature)
            top_p = kwargs.get("top_p", self.config.top_p)

            # Prepare inputs
            self.logger.info(f"Processing audio with Voxtral {model_id}")

            inputs = self.processor.apply_transcription_request(
                language=language or "en",
                audio=str(audio_path),
                model_id=model_id
            )
            inputs = inputs.to(self.device, dtype=self.dtype)

            # Generation kwargs
            generation_kwargs = {
                "max_new_tokens": max_new_tokens,
                "do_sample": do_sample,
            }

            # Add sampling parameters if sampling is enabled
            if do_sample:
                generation_kwargs["temperature"] = temperature
                generation_kwargs["top_p"] = top_p

            # Generate transcription
            with torch.no_grad():
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")

                    outputs = self.model.generate(
                        **inputs,
                        **generation_kwargs
                    )

            # Decode only the newly generated tokens (skip the prompt part)
            result_text = self.processor.batch_decode(
                outputs[:, inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )[0]

            # Clean up tensors immediately
            del inputs
            del outputs

            # Clear GPU cache if using CUDA
            if self.device == "cuda" and torch.cuda.is_available():
                torch.cuda.empty_cache()

            # Capture provenance metadata passed via kwargs
            provenance_meta = {
                k: v for k, v in kwargs.items()
                if k in ['source_start_time', 'source_end_time']
            }

            # Create transcription result
            transcription_result = TranscriptionResult(
                text=result_text.strip(),
                confidence=None,  # Voxtral doesn't provide confidence scores
                segments=None,  # Voxtral doesn't provide segments by default
                metadata={
                    "model": model_id,
                    **provenance_meta,
                    "language": language or "en",
                    "device": self.device,
                    "dtype": str(self.dtype),
                }
            )

            # Record the caller's path; in-memory input has no meaningful path,
            # and str() of the audio object would store its repr instead.
            original_path = "in_memory_data" if temp_file_created else str(audio)

            # Save to database
            self._save_to_db(transcription_result, original_path, **kwargs)

            self.logger.info(f"Transcription completed: {len(result_text.split())} words")
            return transcription_result

        finally:
            # Clean up temporary file if created (best effort)
            if temp_file_created:
                try:
                    Path(audio_path).unlink()
                except OSError as e:
                    self.logger.debug(f"Could not remove temporary audio file {audio_path}: {e}")

    def is_available(self) -> bool: # True if Voxtral and its dependencies are available
        """Check if Voxtral is available.

        Reports whether the transformers Voxtral classes could be imported
        when this module was loaded. It does not check for model weights,
        a GPU, or bitsandbytes.
        """
        return VOXTRAL_AVAILABLE
    
    def cleanup(self) -> None:
        """Release the model and processor and reclaim as much memory as possible.

        Logs and returns when nothing is loaded. Otherwise the model is moved
        to the CPU (when the plugin device is CUDA), both references are
        dropped, garbage collection is forced and the CUDA caches are emptied,
        followed by a log line with the remaining GPU memory. Errors are logged
        instead of raised, and both references are cleared regardless.
        """
        nothing_loaded = self.model is None and self.processor is None
        if nothing_loaded:
            self.logger.info("No models to clean up")
            return

        self.logger.info("Unloading Voxtral model")

        try:
            uses_cuda = self.device == "cuda"

            # Moving to the CPU first frees the GPU copy of the weights right away
            if uses_cuda and self.model is not None:
                try:
                    self.model = self.model.to('cpu')
                    self.logger.debug("Model moved to CPU")
                except Exception as move_error:
                    self.logger.warning(f"Could not move model to CPU: {move_error}")

            # Processor goes first: it may hold references to model components
            if self.processor is not None:
                self.processor = None
                self.logger.debug("Processor deleted")

            if self.model is not None:
                self.model = None
                self.logger.debug("Model deleted")

            # Collect before touching the CUDA allocator so freed tensors are gone
            import gc
            gc.collect()

            if uses_cuda and torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
                torch.cuda.ipc_collect()

                # Report what is still held on the GPU, in GiB
                bytes_per_gb = 1024**3
                allocated = torch.cuda.memory_allocated() / bytes_per_gb
                reserved = torch.cuda.memory_reserved() / bytes_per_gb
                self.logger.info(f"GPU memory after cleanup - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB")

            self.logger.info("Cleanup completed successfully")

        except Exception as cleanup_error:
            self.logger.error(f"Error during cleanup: {cleanup_error}")
            # Make sure nothing stays referenced even when cleanup failed midway
            self.model = None
            self.processor = None

## Streaming Support

The `execute_stream` method provides real-time transcription output:

- **Yields**: Partial transcription text chunks as they become available during generation
- **Returns**: Final `TranscriptionResult` with complete text and metadata after streaming completes
- **Parameters**: Same as `execute()` - accepts `AudioData`, file path string, or `Path` object, plus optional kwargs to override config
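- **Usage**: Iterate over the generator to receive text chunks. The final `TranscriptionResult` is the generator's *return value*, so a plain `for` loop discards it; capture it with `result = yield from plugin.execute_stream(...)` inside another generator, or by calling `next()` manually and reading `StopIteration.value`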

In [None]:
#| export
@patch
def supports_streaming(
    self:VoxtralHFPlugin
) -> bool:  # True if streaming is supported
    """Check if this plugin supports streaming transcription.

    Always True: `execute_stream` is attached to `VoxtralHFPlugin` in this
    same module. The `streaming` config flag is not consulted here.
    """
    return True

@patch
def execute_stream(
    self:VoxtralHFPlugin,
    audio: Union[AudioData, str, Path],  # Audio data or path to audio file
    **kwargs  # Additional plugin-specific parameters
) -> Generator[str, None, TranscriptionResult]:  # Yields text chunks, returns final result
    """Stream transcription results chunk by chunk.

    Generation runs in a background thread that feeds a
    `TextIteratorStreamer`; this generator yields each decoded chunk as it
    arrives and finally returns a `TranscriptionResult` holding the complete
    text (available as `StopIteration.value` or via `yield from`).

    Accepts the same override `kwargs` as `execute` (`model_id`, `language`,
    `max_new_tokens`, `do_sample`, `temperature`, `top_p`) and copies
    `source_start_time` / `source_end_time` into the result metadata.
    Unlike `execute`, the result is not written to the database.

    Raises `RuntimeError` (chained to the original error) if generation
    fails in the background thread.
    """
    # Load model if not already loaded
    self._load_model()

    # Prepare audio file; anything that is not a path was written to a temp file
    audio_path = self._prepare_audio(audio)
    temp_file_created = not isinstance(audio, (str, Path))

    try:
        # Get config values, allowing kwargs overrides
        model_id = kwargs.get("model_id", self.config.model_id)
        language = kwargs.get("language", self.config.language)
        max_new_tokens = kwargs.get("max_new_tokens", self.config.max_new_tokens)
        do_sample = kwargs.get("do_sample", self.config.do_sample)
        temperature = kwargs.get("temperature", self.config.temperature)
        top_p = kwargs.get("top_p", self.config.top_p)

        # Prepare inputs
        self.logger.info(f"Streaming transcription with Voxtral {model_id}")

        inputs = self.processor.apply_transcription_request(
            language=language or "en",
            audio=str(audio_path),
            model_id=model_id
        )
        inputs = inputs.to(self.device, dtype=self.dtype)

        # Create streamer
        from transformers import TextIteratorStreamer
        streamer = TextIteratorStreamer(
            self.processor.tokenizer,
            skip_prompt=True,
            skip_special_tokens=True
        )

        # Generation kwargs
        generation_kwargs = {
            **inputs,
            "max_new_tokens": max_new_tokens,
            "do_sample": do_sample,
            "streamer": streamer,
        }

        # Add sampling parameters if sampling is enabled
        if do_sample:
            generation_kwargs["temperature"] = temperature
            generation_kwargs["top_p"] = top_p

        # Errors raised in the worker thread are collected here and re-raised
        # in the consumer once the stream has ended.
        generation_errors = []

        def generate_with_no_grad():
            try:
                with torch.no_grad():
                    self.model.generate(**generation_kwargs)
            except Exception as e:
                generation_errors.append(e)
                # generate() never reached its own end-of-stream signal; send it
                # so the consumer loop below terminates instead of blocking forever.
                streamer.end()

        thread = Thread(target=generate_with_no_grad)
        thread.start()

        # Collect generated text while passing each chunk on to the caller
        chunks = []
        for text_chunk in streamer:
            chunks.append(text_chunk)
            yield text_chunk

        # Wait for generation to complete
        thread.join()

        # Release the input tensors; generation_kwargs holds its own references
        # to them, so deleting `inputs` alone would not free anything.
        generation_kwargs.clear()
        del inputs

        # Clear GPU cache if using CUDA
        if self.device == "cuda" and torch.cuda.is_available():
            torch.cuda.empty_cache()

        if generation_errors:
            cause = generation_errors[0]
            raise RuntimeError(f"Voxtral streaming generation failed: {cause}") from cause

        generated_text = "".join(chunks)

        # Capture provenance metadata passed via kwargs (same keys as execute)
        provenance_meta = {
            k: v for k, v in kwargs.items()
            if k in ['source_start_time', 'source_end_time']
        }

        # Return final result
        return TranscriptionResult(
            text=generated_text.strip(),
            confidence=None,
            segments=None,
            metadata={
                "model": model_id,
                **provenance_meta,
                "language": language or "en",
                "device": self.device,
                "dtype": str(self.dtype),
                "streaming": True,
            }
        )

    finally:
        # Clean up temporary file if created (best effort)
        if temp_file_created:
            try:
                Path(audio_path).unlink()
            except OSError as e:
                self.logger.debug(f"Could not remove temporary audio file {audio_path}: {e}")

## Testing the Plugin

In [None]:
# Test basic functionality
# Constructing the plugin loads no model; `plugin` is reused by the cells below.
plugin = VoxtralHFPlugin()

# Check availability (True when the transformers Voxtral classes were importable)
print(f"Voxtral available: {plugin.is_available()}")
# Static identity and capability information
print(f"Plugin name: {plugin.name}")
print(f"Plugin version: {plugin.version}")
print(f"Supported formats: {plugin.supported_formats}")
print(f"Config class: {plugin.config_class.__name__}")
print(f"Supports streaming: {plugin.supports_streaming()}")

Voxtral available: True
Plugin name: voxtral_hf
Plugin version: 1.0.0
Supported formats: ['wav', 'mp3', 'flac', 'm4a', 'ogg', 'webm', 'mp4', 'avi', 'mov']
Config class: VoxtralHFPluginConfig
Supports streaming: True


In [None]:
# Test configuration dataclass: list the model ids offered by the schema enum
from dataclasses import fields

print("Available models:")
config_fields = {f.name: f for f in fields(VoxtralHFPluginConfig)}
available_models = config_fields["model_id"].metadata.get(SCHEMA_ENUM, [])
for model_name in available_models:
    print(f"  - {model_name}")

Available models:
  - mistralai/Voxtral-Mini-3B-2507
  - mistralai/Voxtral-Small-24B-2507


In [None]:
# Test configuration validation: one accepted config and two that must be rejected
validation_cases = {
    "Valid config": {"model_id": "mistralai/Voxtral-Mini-3B-2507"},
    "Invalid model": {"model_id": "invalid_model"},
    "Temperature out of range": {"model_id": "mistralai/Voxtral-Mini-3B-2507", "temperature": 2.5},
}

for case_name, raw_config in validation_cases.items():
    try:
        dict_to_config(VoxtralHFPluginConfig, raw_config, validate=True)
    except ValueError as validation_error:
        print(f"{case_name}: Valid=False")
        # Only the first 100 characters of the message are shown
        print(f"  Error: {str(validation_error)[:100]}")
    else:
        print(f"{case_name}: Valid=True")

Valid config: Valid=True
Invalid model: Valid=False
  Error: model_id: 'invalid_model' is not one of ['mistralai/Voxtral-Mini-3B-2507', 'mistralai/Voxtral-Small-
Temperature out of range: Valid=False
  Error: temperature: 2.5 is greater than maximum 2.0


In [None]:
# Test initialization and get_current_config (returns dict now)
# initialize() only resolves config/device/dtype; no model weights are loaded here.
plugin.initialize({"model_id": "mistralai/Voxtral-Mini-3B-2507", "device": "cpu"})
current_config = plugin.get_current_config()
print(f"Current config (dict): model_id={current_config['model_id']}")

Current config (dict): model_id=mistralai/Voxtral-Mini-3B-2507


In [None]:
#| eval: false
# Test get_config_schema for UI generation
import json

schema = plugin.get_config_schema()
schema_properties = schema['properties']

print("JSON Schema for VoxtralHFPluginConfig:")
print(f"  Name: {schema['name']}")
print(f"  Properties count: {len(schema_properties)}")
print(f"  Model field enum: {schema_properties['model_id'].get('enum', [])}")

# Show only the first three properties to keep the output short
print("\nSample properties:")
sample_properties = dict(list(schema_properties.items())[:3])
print(json.dumps(sample_properties, indent=2))

JSON Schema for VoxtralHFPluginConfig:
  Name: VoxtralHFPluginConfig
  Properties count: 14
  Model field enum: ['mistralai/Voxtral-Mini-3B-2507', 'mistralai/Voxtral-Small-24B-2507']

Sample properties:
{
  "model_id": {
    "type": "string",
    "title": "Model ID",
    "description": "Voxtral model to use. Mini is faster, Small is more accurate.",
    "enum": [
      "mistralai/Voxtral-Mini-3B-2507",
      "mistralai/Voxtral-Small-24B-2507"
    ],
    "default": "mistralai/Voxtral-Mini-3B-2507"
  },
  "device": {
    "type": "string",
    "title": "Device",
    "description": "Device for inference (auto will use CUDA if available)",
    "enum": [
      "auto",
      "cpu",
      "cuda"
    ],
    "default": "auto"
  },
  "dtype": {
    "type": "string",
    "title": "Data Type",
    "description": "Data type for model weights (auto will use bfloat16 on GPU, float32 on CPU)",
    "enum": [
      "auto",
      "bfloat16",
      "float16",
      "float32"
    ],
    "default": "auto"
  

In [None]:
#| hide
# Export the cells marked `#| export` from this notebook into the package module.
import nbdev; nbdev.nbdev_export()