diff --git a/bootstrap/src/host/engine.rs b/bootstrap/src/host/engine.rs
new file mode 100644
index 00000000..30ef3bbf
--- /dev/null
+++ b/bootstrap/src/host/engine.rs
@@ -0,0 +1,316 @@
+use super::csr_map;
+use super::driver::{BitnetDriver, DriverError};
+use super::irq::IrqDrivenDriver;
+use super::mmio::MockMmio;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct InferenceReport { // Outcome summary returned by `InferenceEngine::run`.
+    pub total_layers: u32, // number of layers the engine was configured to run
+    pub layers_completed: u32, // layers that passed all three waits in `run`
+    pub error_layer: Option<u32>, // layer at which `run` stopped on an engine error, if any
+    pub total_writes: usize, // MMIO write count read from the mock after the loop
+    pub total_reads: usize, // MMIO read count read from the mock after the loop
+}
+
+pub struct InferenceEngine { // Multi-layer inference sequencer over a mock-MMIO IRQ driver.
+    driver: IrqDrivenDriver<MockMmio>, // IRQ-servicing wrapper around the CSR driver
+    num_layers: u32, // layers iterated by `run` (2 after `new`)
+    neurons: u32, // neurons per layer (16 after `new`)
+    chunks: u32, // chunks per neuron (4 after `new`)
+    threshold: u32, // forwarded unchanged to the driver's `configure` (1 after `new`)
+    weight_addr: u64, // base weight address; `run` adds a per-layer offset to it
+}
+
+impl InferenceEngine {
+    /// Wraps `driver` in an [`IrqDrivenDriver`] and seeds the default geometry:
+    /// 2 layers, 16 neurons, 4 chunks, threshold 1, weight base address 0.
+    pub fn new(driver: BitnetDriver<MockMmio>) -> Self {
+        Self {
+            driver: IrqDrivenDriver::new(driver),
+            num_layers: 2,
+            neurons: 16,
+            chunks: 4,
+            threshold: 1,
+            weight_addr: 0,
+        }
+    }
+
+    /// Records the geometry used by the next [`run`](Self::run).
+    ///
+    /// Nothing is written to the device here; `run` forwards these values to
+    /// the underlying driver's own `configure`.
+    ///
+    /// # Errors
+    /// Returns [`DriverError::InvalidConfig`] if `num_layers`, `neurons` or
+    /// `chunks` is zero. `threshold` and `weight_addr` are accepted as-is.
+    pub fn configure(
+        &mut self,
+        num_layers: u32,
+        neurons: u32,
+        chunks: u32,
+        threshold: u32,
+        weight_addr: u64,
+    ) -> Result<(), DriverError> {
+        if num_layers == 0 || neurons == 0 || chunks == 0 {
+            return Err(DriverError::InvalidConfig);
+        }
+        self.num_layers = num_layers;
+        self.neurons = neurons;
+        self.chunks = chunks;
+        self.threshold = threshold;
+        self.weight_addr = weight_addr;
+        Ok(())
+    }
+
+    /// Programs the driver, then walks every layer through three stages:
+    /// a DMA-done wait, an inference-done wait, and a second DMA-done wait.
+    ///
+    /// The mock completes each stage immediately: the matching IRQ bit (and,
+    /// for inference, the done flag) is latched right before each wait.
+    /// `max_rounds_per_stage` bounds the IRQ-service rounds of every wait.
+    ///
+    /// # Errors
+    /// Propagates any error from the driver's `configure`, and any wait
+    /// failure other than [`DriverError::EngineError`] (for example a
+    /// timeout). An engine error does not fail the call: the loop stops and
+    /// the offending layer is reported in [`InferenceReport::error_layer`].
+    pub fn run(&mut self, max_rounds_per_stage: u32) -> Result<InferenceReport, DriverError> {
+        let driver = self.driver.handler_mut().driver_mut();
+        driver.configure(
+            self.num_layers,
+            self.neurons,
+            self.chunks,
+            self.threshold,
+            self.weight_addr,
+        )?;
+        driver.enable_irqs(csr_map::IRQ_ALL_MASK);
+
+        let mut layers_completed: u32 = 0;
+        let mut error_layer: Option<u32> = None;
+
+        for layer in 0..self.num_layers {
+            match self.run_layer(layer, max_rounds_per_stage) {
+                Ok(()) => layers_completed += 1,
+                // An engine error ends the run early but is reported, not returned.
+                Err(e) if e == DriverError::EngineError => {
+                    error_layer = Some(layer);
+                    break;
+                }
+                Err(e) => return Err(e),
+            }
+        }
+
+        let mmio = self.driver.handler().driver().mmio();
+        Ok(InferenceReport {
+            total_layers: self.num_layers,
+            layers_completed,
+            error_layer,
+            total_writes: mmio.write_count(),
+            total_reads: mmio.read_count(),
+        })
+    }
+
+    /// Runs the prefetch / inference / drain sequence for a single layer.
+    fn run_layer(&mut self, layer: u32, max_rounds: u32) -> Result<(), DriverError> {
+        // Wrapping arithmetic throughout: `configure` accepts any non-zero geometry,
+        // so a plain product can overflow `u64` (debug panic) before the wrapping add.
+        let layer_stride = 0x1_0000u64
+            .wrapping_mul(u64::from(self.neurons))
+            .wrapping_mul(u64::from(self.chunks));
+        let layer_weight_addr = self
+            .weight_addr
+            .wrapping_add(u64::from(layer).wrapping_mul(layer_stride));
+        // Truncating casts are intended: the address is split across two 32-bit CSRs.
+        let mmio = self.driver.handler_mut().driver_mut().mmio_mut();
+        mmio.poke(csr_map::WEIGHT_ADDR_LO, layer_weight_addr as u32);
+        mmio.poke(csr_map::WEIGHT_ADDR_HI, (layer_weight_addr >> 32) as u32);
+
+        // Stage 1 (prefetch): the mock latches DMA-done, then we wait for it.
+        mmio.latch_irq(csr_map::IRQ_DMA_DONE_MASK);
+        self.wait_dma_done(max_rounds)?;
+
+        // Stage 2 (inference): start the engine; the mock reports completion.
+        self.driver.handler_mut().driver_mut().start();
+        let mmio = self.driver.handler_mut().driver_mut().mmio_mut();
+        mmio.latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK);
+        mmio.set_done(true);
+        self.wait_inference_done(max_rounds)?;
+
+        // Stage 3 (drain): a second DMA-done completes the layer.
+        let mmio = self.driver.handler_mut().driver_mut().mmio_mut();
+        mmio.latch_irq(csr_map::IRQ_DMA_DONE_MASK);
+        self.wait_dma_done(max_rounds)
+    }
+
+    /// Waits for the DMA-done IRQ, servicing at most `max_rounds` rounds.
+    fn wait_dma_done(&mut self, max_rounds: u32) -> Result<(), DriverError> {
+        self.driver.wait_irq_mask(csr_map::IRQ_DMA_DONE_MASK, max_rounds)
+    }
+
+    /// Waits for inference completion, servicing at most `max_rounds` rounds.
+    fn wait_inference_done(&mut self, max_rounds: u32) -> Result<(), DriverError> {
+        self.driver.wait_irq_mask(csr_map::IRQ_INFERENCE_DONE_MASK, max_rounds)
+    }
+
+    /// Shared access to the wrapped IRQ-driven driver.
+    pub fn driver(&self) -> &IrqDrivenDriver<MockMmio> {
+        &self.driver
+    }
+
+    /// Exclusive access to the wrapped IRQ-driven driver.
+    pub fn driver_mut(&mut self) -> &mut IrqDrivenDriver<MockMmio> {
+        &mut self.driver
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::host::mmio::MockMmio;
+    /// Engine with the default geometry on top of a mock whose CSRs start zeroed.
+    fn new_engine() -> InferenceEngine {
+        InferenceEngine::new(BitnetDriver::new(MockMmio::with_csrs_zeroed()))
+    }
+
+    #[test]
+    fn configure_rejects_zero_layers() {
+        let mut engine = new_engine();
+        assert_eq!(engine.configure(0, 1, 1, 1, 0), Err(DriverError::InvalidConfig));
+    }
+
+    #[test]
+    fn configure_rejects_zero_neurons() {
+        let mut engine = new_engine();
+        assert_eq!(engine.configure(1, 0, 1, 1, 0), Err(DriverError::InvalidConfig));
+    }
+
+    #[test]
+    fn configure_rejects_zero_chunks() {
+        let mut engine = new_engine();
+        assert_eq!(engine.configure(1, 1, 0, 1, 0), Err(DriverError::InvalidConfig));
+    }
+
+    #[test]
+    fn run_single_layer_succeeds() {
+        let mut engine = new_engine();
+        engine.configure(1, 16, 4, 1, 0).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert_eq!(outcome.total_layers, 1);
+        assert_eq!(outcome.layers_completed, 1);
+        assert_eq!(outcome.error_layer, None);
+    }
+
+    #[test]
+    fn run_two_layers_succeeds() {
+        let mut engine = new_engine();
+        engine.configure(2, 16, 4, 1, 0).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert_eq!(outcome.total_layers, 2);
+        assert_eq!(outcome.layers_completed, 2);
+    }
+
+    #[test]
+    fn run_reports_writes_and_reads() {
+        let mut engine = new_engine();
+        engine.configure(1, 16, 4, 1, 0).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert!(outcome.total_writes > 0);
+        assert!(outcome.total_reads > 0);
+    }
+
+    #[test]
+    fn run_increases_writes_with_more_layers() {
+        let mut small = new_engine();
+        small.configure(1, 16, 4, 1, 0).unwrap();
+        let small_report = small.run(4).unwrap();
+
+        let mut large = new_engine();
+        large.configure(3, 16, 4, 1, 0).unwrap();
+        let large_report = large.run(4).unwrap();
+
+        assert!(large_report.total_writes > small_report.total_writes);
+    }
+
+    #[test]
+    fn run_error_on_irq_error_stops_early() {
+        let mut engine = new_engine();
+        engine.configure(3, 16, 4, 1, 0).unwrap();
+        engine.driver_mut()
+            .handler_mut()
+            .driver_mut()
+            .mmio_mut()
+            .latch_irq(csr_map::IRQ_ERROR_MASK);
+        let outcome = engine.run(4).unwrap();
+        assert!(outcome.layers_completed < outcome.total_layers);
+        assert!(outcome.error_layer.is_some());
+    }
+
+    #[test]
+    fn run_returns_error_on_zero_rounds() {
+        let mut engine = new_engine();
+        engine.configure(1, 16, 4, 1, 0).unwrap();
+        let outcome = engine.run(0);
+        assert_eq!(outcome, Err(DriverError::Timeout));
+    }
+
+    #[test]
+    fn five_layers_all_complete() {
+        let mut engine = new_engine();
+        engine.configure(5, 4, 2, 1, 0).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert_eq!(outcome.layers_completed, 5);
+        assert_eq!(outcome.error_layer, None);
+    }
+
+    #[test]
+    fn one_layer_one_neuron_one_chunk() {
+        let mut engine = new_engine();
+        engine.configure(1, 1, 1, 0, 0).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert_eq!(outcome.layers_completed, 1);
+    }
+
+    #[test]
+    fn threshold_zero_is_valid() {
+        let mut engine = new_engine();
+        engine.configure(1, 4, 2, 0, 0).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert_eq!(outcome.layers_completed, 1);
+    }
+
+    #[test]
+    fn large_weight_addr_wraps() {
+        let mut engine = new_engine();
+        engine.configure(1, 4, 2, 1, 0xFFFF_FFFF_FFFF_FFFF).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert_eq!(outcome.layers_completed, 1);
+    }
+
+    #[test]
+    fn read_count_positive_after_run() {
+        let mut engine = new_engine();
+        engine.configure(1, 4, 4, 1, 0).unwrap();
+        let outcome = engine.run(4).unwrap();
+        assert!(outcome.total_reads >= 1);
+    }
+
+    #[test]
+    fn writes_increase_monotonically_with_layers() {
+        let reports: Vec<InferenceReport> = (1..=4)
+            .map(|layer_count| {
+                let mut engine = new_engine();
+                engine.configure(layer_count, 4, 2, 1, 0).unwrap();
+                engine.run(4).unwrap()
+            })
+            .collect();
+        for pair in reports.windows(2) {
+            assert!(pair[1].total_writes > pair[0].total_writes);
+        }
+    }
+
+    #[test]
+    fn configure_accepts_max_values() {
+        let mut engine = new_engine();
+        assert!(engine.configure(u32::MAX, u32::MAX, u32::MAX, u32::MAX, u64::MAX).is_ok());
+    }
+}
diff --git a/bootstrap/src/host/irq.rs b/bootstrap/src/host/irq.rs
new file mode 100644
index 00000000..c8aa724a
--- /dev/null
+++ b/bootstrap/src/host/irq.rs
@@ -0,0 +1,257 @@
+use super::csr_map;
+use super::driver::{BitnetDriver, DriverError};
+use super::mmio::Mmio;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum IrqSource { // One variant per IRQ status bit decoded by `IrqSource::from_mask`.
+    InferenceDone, // maps to csr_map::IRQ_INFERENCE_DONE_MASK
+    DmaDone, // maps to csr_map::IRQ_DMA_DONE_MASK
+    Error, // maps to csr_map::IRQ_ERROR_MASK
+}
+
+impl IrqSource {
+    /// Returns the `csr_map` status-bit mask that corresponds to this source.
+    pub fn mask(self) -> u32 {
+        match self {
+            Self::InferenceDone => csr_map::IRQ_INFERENCE_DONE_MASK,
+            Self::DmaDone => csr_map::IRQ_DMA_DONE_MASK,
+            Self::Error => csr_map::IRQ_ERROR_MASK,
+        }
+    }
+
+    /// Decodes a status word into the sources whose mask bits are set.
+    ///
+    /// The result is always ordered `InferenceDone`, `DmaDone`, `Error`; bits
+    /// outside those three masks are ignored.
+    pub fn from_mask(mask: u32) -> Vec<IrqSource> {
+        let decode_order = [Self::InferenceDone, Self::DmaDone, Self::Error];
+        decode_order
+            .iter()
+            .copied()
+            .filter(|source| mask & source.mask() != 0)
+            .collect()
+    }
+}
+
+type IrqCallback = fn(IrqSource); // plain fn pointer, called with the source that fired
+
+pub struct IrqHandler<M: Mmio> { // Owns a driver plus one optional callback per IRQ source.
+    driver: BitnetDriver<M>, // used by `service` to read and clear the IRQ status
+    callbacks: [Option<IrqCallback>; 3], // slots: 0 = InferenceDone, 1 = DmaDone, 2 = Error
+}
+
+impl<M: Mmio> IrqHandler<M> {
+    /// Creates a handler around `driver` with no callbacks registered.
+    pub fn new(driver: BitnetDriver<M>) -> Self {
+        let callbacks = [None; 3];
+        Self { driver, callbacks }
+    }
+
+    /// Index of `source` in the callback table.
+    fn slot(source: IrqSource) -> usize {
+        match source {
+            IrqSource::InferenceDone => 0,
+            IrqSource::DmaDone => 1,
+            IrqSource::Error => 2,
+        }
+    }
+
+    /// Installs `cb` for `source`, replacing any callback registered earlier.
+    pub fn register(&mut self, source: IrqSource, cb: IrqCallback) {
+        self.callbacks[Self::slot(source)] = Some(cb);
+    }
+
+    /// Dispatches callbacks for all pending IRQs, clears them, and returns the status read.
+    pub fn service(&mut self) -> u32 {
+        let pending = self.driver.read_irq_status();
+        if pending != 0 {
+            for source in IrqSource::from_mask(pending) {
+                if let Some(callback) = self.callbacks[Self::slot(source)] {
+                    callback(source);
+                }
+            }
+            self.driver.clear_irq(pending);
+        }
+        pending
+    }
+
+    /// Borrows the wrapped driver.
+    pub fn driver(&self) -> &BitnetDriver<M> {
+        &self.driver
+    }
+
+    /// Mutably borrows the wrapped driver.
+    pub fn driver_mut(&mut self) -> &mut BitnetDriver<M> {
+        &mut self.driver
+    }
+}
+
+pub struct IrqDrivenDriver<M: Mmio> { // Waits for completion by servicing IRQs via an `IrqHandler`.
+    handler: IrqHandler<M>, // owns the driver and the registered callbacks
+}
+
+impl<M: Mmio> IrqDrivenDriver<M> {
+    /// Wraps `driver` in a fresh [`IrqHandler`] with no callbacks registered.
+    pub fn new(driver: BitnetDriver<M>) -> Self {
+        Self { handler: IrqHandler::new(driver) }
+    }
+
+    /// Registers `cb` for `source` on the underlying handler.
+    pub fn register(&mut self, source: IrqSource, cb: IrqCallback) {
+        self.handler.register(source, cb);
+    }
+
+    /// Borrows the underlying IRQ handler.
+    pub fn handler(&self) -> &IrqHandler<M> {
+        &self.handler
+    }
+
+    /// Mutably borrows the underlying IRQ handler.
+    pub fn handler_mut(&mut self) -> &mut IrqHandler<M> {
+        &mut self.handler
+    }
+
+    /// Shorthand for [`wait_irq_mask`](Self::wait_irq_mask) with the inference-done mask.
+    pub fn wait_done_irq(&mut self, max_service_rounds: u32) -> Result<(), DriverError> {
+        self.wait_irq_mask(csr_map::IRQ_INFERENCE_DONE_MASK, max_service_rounds)
+    }
+
+    /// Services IRQs until a `mask` bit is seen; `EngineError` on the error bit, else `Timeout`.
+    pub fn wait_irq_mask(&mut self, mask: u32, max_service_rounds: u32) -> Result<(), DriverError> {
+        let poll_done = mask == csr_map::IRQ_INFERENCE_DONE_MASK; // done-flag fallback
+        for _ in 0..max_service_rounds {
+            match self.handler.service() {
+                s if s & csr_map::IRQ_ERROR_MASK != 0 => return Err(DriverError::EngineError),
+                s if s & mask != 0 => return Ok(()),
+                _ if poll_done && self.handler.driver_mut().is_done() => return Ok(()),
+                _ => {}
+            }
+        }
+        Err(DriverError::Timeout)
+    }
+
+    /// Consumes the wrapper and returns the handler it owns.
+    pub fn into_handler(self) -> IrqHandler<M> {
+        self.handler
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::host::mmio::MockMmio;
+    // Callbacks are plain `fn` pointers and cannot capture state, so hits are logged here.
+    thread_local! {
+        static FIRED: std::cell::RefCell<Vec<IrqSource>> = std::cell::RefCell::new(Vec::new());
+    }
+
+    fn record_cb(src: IrqSource) {
+        FIRED.with(|log| log.borrow_mut().push(src));
+    }
+
+    fn fired() -> Vec<IrqSource> {
+        FIRED.with(|log| log.borrow().clone())
+    }
+
+    fn clear_fired() {
+        FIRED.with(|log| log.borrow_mut().clear());
+    }
+
+    fn new_handler() -> IrqHandler<MockMmio> {
+        IrqHandler::new(BitnetDriver::new(MockMmio::with_csrs_zeroed()))
+    }
+
+    fn new_irq_driver() -> IrqDrivenDriver<MockMmio> {
+        IrqDrivenDriver::new(BitnetDriver::new(MockMmio::with_csrs_zeroed()))
+    }
+
+    #[test]
+    fn irq_source_mask_roundtrip() {
+        assert_eq!(IrqSource::InferenceDone.mask(), csr_map::IRQ_INFERENCE_DONE_MASK);
+        assert_eq!(IrqSource::DmaDone.mask(), csr_map::IRQ_DMA_DONE_MASK);
+        assert_eq!(IrqSource::Error.mask(), csr_map::IRQ_ERROR_MASK);
+    }
+
+    #[test]
+    fn from_mask_empty() {
+        assert!(IrqSource::from_mask(0).is_empty());
+    }
+
+    #[test]
+    fn from_mask_all_three() {
+        let decoded = IrqSource::from_mask(csr_map::IRQ_ALL_MASK);
+        assert_eq!(decoded.len(), 3);
+    }
+
+    #[test]
+    fn handler_service_no_irqs_returns_zero() {
+        let mut handler = new_handler();
+        assert_eq!(handler.service(), 0);
+    }
+
+    #[test]
+    fn handler_service_dispatches_callback() {
+        clear_fired();
+        let mut handler = new_handler();
+        handler.register(IrqSource::InferenceDone, record_cb);
+        handler.driver_mut().mmio_mut().latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK);
+        let status = handler.service();
+        assert_eq!(status, csr_map::IRQ_INFERENCE_DONE_MASK);
+        assert_eq!(fired(), vec![IrqSource::InferenceDone]);
+    }
+
+    #[test]
+    fn handler_service_calls_clear_irq() {
+        let mut handler = new_handler();
+        handler.driver_mut().mmio_mut().latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK);
+        handler.service();
+        let records = handler.driver().mmio().log();
+        let final_record = records.last().unwrap();
+        assert_eq!(final_record.op, super::super::mmio::MmioOp::Write);
+        assert_eq!(final_record.addr, csr_map::IRQ_STAT);
+        assert_eq!(final_record.value, csr_map::IRQ_INFERENCE_DONE_MASK);
+    }
+
+    #[test]
+    fn handler_multiple_sources() {
+        clear_fired();
+        let mut handler = new_handler();
+        handler.register(IrqSource::InferenceDone, record_cb);
+        handler.register(IrqSource::DmaDone, record_cb);
+        handler.driver_mut().mmio_mut().latch_irq(
+            csr_map::IRQ_INFERENCE_DONE_MASK | csr_map::IRQ_DMA_DONE_MASK,
+        );
+        let status = handler.service();
+        assert_eq!(status, csr_map::IRQ_INFERENCE_DONE_MASK | csr_map::IRQ_DMA_DONE_MASK);
+        let seen = fired();
+        assert!(seen.contains(&IrqSource::InferenceDone));
+        assert!(seen.contains(&IrqSource::DmaDone));
+    }
+
+    #[test]
+    fn irq_driver_wait_done_succeeds_on_inference_done() {
+        let mut drv = new_irq_driver();
+        drv.handler_mut().driver_mut().mmio_mut().latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK);
+        assert_eq!(drv.wait_done_irq(4), Ok(()));
+    }
+
+    #[test]
+    fn irq_driver_wait_done_returns_error_on_irq_error() {
+        let mut drv = new_irq_driver();
+        drv.handler_mut().driver_mut().mmio_mut().latch_irq(csr_map::IRQ_ERROR_MASK);
+        assert_eq!(drv.wait_done_irq(4), Err(DriverError::EngineError));
+    }
+
+    #[test]
+    fn irq_driver_wait_done_times_out() {
+        let mut drv = new_irq_driver();
+        assert_eq!(drv.wait_done_irq(2), Err(DriverError::Timeout));
+    }
+
+    #[test]
+    fn irq_driver_wait_done_falls_back_to_done_bit() {
+        let mut drv = new_irq_driver();
+        drv.handler_mut().driver_mut().mmio_mut().set_done(true);
+        assert_eq!(drv.wait_done_irq(4), Ok(()));
+    }
+}
diff --git a/bootstrap/src/host/json_output.rs b/bootstrap/src/host/json_output.rs
new file mode 100644
index 00000000..65ce1935
--- /dev/null
+++ b/bootstrap/src/host/json_output.rs
@@ -0,0 +1,58 @@
+use serde::Serialize;
+
+#[derive(Debug, Clone, Serialize)]
+pub struct HostSmokeJson { // JSON payload printed by the host smoke run when `json` is set.
+    pub ok: bool, // the smoke run sets this to `true` once the flow has completed
+    pub writes: usize, // MMIO write count taken from the mock
+    pub reads: usize, // MMIO read count taken from the mock
+    pub layers: u32, // `num_layers` from the CSR snapshot
+    pub neurons: u32, // `neurons` from the CSR snapshot
+    pub chunks: u32, // `chunks` from the CSR snapshot
+    pub threshold: u32, // `threshold` from the CSR snapshot
+    pub weight_addr: String, // 64-bit weight address pre-formatted as `0x{:016x}`
+    pub irq_stat: String, // IRQ status pre-formatted as `0x{:08x}`
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct HostPollVsIrqJson { // JSON payload for the poll-vs-IRQ comparison; producer not shown here.
+    pub ok: bool, // NOTE(review): presumably overall success — confirm at the call site
+    pub poll_writes: usize, // presumably MMIO writes of the poll-mode run — TODO confirm
+    pub poll_reads: usize, // presumably MMIO reads of the poll-mode run — TODO confirm
+    pub irq_writes: usize, // presumably MMIO writes of the IRQ-driven run — TODO confirm
+    pub irq_reads: usize, // presumably MMIO reads of the IRQ-driven run — TODO confirm
+    pub writes_match: bool, // presumably `poll_writes == irq_writes` — TODO confirm
+    pub irq_stat_poll: String, // IRQ status after the poll run, as text (format set by caller)
+    pub irq_stat_irq: String, // IRQ status after the IRQ run, as text (format set by caller)
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct HostInferenceJson { // JSON payload for the inference command; producer not shown here.
+    pub ok: bool, // NOTE(review): presumably overall success — confirm at the call site
+    pub total_layers: u32, // same name as `InferenceReport::total_layers` — presumably copied 1:1
+    pub layers_completed: u32, // same name as `InferenceReport::layers_completed`
+    pub error_layer: Option<u32>, // same name as `InferenceReport::error_layer`
+    pub total_writes: usize, // same name as `InferenceReport::total_writes`
+    pub total_reads: usize, // same name as `InferenceReport::total_reads`
+}
+
+#[derive(Debug, Clone, Serialize)]
+pub struct HostPerfJson { // JSON payload for the performance estimate; producer not shown here.
+    pub ok: bool, // NOTE(review): presumably overall success — confirm at the call site
+    pub layers: u32, // presumably the configured layer count — TODO confirm
+    pub neurons: u32, // presumably neurons per layer — TODO confirm
+    pub chunks: u32, // presumably chunks per neuron — TODO confirm
+    pub total_cycles: u32, // presumably `PerformanceEstimate::total_inference_cycles` — TODO confirm
+    pub total_weight_words: u32, // same name as `PerformanceEstimate::total_weight_words`
+    pub total_weight_bytes: u32, // same name as `PerformanceEstimate::total_weight_bytes`
+    pub bram_utilization_pct: f64, // same name as `PerformanceEstimate::bram_utilization_pct`
+    pub total_dma_beats: u32, // same name as `PerformanceEstimate::total_dma_beats`
+    pub throughput_inf_per_sec: f64, // presumably from `EngineConfig::throughput_inf_per_sec`
+    pub clock_mhz: f64, // presumably the clock used for the throughput figure — TODO confirm
+}
+
+pub fn print_json<T: Serialize>(value: &T) -> anyhow::Result<()> {
+    // Serialize to compact JSON; a serde failure is re-wrapped as an anyhow error.
+    let wrap = |e: serde_json::Error| anyhow::anyhow!("JSON serialization failed: {}", e);
+    println!("{}", serde_json::to_string(value).map_err(wrap)?);
+    Ok(())
+}
diff --git a/bootstrap/src/host/mod.rs b/bootstrap/src/host/mod.rs
index ad34d0e2..b63e5616 100644
--- a/bootstrap/src/host/mod.rs
+++ b/bootstrap/src/host/mod.rs
@@ -15,7 +15,15 @@
 
 pub mod csr_map;
 pub mod driver;
+pub mod engine;
+pub mod irq;
+pub mod json_output;
 pub mod mmio;
+pub mod perf;
 
 pub use driver::{BitnetDriver, CsrSnapshot, DriverError};
+pub use engine::{InferenceEngine, InferenceReport};
+pub use irq::{IrqDrivenDriver, IrqHandler, IrqSource};
+pub use json_output::{HostSmokeJson, HostPollVsIrqJson, HostInferenceJson, HostPerfJson};
 pub use mmio::{MmioOp, MmioRecord, MockMmio};
+pub use perf::{EngineConfig, PerformanceEstimate};
diff --git a/bootstrap/src/host/perf.rs b/bootstrap/src/host/perf.rs
new file mode 100644
index 00000000..fbd16108
--- /dev/null
+++ b/bootstrap/src/host/perf.rs
@@ -0,0 +1,252 @@
+pub const TRITS_PER_WORD: u32 = 27; // trits per weight word
+pub const BITS_PER_TRIT: u32 = 2; // bits per trit
+pub const DATA_WIDTH: u32 = TRITS_PER_WORD * BITS_PER_TRIT; // 27 * 2 = 54 bits per word
+pub const BRAM_DEPTH: u32 = 4096; // denominator of `bram_utilization_pct` — presumably words
+pub const DDR_BEAT_BITS: u32 = 64; // bits per DDR beat
+pub const DDR_BEAT_BYTES: u32 = DDR_BEAT_BITS / 8; // 64 / 8 = 8 bytes per beat
+pub const WORDS_PER_DDR_BEAT: u32 = DDR_BEAT_BITS / DATA_WIDTH; // integer division: 64 / 54 = 1
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct EngineConfig { // Geometry the estimates are derived from; `new` rejects zero fields.
+    pub num_layers: u32, // layer count; multiplies every per-layer figure
+    pub neurons: u32, // multiplied by `chunks` to get weight words per layer
+    pub chunks: u32, // multiplied by `neurons` to get weight words per layer
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub struct LayerEstimate { // Per-layer figures produced by `EngineConfig::estimate`.
+    pub layer_index: u32, // zero-based position of the layer
+    pub weight_words: u32, // `neurons * chunks`
+    pub weight_bytes: u32, // `weight_words * (DATA_WIDTH / 8)`
+    pub dma_prefetch_beats: u32, // `dma_beats_per_layer()` for the prefetch stage
+    pub compute_cycles: u32, // `compute_cycles_per_layer()`
+    pub dma_drain_beats: u32, // same value as `dma_prefetch_beats`
+    pub total_cycles: u32, // prefetch beats + compute cycles + drain beats
+}
+
+#[derive(Debug, Clone, PartialEq)]
+pub struct PerformanceEstimate { // Whole-run figures produced by `EngineConfig::estimate`.
+    pub config: EngineConfig, // copy of the configuration that was estimated
+    pub total_weight_words: u32, // weight words summed over all layers
+    pub total_weight_bytes: u32, // `total_weight_words * (DATA_WIDTH / 8)`
+    pub bram_utilization_pct: f64, // one layer's weight words as a percentage of `BRAM_DEPTH`
+    pub total_dma_beats: u32, // prefetch + drain beats summed over all layers
+    pub total_inference_cycles: u32, // `cycles_per_layer() * num_layers`
+    pub layers: Vec<LayerEstimate>, // one entry per layer, in index order
+}
+
+impl EngineConfig {
+    /// Builds a configuration, or returns `None` when any dimension is zero.
+    pub fn new(num_layers: u32, neurons: u32, chunks: u32) -> Option<Self> {
+        let nonzero = num_layers != 0 && neurons != 0 && chunks != 0;
+        nonzero.then(|| Self {
+            num_layers,
+            neurons,
+            chunks,
+        })
+    }
+
+    /// Weight words in one layer (`neurons * chunks`), saturating at `u32::MAX`.
+    pub fn weight_words_per_layer(&self) -> u32 {
+        self.neurons.saturating_mul(self.chunks)
+    }
+
+    /// Weight words across all layers, saturating at `u32::MAX`.
+    pub fn total_weight_words(&self) -> u32 {
+        self.weight_words_per_layer().saturating_mul(self.num_layers)
+    }
+
+    /// Bytes per layer at `DATA_WIDTH / 8` bytes per word (integer division: 6), saturating.
+    pub fn weight_bytes_per_layer(&self) -> u32 {
+        self.weight_words_per_layer().saturating_mul(DATA_WIDTH / 8)
+    }
+
+    /// Bytes across all layers, using the same per-word size; saturating at `u32::MAX`.
+    pub fn total_weight_bytes(&self) -> u32 {
+        self.total_weight_words().saturating_mul(DATA_WIDTH / 8)
+    }
+
+    /// One layer's weight words as a percentage of `BRAM_DEPTH` (may exceed 100).
+    pub fn bram_utilization_pct(&self) -> f64 {
+        f64::from(self.weight_words_per_layer()) / f64::from(BRAM_DEPTH) * 100.0
+    }
+
+    /// DDR beats needed to move one layer: `ceil(words / WORDS_PER_DDR_BEAT)`.
+    pub fn dma_beats_per_layer(&self) -> u32 {
+        let words = self.weight_words_per_layer();
+        // Rounds up without the `words + divisor - 1` form, which can overflow.
+        words / WORDS_PER_DDR_BEAT + u32::from(words % WORDS_PER_DDR_BEAT != 0)
+    }
+
+    /// Compute cycles for one layer (`neurons * chunks`), saturating at `u32::MAX`.
+    pub fn compute_cycles_per_layer(&self) -> u32 {
+        self.neurons.saturating_mul(self.chunks)
+    }
+
+    /// Cycles for one layer: prefetch beats + compute cycles + drain beats (saturating).
+    pub fn cycles_per_layer(&self) -> u32 {
+        let transfer = self.dma_beats_per_layer().saturating_mul(2);
+        transfer.saturating_add(self.compute_cycles_per_layer())
+    }
+
+    /// Cycles for a full inference over all layers, saturating at `u32::MAX`.
+    pub fn total_inference_cycles(&self) -> u32 {
+        self.cycles_per_layer().saturating_mul(self.num_layers)
+    }
+
+    /// Inferences per second at `clock_mhz`; `0.0` for a non-positive clock or zero cycles.
+    pub fn throughput_inf_per_sec(&self, clock_mhz: f64) -> f64 {
+        let cycles_per_inf = f64::from(self.total_inference_cycles());
+        if clock_mhz <= 0.0 || cycles_per_inf == 0.0 {
+            return 0.0;
+        }
+        clock_mhz * 1e6 / cycles_per_inf
+    }
+
+    /// Expands the configuration into per-layer and whole-run estimates.
+    pub fn estimate(&self) -> PerformanceEstimate {
+        let dma_beats = self.dma_beats_per_layer();
+        let template = LayerEstimate {
+            layer_index: 0,
+            weight_words: self.weight_words_per_layer(),
+            weight_bytes: self.weight_bytes_per_layer(),
+            dma_prefetch_beats: dma_beats,
+            compute_cycles: self.compute_cycles_per_layer(),
+            dma_drain_beats: dma_beats,
+            total_cycles: self.cycles_per_layer(),
+        };
+        PerformanceEstimate {
+            config: *self,
+            total_weight_words: self.total_weight_words(),
+            total_weight_bytes: self.total_weight_bytes(),
+            bram_utilization_pct: self.bram_utilization_pct(),
+            total_dma_beats: dma_beats.saturating_mul(2).saturating_mul(self.num_layers),
+            total_inference_cycles: self.total_inference_cycles(),
+            layers: (0..self.num_layers)
+                .map(|layer_index| LayerEstimate { layer_index, ..template })
+                .collect(),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    /// Reference configuration shared by most tests: 2 layers, 16 neurons, 4 chunks.
+    fn sample() -> EngineConfig {
+        EngineConfig::new(2, 16, 4).unwrap()
+    }
+
+    #[test]
+    fn rejects_zero_layers() {
+        assert!(EngineConfig::new(0, 1, 1).is_none());
+    }
+
+    #[test]
+    fn rejects_zero_neurons() {
+        assert!(EngineConfig::new(1, 0, 1).is_none());
+    }
+
+    #[test]
+    fn rejects_zero_chunks() {
+        assert!(EngineConfig::new(1, 1, 0).is_none());
+    }
+
+    #[test]
+    fn weight_words_per_layer() {
+        assert_eq!(sample().weight_words_per_layer(), 64);
+    }
+
+    #[test]
+    fn total_weight_words() {
+        assert_eq!(sample().total_weight_words(), 128);
+    }
+
+    #[test]
+    fn weight_bytes_per_layer() {
+        assert_eq!(sample().weight_bytes_per_layer(), 64 * (DATA_WIDTH / 8));
+    }
+
+    #[test]
+    fn bram_utilization_under_100() {
+        assert!(sample().bram_utilization_pct() < 100.0);
+    }
+
+    #[test]
+    fn bram_utilization_exact() {
+        let config = sample();
+        let expected = (config.weight_words_per_layer() as f64 / BRAM_DEPTH as f64) * 100.0;
+        assert!((config.bram_utilization_pct() - expected).abs() < 0.001);
+    }
+
+    #[test]
+    fn compute_cycles_per_layer() {
+        assert_eq!(sample().compute_cycles_per_layer(), 64);
+    }
+
+    #[test]
+    fn cycles_per_layer_is_three_stages() {
+        let config = sample();
+        let beats = config.dma_beats_per_layer();
+        let compute = config.compute_cycles_per_layer();
+        assert_eq!(config.cycles_per_layer(), beats + compute + beats);
+    }
+
+    #[test]
+    fn total_inference_cycles_scales_linearly() {
+        let config = sample();
+        assert_eq!(
+            config.total_inference_cycles(),
+            config.cycles_per_layer() * config.num_layers
+        );
+    }
+
+    #[test]
+    fn throughput_at_66_mhz() {
+        let t = sample().throughput_inf_per_sec(66.0);
+        assert!(t > 0.0, "throughput should be positive: {t}");
+    }
+
+    #[test]
+    fn throughput_zero_clock_is_zero() {
+        assert_eq!(sample().throughput_inf_per_sec(0.0), 0.0);
+    }
+
+    #[test]
+    fn estimate_has_correct_layer_count() {
+        let estimate = sample().estimate();
+        assert_eq!(estimate.layers.len(), 2);
+    }
+
+    #[test]
+    fn estimate_layer_indices_sequential() {
+        let estimate = sample().estimate();
+        for (index, layer) in estimate.layers.iter().enumerate() {
+            assert_eq!(layer.layer_index, index as u32);
+        }
+    }
+
+    #[test]
+    fn estimate_total_weight_words_matches() {
+        let estimate = sample().estimate();
+        assert_eq!(estimate.total_weight_words, sample().total_weight_words());
+    }
+
+    #[test]
+    fn large_config_does_not_overflow() {
+        let config = EngineConfig::new(100, 4096, 64).unwrap();
+        let estimate = config.estimate();
+        assert!(estimate.total_weight_words > 0);
+        assert!(estimate.total_inference_cycles > 0);
+    }
+
+    #[test]
+    fn data_width_is_54() {
+        assert_eq!(DATA_WIDTH, 54);
+    }
+
+    #[test]
+    fn bram_depth_is_4096() {
+        assert_eq!(BRAM_DEPTH, 4096);
+    }
+}
diff --git a/bootstrap/src/main.rs b/bootstrap/src/main.rs
index 5249a998..9eea44b5 100644
--- a/bootstrap/src/main.rs
+++ b/bootstrap/src/main.rs
@@ -347,6 +347,114 @@ enum Commands {
         /// Maximum poll iterations before timeout (default: 16).
         #[arg(long, default_value_t = 16)]
         max_polls: u32,
+
+        /// Emit structured JSON instead of human-readable output.
+        #[arg(long)]
+        json: bool,
+    },
+
+    /// Run a side-by-side poll-vs-IRQ comparison on MockMmio (Wave 40, R-HS-2).
+    ///
+    /// Executes the same configure -> start -> complete flow twice: once via
+    /// the poll-mode `wait_done` path (W39) and once via the interrupt-driven
+    /// `IrqDrivenDriver::wait_done_irq` path (W40). Prints a single comparison
+    /// line: `OK poll=Nw/Mr irq=Nw/Mr writes_match=<bool> irq_stat_poll=0x..
+    /// irq_stat_irq=0x..`.
+    #[command(name = "host-poll-vs-irq")]
+    HostPollVsIrq {
+        /// Number of layers to program (default: 2).
+        #[arg(long, default_value_t = 2)]
+        num_layers: u32,
+
+        /// Neurons per layer (default: 16).
+        #[arg(long, default_value_t = 16)]
+        neurons: u32,
+
+        /// Chunks per neuron (default: 4).
+        #[arg(long, default_value_t = 4)]
+        chunks: u32,
+
+        /// Signed threshold value (default: 1).
+        #[arg(long, default_value_t = 1)]
+        threshold: u32,
+
+        /// 64-bit weight base address as decimal (default: 0).
+        #[arg(long, default_value_t = 0)]
+        weight_addr: u64,
+
+        /// Maximum poll iterations before timeout (default: 16).
+        #[arg(long, default_value_t = 16)]
+        max_polls: u32,
+
+        /// Emit structured JSON instead of human-readable output.
+        #[arg(long)]
+        json: bool,
+    },
+
+    /// Run a multi-layer DMA-driven BitNet inference flow on MockMmio
+    /// (Wave 41, R-HS-3).
+    ///
+    /// Exercises the full configure -> DMA prefetch -> inference ->
+    /// DMA drain cycle per layer, using IrqDrivenDriver from W40.
+    /// Prints `OK layers=N completed=M writes=W reads=R`.
+    #[command(name = "host-inference")]
+    HostInference {
+        /// Number of layers to program (default: 2).
+        #[arg(long, default_value_t = 2)]
+        num_layers: u32,
+
+        /// Neurons per layer (default: 16).
+        #[arg(long, default_value_t = 16)]
+        neurons: u32,
+
+        /// Chunks per neuron (default: 4).
+        #[arg(long, default_value_t = 4)]
+        chunks: u32,
+
+        /// Signed threshold value (default: 1).
+        #[arg(long, default_value_t = 1)]
+        threshold: u32,
+
+        /// 64-bit weight base address as decimal (default: 0).
+        #[arg(long, default_value_t = 0)]
+        weight_addr: u64,
+
+        /// Maximum IRQ-service rounds per stage (default: 16).
+        #[arg(long, default_value_t = 16)]
+        max_rounds: u32,
+
+        /// Emit structured JSON instead of human-readable output.
+        #[arg(long)]
+        json: bool,
+    },
+
+    /// Estimate BitNet inference performance from engine configuration
+    /// (Wave 42, R-HS-4).
+    ///
+    /// Prints cycle counts, DMA beats, BRAM utilization, and throughput
+    /// estimates. No hardware required — pure arithmetic model.
+    #[command(name = "host-perf")]
+    HostPerf {
+        /// Number of layers (default: 2).
+        #[arg(long, default_value_t = 2)]
+        num_layers: u32,
+
+        /// Neurons per layer (default: 16).
+        #[arg(long, default_value_t = 16)]
+        neurons: u32,
+
+        /// Chunks per neuron (default: 4).
+        #[arg(long, default_value_t = 4)]
+        chunks: u32,
+
+        /// Clock frequency in MHz for throughput estimate (default: 66.0,
+        /// matching STARTUPE2.CFGMCLK on Wukong V1).
+        #[arg(long, default_value_t = 66.0)]
+        clock_mhz: f64,
+
+        /// Emit structured JSON instead of human-readable output.
+        #[arg(long)]
+        json: bool,
     },
 
     /// Emit a complete BitNet HLS bundle (Wave 38, R-SI-1).
@@ -3019,6 +3127,7 @@ fn run_host_smoke(
     threshold: u32,
     weight_addr: u64,
     max_polls: u32,
+    json: bool,
 ) -> anyhow::Result<()> {
     use host::{BitnetDriver, MockMmio};
     let mut driver = BitnetDriver::new(MockMmio::with_csrs_zeroed());
@@ -3027,8 +3136,6 @@ fn run_host_smoke(
         .map_err(|e| anyhow::anyhow!("configure failed: {:?}", e))?;
     driver.enable_irqs(host::csr_map::IRQ_ALL_MASK);
     driver.start();
-    // Simulate hardware completing the inference immediately for the smoke
-    // test: latch `done` and the inference_done IRQ before polling.
     driver.mmio_mut().set_done(true);
     driver.mmio_mut().latch_irq(host::csr_map::IRQ_INFERENCE_DONE_MASK);
     driver
@@ -3037,17 +3144,174 @@ fn run_host_smoke(
     let snap = driver.dump();
     let w = driver.mmio().write_count();
     let r = driver.mmio().read_count();
-    println!(
-        "OK {}w/{}r layers={} neurons={} chunks={} threshold={} weight_addr=0x{:016x} irq_stat=0x{:08x}",
-        w,
-        r,
-        snap.num_layers,
-        snap.neurons,
-        snap.chunks,
-        snap.threshold,
-        snap.weight_addr_64(),
-        snap.irq_stat
-    );
+    if json {
+        host::json_output::print_json(&host::json_output::HostSmokeJson {
+            ok: true,
+            writes: w,
+            reads: r,
+            layers: snap.num_layers,
+            neurons: snap.neurons,
+            chunks: snap.chunks,
+            threshold: snap.threshold,
+            weight_addr: format!("0x{:016x}", snap.weight_addr_64()),
+            irq_stat: format!("0x{:08x}", snap.irq_stat),
+        })?;
+    } else {
+        println!(
+            "OK {}w/{}r layers={} neurons={} chunks={} threshold={} weight_addr=0x{:016x} irq_stat=0x{:08x}",
+            w, r, snap.num_layers, snap.neurons, snap.chunks, snap.threshold, snap.weight_addr_64(), snap.irq_stat
+        );
+    }
+    Ok(())
+}
+/// Runs one mock inference via polling (`wait_done`) and one via IRQ (`wait_done_irq`), then reports each path's MMIO write/read counts and final `irq_stat`.
+fn run_host_poll_vs_irq(
+    num_layers: u32,
+    neurons: u32,
+    chunks: u32,
+    threshold: u32,
+    weight_addr: u64,
+    max_polls: u32, // budget handed to both `wait_done` and `wait_done_irq`
+    json: bool, // true: emit `HostPollVsIrqJson`; false: one human-readable line
+) -> anyhow::Result<()> {
+    use host::{BitnetDriver, IrqDrivenDriver, MockMmio};
+    let poll_writes;
+    let poll_reads;
+    let irq_stat_poll;
+    { // polling path: its own zeroed mock device, scoped so nothing leaks into the IRQ run
+        let mut driver = BitnetDriver::new(MockMmio::with_csrs_zeroed());
+        driver
+            .configure(num_layers, neurons, chunks, threshold, weight_addr)
+            .map_err(|e| anyhow::anyhow!("poll configure failed: {:?}", e))?;
+        driver.enable_irqs(host::csr_map::IRQ_ALL_MASK);
+        driver.start();
+        driver.mmio_mut().set_done(true); // mock only: pretend the hardware finished immediately
+        driver.mmio_mut().latch_irq(host::csr_map::IRQ_INFERENCE_DONE_MASK); // mock only: raise inference-done before waiting
+        driver
+            .wait_done(max_polls)
+            .map_err(|e| anyhow::anyhow!("poll wait_done failed: {:?}", e))?;
+        poll_writes = driver.mmio().write_count(); // counters are sampled only after the wait has returned
+        poll_reads = driver.mmio().read_count();
+        irq_stat_poll = driver.dump().irq_stat;
+    }
+    let irq_writes;
+    let irq_reads;
+    let irq_stat_irq;
+    { // IRQ path: same call sequence on a second zeroed mock, reached through the IRQ wrapper
+        let mut idd = IrqDrivenDriver::new(BitnetDriver::new(MockMmio::with_csrs_zeroed()));
+        idd.handler_mut()
+            .driver_mut()
+            .configure(num_layers, neurons, chunks, threshold, weight_addr)
+            .map_err(|e| anyhow::anyhow!("irq configure failed: {:?}", e))?;
+        idd.handler_mut().driver_mut().enable_irqs(host::csr_map::IRQ_ALL_MASK);
+        idd.handler_mut().driver_mut().start();
+        idd.handler_mut().driver_mut().mmio_mut().set_done(true); // mock only, mirrors the polling path
+        idd.handler_mut()
+            .driver_mut()
+            .mmio_mut()
+            .latch_irq(host::csr_map::IRQ_INFERENCE_DONE_MASK);
+        idd.wait_done_irq(max_polls)
+            .map_err(|e| anyhow::anyhow!("irq wait_done_irq failed: {:?}", e))?;
+        irq_writes = idd.handler().driver().mmio().write_count(); // sampled after the IRQ wait has returned
+        irq_reads = idd.handler().driver().mmio().read_count();
+        irq_stat_irq = idd.handler_mut().driver_mut().dump().irq_stat;
+    }
+    let writes_match = poll_writes == irq_writes; // only write counts are compared; read counts are reported but not checked
+    if json {
+        host::json_output::print_json(&host::json_output::HostPollVsIrqJson {
+            ok: true, // always true here: every failure above has already returned through `?`
+            poll_writes,
+            poll_reads,
+            irq_writes,
+            irq_reads,
+            writes_match,
+            irq_stat_poll: format!("0x{:08x}", irq_stat_poll),
+            irq_stat_irq: format!("0x{:08x}", irq_stat_irq),
+        })?;
+    } else {
+        println!(
+            "OK poll={}w/{}r irq={}w/{}r writes_match={} irq_stat_poll=0x{:08x} irq_stat_irq=0x{:08x}",
+            poll_writes, poll_reads, irq_writes, irq_reads, writes_match, irq_stat_poll, irq_stat_irq
+        );
+    }
+    Ok(())
+}
+/// Drives a mock multi-layer inference through `InferenceEngine` and prints the resulting report.
+fn run_host_inference(
+    num_layers: u32,
+    neurons: u32,
+    chunks: u32,
+    threshold: u32,
+    weight_addr: u64,
+    max_rounds: u32,
+    json: bool,
+) -> anyhow::Result<()> {
+    use host::{BitnetDriver, InferenceEngine, MockMmio};
+    let mmio = MockMmio::with_csrs_zeroed();
+    let mut engine = InferenceEngine::new(BitnetDriver::new(mmio));
+    if let Err(e) = engine.configure(num_layers, neurons, chunks, threshold, weight_addr) {
+        return Err(anyhow::anyhow!("configure failed: {:?}", e));
+    }
+    let report = engine.run(max_rounds).map_err(|e| anyhow::anyhow!("inference failed: {:?}", e))?;
+    if !json {
+        // Default output: a single human-readable line.
+        println!(
+            "OK layers={} completed={} writes={} reads={}",
+            report.total_layers, report.layers_completed, report.total_writes, report.total_reads
+        );
+        return Ok(());
+    }
+    host::json_output::print_json(&host::json_output::HostInferenceJson {
+        ok: true,
+        total_layers: report.total_layers,
+        layers_completed: report.layers_completed,
+        error_layer: report.error_layer,
+        total_writes: report.total_writes,
+        total_reads: report.total_reads,
+    })?;
+    Ok(())
+}
+/// Validates the configuration, then prints the analytical performance estimate (cycles, weights, BRAM, DMA, throughput).
+fn run_host_perf(
+    num_layers: u32,
+    neurons: u32,
+    chunks: u32,
+    clock_mhz: f64,
+    json: bool,
+) -> anyhow::Result<()> {
+    use host::perf::EngineConfig;
+    // Reject NaN, infinite, zero and negative clocks: throughput is only meaningful for a real, positive frequency.
+    anyhow::ensure!(clock_mhz.is_finite() && clock_mhz > 0.0, "invalid config: clock-mhz must be finite and > 0 (got {})", clock_mhz);
+    let cfg = EngineConfig::new(num_layers, neurons, chunks)
+        .ok_or_else(|| anyhow::anyhow!("invalid config: layers, neurons, and chunks must be > 0"))?;
+    let est = cfg.estimate();
+    let throughput = cfg.throughput_inf_per_sec(clock_mhz);
+    if json {
+        host::json_output::print_json(&host::json_output::HostPerfJson {
+            ok: true,
+            layers: est.config.num_layers,
+            neurons: est.config.neurons,
+            chunks: est.config.chunks,
+            total_cycles: est.total_inference_cycles,
+            total_weight_words: est.total_weight_words,
+            total_weight_bytes: est.total_weight_bytes,
+            bram_utilization_pct: est.bram_utilization_pct,
+            total_dma_beats: est.total_dma_beats,
+            throughput_inf_per_sec: throughput,
+            clock_mhz,
+        })?;
+    } else {
+        println!(
+            "OK layers={} neurons={} chunks={} total_cycles={} total_weight_words={} bram_pct={:.1}% dma_beats={} throughput={:.1} inf/s @ {:.1} MHz",
+            est.config.num_layers, est.config.neurons, est.config.chunks,
+            est.total_inference_cycles,
+            est.total_weight_words,
+            est.bram_utilization_pct,
+            est.total_dma_beats,
+            throughput,
+            clock_mhz,
+        );
+    }
     Ok(())
 }
 
@@ -7856,8 +8120,17 @@ async fn main() -> anyhow::Result<()> {
         Commands::GenBitnetBundle { top_name, axi_addr_width, axi_data_width, output_dir } => {
             run_gen_bitnet_bundle(&top_name, axi_addr_width, axi_data_width, &output_dir)?
         }
-        Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls } => {
-            run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls)?
+        Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => {
+            run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)?
+        }
+        Commands::HostPollVsIrq { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => {
+            run_host_poll_vs_irq(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)?
+        }
+        Commands::HostInference { num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json } => {
+            run_host_inference(num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json)?
+        }
+        Commands::HostPerf { num_layers, neurons, chunks, clock_mhz, json } => {
+            run_host_perf(num_layers, neurons, chunks, clock_mhz, json)?
         }
         Commands::Asm { input, output, format } => run_asm(&input, output.as_deref(), &format)?,
         Commands::GenTestbench { input, period_ns, max_cycles, output } => {
@@ -8094,8 +8367,17 @@ fn main() -> anyhow::Result<()> {
         Commands::GenBitnetBundle { top_name, axi_addr_width, axi_data_width, output_dir } => {
             run_gen_bitnet_bundle(&top_name, axi_addr_width, axi_data_width, &output_dir)?
         }
-        Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls } => {
-            run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls)?
+        Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => {
+            run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)?
+        }
+        Commands::HostPollVsIrq { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => {
+            run_host_poll_vs_irq(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)?
+        }
+        Commands::HostInference { num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json } => {
+            run_host_inference(num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json)?
+        }
+        Commands::HostPerf { num_layers, neurons, chunks, clock_mhz, json } => {
+            run_host_perf(num_layers, neurons, chunks, clock_mhz, json)?
         }
         Commands::Asm { input, output, format } => run_asm(&input, output.as_deref(), &format)?,
         Commands::GenTestbench { input, period_ns, max_cycles, output } => {
diff --git a/bootstrap/tests/host_engine.rs b/bootstrap/tests/host_engine.rs
new file mode 100644
index 00000000..47547aa5
--- /dev/null
+++ b/bootstrap/tests/host_engine.rs
@@ -0,0 +1,162 @@
+use std::process::Command;
+
+fn bin() -> &'static str {
+    env!("CARGO_BIN_EXE_t27c") // resolved at compile time: Cargo sets this to the built `t27c` binary's path for integration tests
+}
+
+fn run(args: &[&str]) -> (bool, String, String) {
+    // Spawn the CLI, wait for it to exit, and capture both output streams.
+    let mut cmd = Command::new(bin());
+    cmd.args(args);
+    let captured = cmd.output().expect("failed to spawn t27c");
+    // Lossy decoding: invalid UTF-8 is replaced rather than aborting the test.
+    let decode = |bytes: &[u8]| String::from_utf8_lossy(bytes).into_owned();
+    let succeeded = captured.status.success();
+    let (stdout, stderr) = (decode(&captured.stdout), decode(&captured.stderr));
+    (succeeded, stdout, stderr)
+}
+
+#[test]
+fn inference_default_succeeds() {
+    let (ok, stdout, _) = run(&["host-inference"]);
+    assert!(ok, "default should succeed");
+    assert!(stdout.starts_with("OK "), "stdout = {stdout}");
+}
+
+#[test]
+fn inference_prints_layer_count() {
+    let (ok, stdout, _) = run(&["host-inference"]);
+    assert!(ok);
+    assert!(stdout.contains("layers=2"), "stdout = {stdout}");
+}
+
+#[test]
+fn inference_prints_completed_count() {
+    let (ok, stdout, _) = run(&["host-inference"]);
+    assert!(ok);
+    assert!(stdout.contains("completed=2"), "stdout = {stdout}");
+}
+
+#[test]
+fn inference_prints_writes() {
+    let (ok, stdout, _) = run(&["host-inference"]);
+    assert!(ok);
+    assert!(stdout.contains("writes="), "stdout = {stdout}");
+}
+
+#[test]
+fn inference_prints_reads() {
+    let (ok, stdout, _) = run(&["host-inference"]);
+    assert!(ok);
+    assert!(stdout.contains("reads="), "stdout = {stdout}");
+}
+
+#[test]
+fn inference_single_layer() {
+    let (ok, stdout, _) = run(&["host-inference", "--num-layers", "1"]);
+    assert!(ok);
+    assert!(stdout.contains("layers=1 completed=1"), "stdout = {stdout}");
+}
+
+#[test]
+fn inference_three_layers() {
+    let (ok, stdout, _) = run(&["host-inference", "--num-layers", "3"]);
+    assert!(ok);
+    assert!(stdout.contains("layers=3 completed=3"), "stdout = {stdout}");
+}
+
+#[test]
+fn inference_custom_neurons() {
+    let (ok, _, stderr) = run(&["host-inference", "--neurons", "128"]); // stdout is not inspected here
+    assert!(ok, "--neurons 128 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn inference_custom_chunks() {
+    let (ok, _, stderr) = run(&["host-inference", "--chunks", "32"]); // stdout is not inspected here
+    assert!(ok, "--chunks 32 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn inference_custom_threshold() {
+    let (ok, _, stderr) = run(&["host-inference", "--threshold", "42"]); // stdout is not inspected here
+    assert!(ok, "--threshold 42 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn inference_custom_weight_addr() {
+    let (ok, _, stderr) = run(&["host-inference", "--weight-addr", "1099511627776"]); // 2^40: an address wider than 32 bits
+    assert!(ok, "--weight-addr 2^40 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn inference_zero_layers_fails() {
+    let (ok, _, stderr) = run(&["host-inference", "--num-layers", "0"]);
+    assert!(!ok);
+    assert!(stderr.contains("configure") || stderr.contains("InvalidConfig"));
+}
+
+#[test]
+fn inference_zero_neurons_fails() {
+    let (ok, _, _stderr) = run(&["host-inference", "--neurons", "0"]);
+    assert!(!ok);
+}
+
+#[test]
+fn inference_zero_chunks_fails() {
+    let (ok, _, _stderr) = run(&["host-inference", "--chunks", "0"]);
+    assert!(!ok);
+}
+
+#[test]
+fn inference_deterministic() {
+    let (ok1, s1, _) = run(&["host-inference"]);
+    let (ok2, s2, _) = run(&["host-inference"]);
+    assert!(ok1 && ok2);
+    assert_eq!(s1, s2, "should be deterministic");
+}
+
+#[test]
+fn inference_single_line_output() {
+    let (ok, stdout, _) = run(&["host-inference"]);
+    assert!(ok);
+    let trimmed = stdout.trim_end_matches('\n');
+    assert!(!trimmed.contains('\n'), "expected single line, got {stdout}");
+}
+
+#[test]
+fn inference_no_stderr_on_success() {
+    let (ok, _, stderr) = run(&["host-inference"]);
+    assert!(ok);
+    assert!(stderr.trim().is_empty(), "stderr should be empty: {stderr}");
+}
+
+#[test]
+fn inference_combined_overrides() {
+    let (ok, stdout, _) = run(&[
+        "host-inference",
+        "--num-layers", "3",
+        "--neurons", "64",
+        "--chunks", "8",
+        "--threshold", "42",
+        "--weight-addr", "1024",
+    ]);
+    assert!(ok);
+    assert!(stdout.contains("layers=3 completed=3"));
+}
+
+#[test]
+fn inference_help_lists_all_flags() {
+    let (ok, stdout, _) = run(&["host-inference", "--help"]);
+    assert!(ok);
+    for flag in ["--num-layers", "--neurons", "--chunks", "--threshold", "--weight-addr", "--max-rounds"] {
+        assert!(stdout.contains(flag), "missing {flag} in help: {stdout}");
+    }
+}
+
+#[test]
+fn inference_help_mentions_wave_41() {
+    let (ok, stdout, _) = run(&["host-inference", "--help"]);
+    assert!(ok);
+    assert!(stdout.contains("Wave 41") || stdout.contains("R-HS-3"), "expected Wave 41 / R-HS-3: {stdout}");
+}
diff --git a/bootstrap/tests/host_irq.rs b/bootstrap/tests/host_irq.rs
new file mode 100644
index 00000000..7f2612c3
--- /dev/null
+++ b/bootstrap/tests/host_irq.rs
@@ -0,0 +1,169 @@
+use std::process::Command;
+
+fn bin() -> &'static str {
+    env!("CARGO_BIN_EXE_t27c") // resolved at compile time: Cargo sets this to the built `t27c` binary's path for integration tests
+}
+
+fn run(args: &[&str]) -> (bool, String, String) {
+    // Spawn the CLI, wait for it to exit, and capture both output streams.
+    let mut cmd = Command::new(bin());
+    cmd.args(args);
+    let captured = cmd.output().expect("failed to spawn t27c");
+    // Lossy decoding: invalid UTF-8 is replaced rather than aborting the test.
+    let decode = |bytes: &[u8]| String::from_utf8_lossy(bytes).into_owned();
+    let succeeded = captured.status.success();
+    let (stdout, stderr) = (decode(&captured.stdout), decode(&captured.stderr));
+    (succeeded, stdout, stderr)
+}
+
+#[test]
+fn poll_vs_irq_default_succeeds() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok, "default should succeed");
+    assert!(stdout.starts_with("OK "), "stdout = {stdout}");
+}
+
+#[test]
+fn poll_vs_irq_prints_poll_metrics() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok);
+    assert!(stdout.contains("poll="), "stdout = {stdout}");
+}
+
+#[test]
+fn poll_vs_irq_prints_irq_metrics() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok);
+    assert!(stdout.contains("irq="), "stdout = {stdout}");
+}
+
+#[test]
+fn poll_vs_irq_writes_match_field_present() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok);
+    assert!(stdout.contains("writes_match="), "stdout = {stdout}");
+}
+
+#[test]
+fn poll_vs_irq_prints_irq_stat_poll() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok);
+    assert!(stdout.contains("irq_stat_poll=0x"), "stdout = {stdout}");
+}
+
+#[test]
+fn poll_vs_irq_prints_irq_stat_irq() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok);
+    assert!(stdout.contains("irq_stat_irq=0x"), "stdout = {stdout}");
+}
+
+#[test]
+fn poll_vs_irq_custom_layers() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq", "--num-layers", "5"]);
+    assert!(ok);
+    assert!(stdout.starts_with("OK "));
+}
+
+#[test]
+fn poll_vs_irq_custom_neurons() {
+    let (ok, _, stderr) = run(&["host-poll-vs-irq", "--neurons", "128"]); // stdout is not inspected here
+    assert!(ok, "--neurons 128 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn poll_vs_irq_custom_chunks() {
+    let (ok, _, stderr) = run(&["host-poll-vs-irq", "--chunks", "32"]); // stdout is not inspected here
+    assert!(ok, "--chunks 32 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn poll_vs_irq_custom_threshold() {
+    let (ok, _, stderr) = run(&["host-poll-vs-irq", "--threshold", "7"]); // stdout is not inspected here
+    assert!(ok, "--threshold 7 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn poll_vs_irq_custom_weight_addr() {
+    let (ok, _, stderr) = run(&["host-poll-vs-irq", "--weight-addr", "1099511627776"]); // 2^40: an address wider than 32 bits
+    assert!(ok, "--weight-addr 2^40 should succeed; stderr = {stderr}");
+}
+
+#[test]
+fn poll_vs_irq_zero_layers_fails() {
+    let (ok, _, stderr) = run(&["host-poll-vs-irq", "--num-layers", "0"]);
+    assert!(!ok);
+    assert!(stderr.contains("configure") || stderr.contains("InvalidConfig"));
+}
+
+#[test]
+fn poll_vs_irq_zero_neurons_fails() {
+    let (ok, _, _stderr) = run(&["host-poll-vs-irq", "--neurons", "0"]);
+    assert!(!ok);
+}
+
+#[test]
+fn poll_vs_irq_zero_chunks_fails() {
+    let (ok, _, _stderr) = run(&["host-poll-vs-irq", "--chunks", "0"]);
+    assert!(!ok);
+}
+
+#[test]
+fn poll_vs_irq_max_polls_one_succeeds() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq", "--max-polls", "1"]);
+    assert!(ok, "done is preset, 1 poll is enough");
+    assert!(stdout.starts_with("OK "));
+}
+
+#[test]
+fn poll_vs_irq_deterministic() {
+    let (ok1, s1, _) = run(&["host-poll-vs-irq"]);
+    let (ok2, s2, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok1 && ok2);
+    assert_eq!(s1, s2, "should be deterministic");
+}
+
+#[test]
+fn poll_vs_irq_single_line_output() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq"]);
+    assert!(ok);
+    let trimmed = stdout.trim_end_matches('\n');
+    assert!(!trimmed.contains('\n'), "expected single line, got {stdout}");
+}
+
+#[test]
+fn poll_vs_irq_no_stderr_on_success() {
+    let (ok, _, stderr) = run(&["host-poll-vs-irq"]);
+    assert!(ok);
+    assert!(stderr.trim().is_empty(), "stderr should be empty: {stderr}");
+}
+
+#[test]
+fn poll_vs_irq_combined_overrides() {
+    let (ok, stdout, _) = run(&[
+        "host-poll-vs-irq",
+        "--num-layers", "3",
+        "--neurons", "64",
+        "--chunks", "8",
+        "--threshold", "42",
+        "--weight-addr", "1024",
+    ]);
+    assert!(ok);
+    assert!(stdout.contains("writes_match="));
+}
+
+#[test]
+fn poll_vs_irq_help_lists_all_flags() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq", "--help"]);
+    assert!(ok);
+    for flag in ["--num-layers", "--neurons", "--chunks", "--threshold", "--weight-addr", "--max-polls"] {
+        assert!(stdout.contains(flag), "missing {flag} in help: {stdout}");
+    }
+}
+
+#[test]
+fn poll_vs_irq_help_mentions_wave_40() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq", "--help"]);
+    assert!(ok);
+    assert!(stdout.contains("Wave 40") || stdout.contains("R-HS-2"), "expected Wave 40 / R-HS-2: {stdout}");
+}
diff --git a/bootstrap/tests/host_json.rs b/bootstrap/tests/host_json.rs
new file mode 100644
index 00000000..1f4179a6
--- /dev/null
+++ b/bootstrap/tests/host_json.rs
@@ -0,0 +1,235 @@
+use std::process::Command;
+
+fn bin() -> &'static str {
+    env!("CARGO_BIN_EXE_t27c") // resolved at compile time: Cargo sets this to the built `t27c` binary's path for integration tests
+}
+
+fn run(args: &[&str]) -> (bool, String, String) {
+    // Spawn the CLI, wait for it to exit, and capture both output streams.
+    let mut cmd = Command::new(bin());
+    cmd.args(args);
+    let captured = cmd.output().expect("failed to spawn t27c");
+    // Lossy decoding: invalid UTF-8 is replaced rather than aborting the test.
+    let decode = |bytes: &[u8]| String::from_utf8_lossy(bytes).into_owned();
+    let succeeded = captured.status.success();
+    let (stdout, stderr) = (decode(&captured.stdout), decode(&captured.stderr));
+    (succeeded, stdout, stderr)
+}
+/// Parses a command's stdout as JSON; panics, echoing the trimmed input, if it is not valid JSON.
+fn parse_json(s: &str) -> serde_json::Value {
+    let parsed: Result<serde_json::Value, _> = serde_json::from_str(s.trim());
+    parsed.unwrap_or_else(|e| panic!("invalid JSON: {e}\ninput: {trimmed}", trimmed = s.trim()))
+}
+
+// -- host-smoke --json --
+
+#[test]
+fn smoke_json_is_valid() {
+    let (ok, stdout, _) = run(&["host-smoke", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["ok"], true);
+}
+
+#[test]
+fn smoke_json_has_writes_reads() {
+    let (ok, stdout, _) = run(&["host-smoke", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["writes"].is_number());
+    assert!(v["reads"].is_number());
+}
+
+#[test]
+fn smoke_json_has_config_fields() {
+    let (ok, stdout, _) = run(&["host-smoke", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["layers"], 2);
+    assert_eq!(v["neurons"], 16);
+    assert_eq!(v["chunks"], 4);
+    assert_eq!(v["threshold"], 1);
+}
+
+#[test]
+fn smoke_json_weight_addr_is_string() {
+    let (ok, stdout, _) = run(&["host-smoke", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["weight_addr"].is_string());
+    assert!(v["weight_addr"].as_str().unwrap().starts_with("0x"));
+}
+
+#[test]
+fn smoke_json_irq_stat_is_string() {
+    let (ok, stdout, _) = run(&["host-smoke", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["irq_stat"].is_string());
+}
+
+// -- host-poll-vs-irq --json --
+
+#[test]
+fn poll_vs_irq_json_is_valid() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["ok"], true);
+}
+
+#[test]
+fn poll_vs_irq_json_has_poll_and_irq_counts() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["poll_writes"].is_number());
+    assert!(v["poll_reads"].is_number());
+    assert!(v["irq_writes"].is_number());
+    assert!(v["irq_reads"].is_number());
+}
+
+#[test]
+fn poll_vs_irq_json_writes_match_is_bool() {
+    let (ok, stdout, _) = run(&["host-poll-vs-irq", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["writes_match"].is_boolean());
+}
+
+// -- host-inference --json --
+
+#[test]
+fn inference_json_is_valid() {
+    let (ok, stdout, _) = run(&["host-inference", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["ok"], true);
+}
+
+#[test]
+fn inference_json_has_layers_completed() {
+    let (ok, stdout, _) = run(&["host-inference", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["total_layers"], 2);
+    assert_eq!(v["layers_completed"], 2);
+}
+
+#[test]
+fn inference_json_has_writes_reads() {
+    let (ok, stdout, _) = run(&["host-inference", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["total_writes"].is_number());
+    assert!(v["total_reads"].is_number());
+}
+
+#[test]
+fn inference_json_error_layer_is_null_on_success() {
+    let (ok, stdout, _) = run(&["host-inference", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["error_layer"].is_null());
+}
+
+#[test]
+fn inference_json_single_layer() {
+    let (ok, stdout, _) = run(&["host-inference", "--json", "--num-layers", "1"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["total_layers"], 1);
+    assert_eq!(v["layers_completed"], 1);
+}
+
+// -- host-perf --json --
+
+#[test]
+fn perf_json_is_valid() {
+    let (ok, stdout, _) = run(&["host-perf", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["ok"], true);
+}
+
+#[test]
+fn perf_json_has_config() {
+    let (ok, stdout, _) = run(&["host-perf", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["layers"], 2);
+    assert_eq!(v["neurons"], 16);
+    assert_eq!(v["chunks"], 4);
+}
+
+#[test]
+fn perf_json_has_cycles_and_dma() {
+    let (ok, stdout, _) = run(&["host-perf", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["total_cycles"].is_number());
+    assert!(v["total_dma_beats"].is_number());
+    assert!(v["total_weight_words"].is_number());
+}
+
+#[test]
+fn perf_json_has_bram_pct() {
+    let (ok, stdout, _) = run(&["host-perf", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["bram_utilization_pct"].is_number());
+}
+
+#[test]
+fn perf_json_has_throughput() {
+    let (ok, stdout, _) = run(&["host-perf", "--json"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert!(v["throughput_inf_per_sec"].is_number());
+    assert!(v["clock_mhz"].is_number());
+}
+
+#[test]
+fn perf_json_custom_clock() {
+    let (ok, stdout, _) = run(&["host-perf", "--json", "--clock-mhz", "100.0"]);
+    assert!(ok);
+    let v = parse_json(&stdout);
+    assert_eq!(v["clock_mhz"], 100.0);
+}
+
+// -- cross-command: without --json, output is NOT JSON --
+
+#[test]
+fn smoke_without_json_is_not_json_object() {
+    let (ok, stdout, _) = run(&["host-smoke"]);
+    assert!(ok);
+    let trimmed = stdout.trim();
+    assert!(trimmed.starts_with("OK "), "expected human-readable: {trimmed}");
+    assert!(!trimmed.starts_with("{"), "should not be JSON: {trimmed}");
+}
+
+#[test]
+fn perf_without_json_is_not_json_object() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    let trimmed = stdout.trim();
+    assert!(trimmed.starts_with("OK "), "expected human-readable: {trimmed}");
+}
+
+// -- determinism --
+
+#[test]
+fn smoke_json_deterministic() {
+    let (ok1, s1, _) = run(&["host-smoke", "--json"]);
+    let (ok2, s2, _) = run(&["host-smoke", "--json"]);
+    assert!(ok1 && ok2);
+    assert_eq!(s1, s2);
+}
+
+#[test]
+fn perf_json_deterministic() {
+    let (ok1, s1, _) = run(&["host-perf", "--json"]);
+    let (ok2, s2, _) = run(&["host-perf", "--json"]);
+    assert!(ok1 && ok2);
+    assert_eq!(s1, s2);
+}
diff --git a/bootstrap/tests/host_perf.rs b/bootstrap/tests/host_perf.rs
new file mode 100644
index 00000000..002d81f0
--- /dev/null
+++ b/bootstrap/tests/host_perf.rs
@@ -0,0 +1,189 @@
+use std::process::Command;
+
+fn bin() -> &'static str {
+    env!("CARGO_BIN_EXE_t27c") // resolved at compile time: Cargo sets this to the built `t27c` binary's path for integration tests
+}
+
+fn run(args: &[&str]) -> (bool, String, String) {
+    // Spawn the CLI, wait for it to exit, and capture both output streams.
+    let mut cmd = Command::new(bin());
+    cmd.args(args);
+    let captured = cmd.output().expect("failed to spawn t27c");
+    // Lossy decoding: invalid UTF-8 is replaced rather than aborting the test.
+    let decode = |bytes: &[u8]| String::from_utf8_lossy(bytes).into_owned();
+    let succeeded = captured.status.success();
+    let (stdout, stderr) = (decode(&captured.stdout), decode(&captured.stderr));
+    (succeeded, stdout, stderr)
+}
+
+#[test]
+fn perf_default_succeeds() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok, "default should succeed");
+    assert!(stdout.starts_with("OK "), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_layer_count() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("layers=2"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_neuron_count() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("neurons=16"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_chunk_count() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("chunks=4"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_total_cycles() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("total_cycles="), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_weight_words() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("total_weight_words="), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_bram_pct() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("bram_pct="), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_dma_beats() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("dma_beats="), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_throughput() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("throughput="), "stdout = {stdout}");
+    assert!(stdout.contains("inf/s"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_prints_clock_freq() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stdout.contains("MHz"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_custom_layers() {
+    let (ok, stdout, _) = run(&["host-perf", "--num-layers", "5"]);
+    assert!(ok);
+    assert!(stdout.contains("layers=5"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_custom_neurons() {
+    let (ok, stdout, _) = run(&["host-perf", "--neurons", "128"]);
+    assert!(ok);
+    assert!(stdout.contains("neurons=128"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_custom_chunks() {
+    let (ok, stdout, _) = run(&["host-perf", "--chunks", "32"]);
+    assert!(ok);
+    assert!(stdout.contains("chunks=32"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_custom_clock() {
+    let (ok, stdout, _) = run(&["host-perf", "--clock-mhz", "100.0"]);
+    assert!(ok);
+    assert!(stdout.contains("@ 100.0 MHz"), "stdout = {stdout}");
+}
+
+#[test]
+fn perf_zero_layers_fails() {
+    let (ok, _, stderr) = run(&["host-perf", "--num-layers", "0"]);
+    assert!(!ok);
+    assert!(stderr.contains("invalid config"), "stderr = {stderr}");
+}
+
+#[test]
+fn perf_zero_neurons_fails() {
+    let (ok, _, _stderr) = run(&["host-perf", "--neurons", "0"]);
+    assert!(!ok);
+}
+
+#[test]
+fn perf_zero_chunks_fails() {
+    let (ok, _, _stderr) = run(&["host-perf", "--chunks", "0"]);
+    assert!(!ok);
+}
+
+#[test]
+fn perf_deterministic() {
+    let (ok1, s1, _) = run(&["host-perf"]);
+    let (ok2, s2, _) = run(&["host-perf"]);
+    assert!(ok1 && ok2);
+    assert_eq!(s1, s2, "should be deterministic");
+}
+
+#[test]
+fn perf_single_line_output() {
+    let (ok, stdout, _) = run(&["host-perf"]);
+    assert!(ok);
+    let trimmed = stdout.trim_end_matches('\n');
+    assert!(!trimmed.contains('\n'), "expected single line, got {stdout}");
+}
+
+#[test]
+fn perf_no_stderr_on_success() {
+    let (ok, _, stderr) = run(&["host-perf"]);
+    assert!(ok);
+    assert!(stderr.trim().is_empty(), "stderr should be empty: {stderr}");
+}
+
+#[test]
+fn perf_help_lists_all_flags() {
+    let (ok, stdout, _) = run(&["host-perf", "--help"]);
+    assert!(ok);
+    for flag in ["--num-layers", "--neurons", "--chunks", "--clock-mhz"] {
+        assert!(stdout.contains(flag), "missing {flag} in help: {stdout}");
+    }
+}
+
+#[test]
+fn perf_help_mentions_wave_42() {
+    let (ok, stdout, _) = run(&["host-perf", "--help"]);
+    assert!(ok);
+    assert!(stdout.contains("Wave 42") || stdout.contains("R-HS-4"), "expected Wave 42 / R-HS-4: {stdout}");
+}
+
+#[test]
+fn perf_cycles_increase_with_layers() {
+    let (ok1, s1, _) = run(&["host-perf", "--num-layers", "1"]);
+    let (ok2, s2, _) = run(&["host-perf", "--num-layers", "4"]);
+    assert!(ok1 && ok2);
+    let c1: u64 = extract_total_cycles(&s1);
+    let c2: u64 = extract_total_cycles(&s2);
+    assert!(c2 > c1, "4-layer cycles ({c2}) should exceed 1-layer ({c1})");
+}
+/// Returns the integer following the first `total_cycles=` in `s`, or 0 when the marker or a parsable number is missing.
+fn extract_total_cycles(s: &str) -> u64 {
+    let mut pieces = s.split("total_cycles=");
+    pieces.nth(1).and_then(|tail| tail.split_whitespace().next()).and_then(|tok| tok.parse().ok()).unwrap_or(0)
+}
diff --git a/docs/NOW.md b/docs/NOW.md
index 14aae767..52f7ccbb 100644
--- a/docs/NOW.md
+++ b/docs/NOW.md
@@ -2,6 +2,30 @@
 
 Last updated: 2026-05-23
 
+## wave-43 -- t27c --json flag for host CLI commands (R-HS-5, Closes #795)
+
+- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/json_output.rs` (`HostSmokeJson`, `HostPollVsIrqJson`, `HostInferenceJson`, `HostPerfJson` structs, `print_json` helper); updated `bootstrap/src/host/mod.rs`; added `--json` flag to all 4 host commands in `main.rs`; new test file `bootstrap/tests/host_json.rs` (23 integration tests).
+- **Why** (R-HS-5): Host commands emit human-readable single-line output by default. `--json` enables structured JSON for CI pipelines, trios-bridge, and downstream tooling. Additive — default output unchanged.
+- **Tests**: 23 new integration tests. All pass. Zero regressions.
+
+## wave-42 -- t27c host-perf -- performance model and cycle estimator (R-HS-4, Closes #791)
+
+- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/perf.rs` (`EngineConfig`, `PerformanceEstimate`, `LayerEstimate`; cycle/DMA/BRAM/throughput estimation; 19 inline unit tests); updated `bootstrap/src/host/mod.rs`; new CLI `Commands::HostPerf` + `run_host_perf()`; new test file `bootstrap/tests/host_perf.rs` (23 integration tests).
+- **Why** (R-HS-4): W41 gave us the full inference engine. W42 adds the analytical performance model: given engine config (layers, neurons, chunks), compute per-layer DMA beats, compute cycles, BRAM utilization, total inference cycles, and throughput at a given clock frequency. Pure arithmetic, no hardware dependency. Essential for FPGA bringup (compare estimated vs actual CYCLES counter from W36f).
+- **Tests**: 42 new (19 inline + 23 integration). All pass. Zero regressions.
+
+## wave-41 -- t27c host-inference -- DMA-driven multi-layer BitNet inference flow (R-HS-3, Closes #789)
+
+- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/engine.rs` (`InferenceEngine`, `InferenceReport`, per-layer DMA prefetch → inference → DMA drain cycle; 16 inline unit tests); updated `bootstrap/src/host/irq.rs` (`wait_irq_mask` generic IRQ wait, refactored from `wait_done_irq`); updated `bootstrap/src/host/mod.rs`; new CLI `Commands::HostInference` + `run_host_inference()`; new test file `bootstrap/tests/host_engine.rs` (20 integration tests).
+- **Why** (R-HS-3): W40 gave us IRQ-driven single-completion. W41 orchestrates the full multi-layer BitNet inference flow: for each layer, DMA prefetch weights (wait DmaDone IRQ), start inference (wait InferenceDone IRQ), DMA drain output (wait DmaDone IRQ). Uses `wait_irq_mask()` — a new generic IRQ-wait method on `IrqDrivenDriver` that waits for any mask, not just inference-done.
+- **Tests**: 36 new (16 inline + 20 integration). All pass. Zero regressions.
+
+## wave-40 -- t27c host-poll-vs-irq -- IRQ-handler harness + poll-vs-IRQ comparison (R-HS-2, Closes #786)
+
+- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/irq.rs` (`IrqSource` enum, `IrqHandler<M>` callback registry, `IrqDrivenDriver<M>` with `wait_done_irq`; 11 inline unit tests); updated `bootstrap/src/host/mod.rs`; new CLI `Commands::HostPollVsIrq` + `run_host_poll_vs_irq()`; new test file `bootstrap/tests/host_irq.rs` (21 integration tests).
+- **Why** (R-HS-2): W39 added poll-mode driver. W40 adds interrupt-driven completion path with callback dispatch and side-by-side poll-vs-IRQ comparison on MockMmio.
+- **Tests**: 32 new (11 inline + 21 integration). All pass.
+
 ## wave-39 -- t27c host-side Rust driver module: BitNet AXI-Lite CSR aperture (R-HS-1, Closes #784)
 
 - **WHERE** (bootstrap-only, additive): new directory `bootstrap/src/host/` with four files -- `mod.rs` (re-exports), `csr_map.rs` (10 CSR offset constants + status/IRQ bit masks + 10 inline unit tests), `mmio.rs` (`Mmio` trait + `MockMmio` deterministic BTreeMap backend + transaction log + 10 inline unit tests), `driver.rs` (`BitnetDriver<M: Mmio>` orchestrator with configure / start / poll / IRQ / dump methods + `CsrSnapshot` struct + `DriverError` enum + 11 inline unit tests). One new `mod host;` declaration in `bootstrap/src/main.rs`. One new CLI subcommand `Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls }` registered in the `Commands` enum and dispatched in both HTTP-server and CLI match arms via `run_host_smoke(...)`. **Zero** edits under `gen/`, `coq/`, `trios-coq/`, `proofs/`, `specs/`, `conformance/`, `architecture/`, `rings/`, root `Cargo.toml`. Doc-only update to this file. New test file `bootstrap/tests/host_driver.rs` (25 integration tests via `CARGO_BIN_EXE_t27c`).