diff --git a/bootstrap/src/host/engine.rs b/bootstrap/src/host/engine.rs new file mode 100644 index 00000000..30ef3bbf --- /dev/null +++ b/bootstrap/src/host/engine.rs @@ -0,0 +1,316 @@ +use super::csr_map; +use super::driver::{BitnetDriver, DriverError}; +use super::irq::IrqDrivenDriver; +use super::mmio::MockMmio; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct InferenceReport { + pub total_layers: u32, + pub layers_completed: u32, + pub error_layer: Option, + pub total_writes: usize, + pub total_reads: usize, +} + +pub struct InferenceEngine { + driver: IrqDrivenDriver, + num_layers: u32, + neurons: u32, + chunks: u32, + threshold: u32, + weight_addr: u64, +} + +impl InferenceEngine { + pub fn new(driver: BitnetDriver) -> Self { + Self { + driver: IrqDrivenDriver::new(driver), + num_layers: 2, + neurons: 16, + chunks: 4, + threshold: 1, + weight_addr: 0, + } + } + + pub fn configure( + &mut self, + num_layers: u32, + neurons: u32, + chunks: u32, + threshold: u32, + weight_addr: u64, + ) -> Result<(), DriverError> { + if num_layers == 0 || neurons == 0 || chunks == 0 { + return Err(DriverError::InvalidConfig); + } + self.num_layers = num_layers; + self.neurons = neurons; + self.chunks = chunks; + self.threshold = threshold; + self.weight_addr = weight_addr; + Ok(()) + } + + pub fn run(&mut self, max_rounds_per_stage: u32) -> Result { + self.driver + .handler_mut() + .driver_mut() + .configure( + self.num_layers, + self.neurons, + self.chunks, + self.threshold, + self.weight_addr, + )?; + self.driver + .handler_mut() + .driver_mut() + .enable_irqs(csr_map::IRQ_ALL_MASK); + + let mut layers_completed: u32 = 0; + let mut error_layer: Option = None; + + for layer in 0..self.num_layers { + let layer_weight_addr = self.weight_addr.wrapping_add( + (layer as u64) * 0x1_0000 * (self.neurons as u64) * (self.chunks as u64), + ); + self.driver + .handler_mut() + .driver_mut() + .mmio_mut() + .poke(csr_map::WEIGHT_ADDR_LO, layer_weight_addr as u32); + self.driver 
+ .handler_mut() + .driver_mut() + .mmio_mut() + .poke(csr_map::WEIGHT_ADDR_HI, (layer_weight_addr >> 32) as u32); + + self.driver + .handler_mut() + .driver_mut() + .mmio_mut() + .latch_irq(csr_map::IRQ_DMA_DONE_MASK); + if let Err(e) = self.wait_dma_done(max_rounds_per_stage) { + error_layer = Some(layer); + if e == DriverError::EngineError { + break; + } + return Err(e); + } + + self.driver.handler_mut().driver_mut().start(); + self.driver + .handler_mut() + .driver_mut() + .mmio_mut() + .latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK); + self.driver + .handler_mut() + .driver_mut() + .mmio_mut() + .set_done(true); + if let Err(e) = self.wait_inference_done(max_rounds_per_stage) { + error_layer = Some(layer); + if e == DriverError::EngineError { + break; + } + return Err(e); + } + + self.driver + .handler_mut() + .driver_mut() + .mmio_mut() + .latch_irq(csr_map::IRQ_DMA_DONE_MASK); + if let Err(e) = self.wait_dma_done(max_rounds_per_stage) { + error_layer = Some(layer); + if e == DriverError::EngineError { + break; + } + return Err(e); + } + + layers_completed += 1; + } + + let total_writes = self.driver.handler().driver().mmio().write_count(); + let total_reads = self.driver.handler().driver().mmio().read_count(); + + Ok(InferenceReport { + total_layers: self.num_layers, + layers_completed, + error_layer, + total_writes, + total_reads, + }) + } + + fn wait_dma_done(&mut self, max_rounds: u32) -> Result<(), DriverError> { + self.driver.wait_irq_mask(csr_map::IRQ_DMA_DONE_MASK, max_rounds) + } + + fn wait_inference_done(&mut self, max_rounds: u32) -> Result<(), DriverError> { + self.driver.wait_irq_mask(csr_map::IRQ_INFERENCE_DONE_MASK, max_rounds) + } + + pub fn driver(&self) -> &IrqDrivenDriver { + &self.driver + } + + pub fn driver_mut(&mut self) -> &mut IrqDrivenDriver { + &mut self.driver + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::host::mmio::MockMmio; + + fn fresh() -> InferenceEngine { + 
InferenceEngine::new(BitnetDriver::new(MockMmio::with_csrs_zeroed())) + } + + #[test] + fn configure_rejects_zero_layers() { + let mut e = fresh(); + assert_eq!(e.configure(0, 1, 1, 1, 0), Err(DriverError::InvalidConfig)); + } + + #[test] + fn configure_rejects_zero_neurons() { + let mut e = fresh(); + assert_eq!(e.configure(1, 0, 1, 1, 0), Err(DriverError::InvalidConfig)); + } + + #[test] + fn configure_rejects_zero_chunks() { + let mut e = fresh(); + assert_eq!(e.configure(1, 1, 0, 1, 0), Err(DriverError::InvalidConfig)); + } + + #[test] + fn run_single_layer_succeeds() { + let mut e = fresh(); + e.configure(1, 16, 4, 1, 0).unwrap(); + let report = e.run(4).unwrap(); + assert_eq!(report.total_layers, 1); + assert_eq!(report.layers_completed, 1); + assert_eq!(report.error_layer, None); + } + + #[test] + fn run_two_layers_succeeds() { + let mut e = fresh(); + e.configure(2, 16, 4, 1, 0).unwrap(); + let report = e.run(4).unwrap(); + assert_eq!(report.total_layers, 2); + assert_eq!(report.layers_completed, 2); + } + + #[test] + fn run_reports_writes_and_reads() { + let mut e = fresh(); + e.configure(1, 16, 4, 1, 0).unwrap(); + let report = e.run(4).unwrap(); + assert!(report.total_writes > 0); + assert!(report.total_reads > 0); + } + + #[test] + fn run_increases_writes_with_more_layers() { + let mut e1 = fresh(); + e1.configure(1, 16, 4, 1, 0).unwrap(); + let r1 = e1.run(4).unwrap(); + + let mut e2 = fresh(); + e2.configure(3, 16, 4, 1, 0).unwrap(); + let r2 = e2.run(4).unwrap(); + + assert!(r2.total_writes > r1.total_writes); + } + + #[test] + fn run_error_on_irq_error_stops_early() { + let mut e = fresh(); + e.configure(3, 16, 4, 1, 0).unwrap(); + e.driver_mut() + .handler_mut() + .driver_mut() + .mmio_mut() + .latch_irq(csr_map::IRQ_ERROR_MASK); + let report = e.run(4).unwrap(); + assert!(report.layers_completed < report.total_layers); + assert!(report.error_layer.is_some()); + } + + #[test] + fn run_returns_error_on_zero_rounds() { + let mut e = fresh(); + 
e.configure(1, 16, 4, 1, 0).unwrap(); + let result = e.run(0); + assert_eq!(result, Err(DriverError::Timeout)); + } + + #[test] + fn five_layers_all_complete() { + let mut e = fresh(); + e.configure(5, 4, 2, 1, 0).unwrap(); + let report = e.run(4).unwrap(); + assert_eq!(report.layers_completed, 5); + assert_eq!(report.error_layer, None); + } + + #[test] + fn one_layer_one_neuron_one_chunk() { + let mut e = fresh(); + e.configure(1, 1, 1, 0, 0).unwrap(); + let report = e.run(4).unwrap(); + assert_eq!(report.layers_completed, 1); + } + + #[test] + fn threshold_zero_is_valid() { + let mut e = fresh(); + e.configure(1, 4, 2, 0, 0).unwrap(); + let report = e.run(4).unwrap(); + assert_eq!(report.layers_completed, 1); + } + + #[test] + fn large_weight_addr_wraps() { + let mut e = fresh(); + e.configure(1, 4, 2, 1, 0xFFFF_FFFF_FFFF_FFFF).unwrap(); + let report = e.run(4).unwrap(); + assert_eq!(report.layers_completed, 1); + } + + #[test] + fn read_count_positive_after_run() { + let mut e = fresh(); + e.configure(1, 4, 4, 1, 0).unwrap(); + let report = e.run(4).unwrap(); + assert!(report.total_reads >= 1); + } + + #[test] + fn writes_increase_monotonically_with_layers() { + let results: Vec = (1..=4) + .map(|n| { + let mut e = fresh(); + e.configure(n, 4, 2, 1, 0).unwrap(); + e.run(4).unwrap() + }) + .collect(); + for w in results.windows(2) { + assert!(w[1].total_writes > w[0].total_writes); + } + } + + #[test] + fn configure_accepts_max_values() { + let mut e = fresh(); + assert!(e.configure(u32::MAX, u32::MAX, u32::MAX, u32::MAX, u64::MAX).is_ok()); + } +} diff --git a/bootstrap/src/host/irq.rs b/bootstrap/src/host/irq.rs new file mode 100644 index 00000000..c8aa724a --- /dev/null +++ b/bootstrap/src/host/irq.rs @@ -0,0 +1,257 @@ +use super::csr_map; +use super::driver::{BitnetDriver, DriverError}; +use super::mmio::Mmio; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IrqSource { + InferenceDone, + DmaDone, + Error, +} + +impl IrqSource { + pub fn mask(self) 
-> u32 { + match self { + IrqSource::InferenceDone => csr_map::IRQ_INFERENCE_DONE_MASK, + IrqSource::DmaDone => csr_map::IRQ_DMA_DONE_MASK, + IrqSource::Error => csr_map::IRQ_ERROR_MASK, + } + } + + pub fn from_mask(mask: u32) -> Vec { + let mut sources = Vec::new(); + if mask & csr_map::IRQ_INFERENCE_DONE_MASK != 0 { + sources.push(IrqSource::InferenceDone); + } + if mask & csr_map::IRQ_DMA_DONE_MASK != 0 { + sources.push(IrqSource::DmaDone); + } + if mask & csr_map::IRQ_ERROR_MASK != 0 { + sources.push(IrqSource::Error); + } + sources + } +} + +type IrqCallback = fn(IrqSource); + +pub struct IrqHandler { + driver: BitnetDriver, + callbacks: [Option; 3], +} + +impl IrqHandler { + pub fn new(driver: BitnetDriver) -> Self { + Self { + driver, + callbacks: [None, None, None], + } + } + + pub fn register(&mut self, source: IrqSource, cb: IrqCallback) { + let idx = match source { + IrqSource::InferenceDone => 0, + IrqSource::DmaDone => 1, + IrqSource::Error => 2, + }; + self.callbacks[idx] = Some(cb); + } + + pub fn service(&mut self) -> u32 { + let stat = self.driver.read_irq_status(); + if stat == 0 { + return 0; + } + let sources = IrqSource::from_mask(stat); + for src in sources { + let idx = match src { + IrqSource::InferenceDone => 0, + IrqSource::DmaDone => 1, + IrqSource::Error => 2, + }; + if let Some(cb) = self.callbacks[idx] { + cb(src); + } + } + self.driver.clear_irq(stat); + stat + } + + pub fn driver(&self) -> &BitnetDriver { + &self.driver + } + + pub fn driver_mut(&mut self) -> &mut BitnetDriver { + &mut self.driver + } +} + +pub struct IrqDrivenDriver { + handler: IrqHandler, +} + +impl IrqDrivenDriver { + pub fn new(driver: BitnetDriver) -> Self { + Self { + handler: IrqHandler::new(driver), + } + } + + pub fn register(&mut self, source: IrqSource, cb: IrqCallback) { + self.handler.register(source, cb); + } + + pub fn handler(&self) -> &IrqHandler { + &self.handler + } + + pub fn handler_mut(&mut self) -> &mut IrqHandler { + &mut self.handler + } + + 
pub fn wait_done_irq(&mut self, max_service_rounds: u32) -> Result<(), DriverError> { + self.wait_irq_mask(csr_map::IRQ_INFERENCE_DONE_MASK, max_service_rounds) + } + + pub fn wait_irq_mask(&mut self, mask: u32, max_service_rounds: u32) -> Result<(), DriverError> { + for _ in 0..max_service_rounds { + let serviced = self.handler.service(); + if serviced & csr_map::IRQ_ERROR_MASK != 0 { + return Err(DriverError::EngineError); + } + if serviced & mask != 0 { + return Ok(()); + } + if mask == csr_map::IRQ_INFERENCE_DONE_MASK + && self.handler.driver_mut().is_done() + { + return Ok(()); + } + } + Err(DriverError::Timeout) + } + + pub fn into_handler(self) -> IrqHandler { + self.handler + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::host::mmio::MockMmio; + + thread_local! { + static FIRED: std::cell::RefCell> = std::cell::RefCell::new(Vec::new()); + } + + fn record_cb(src: IrqSource) { + FIRED.with(|f| f.borrow_mut().push(src)); + } + + fn fired() -> Vec { + FIRED.with(|f| f.borrow().clone()) + } + + fn clear_fired() { + FIRED.with(|f| f.borrow_mut().clear()); + } + + fn fresh_handler() -> IrqHandler { + IrqHandler::new(BitnetDriver::new(MockMmio::with_csrs_zeroed())) + } + + fn fresh_irq_driver() -> IrqDrivenDriver { + IrqDrivenDriver::new(BitnetDriver::new(MockMmio::with_csrs_zeroed())) + } + + #[test] + fn irq_source_mask_roundtrip() { + assert_eq!(IrqSource::InferenceDone.mask(), csr_map::IRQ_INFERENCE_DONE_MASK); + assert_eq!(IrqSource::DmaDone.mask(), csr_map::IRQ_DMA_DONE_MASK); + assert_eq!(IrqSource::Error.mask(), csr_map::IRQ_ERROR_MASK); + } + + #[test] + fn from_mask_empty() { + assert!(IrqSource::from_mask(0).is_empty()); + } + + #[test] + fn from_mask_all_three() { + let sources = IrqSource::from_mask(csr_map::IRQ_ALL_MASK); + assert_eq!(sources.len(), 3); + } + + #[test] + fn handler_service_no_irqs_returns_zero() { + let mut h = fresh_handler(); + assert_eq!(h.service(), 0); + } + + #[test] + fn handler_service_dispatches_callback() { + 
clear_fired(); + let mut h = fresh_handler(); + h.register(IrqSource::InferenceDone, record_cb); + h.driver_mut().mmio_mut().latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK); + let serviced = h.service(); + assert_eq!(serviced, csr_map::IRQ_INFERENCE_DONE_MASK); + assert_eq!(fired(), vec![IrqSource::InferenceDone]); + } + + #[test] + fn handler_service_calls_clear_irq() { + let mut h = fresh_handler(); + h.driver_mut().mmio_mut().latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK); + h.service(); + let log = h.driver().mmio().log(); + let last = log.last().unwrap(); + assert_eq!(last.op, super::super::mmio::MmioOp::Write); + assert_eq!(last.addr, csr_map::IRQ_STAT); + assert_eq!(last.value, csr_map::IRQ_INFERENCE_DONE_MASK); + } + + #[test] + fn handler_multiple_sources() { + clear_fired(); + let mut h = fresh_handler(); + h.register(IrqSource::InferenceDone, record_cb); + h.register(IrqSource::DmaDone, record_cb); + h.driver_mut().mmio_mut().latch_irq( + csr_map::IRQ_INFERENCE_DONE_MASK | csr_map::IRQ_DMA_DONE_MASK, + ); + let serviced = h.service(); + assert_eq!(serviced, csr_map::IRQ_INFERENCE_DONE_MASK | csr_map::IRQ_DMA_DONE_MASK); + let f = fired(); + assert!(f.contains(&IrqSource::InferenceDone)); + assert!(f.contains(&IrqSource::DmaDone)); + } + + #[test] + fn irq_driver_wait_done_succeeds_on_inference_done() { + let mut d = fresh_irq_driver(); + d.handler_mut().driver_mut().mmio_mut().latch_irq(csr_map::IRQ_INFERENCE_DONE_MASK); + assert_eq!(d.wait_done_irq(4), Ok(())); + } + + #[test] + fn irq_driver_wait_done_returns_error_on_irq_error() { + let mut d = fresh_irq_driver(); + d.handler_mut().driver_mut().mmio_mut().latch_irq(csr_map::IRQ_ERROR_MASK); + assert_eq!(d.wait_done_irq(4), Err(DriverError::EngineError)); + } + + #[test] + fn irq_driver_wait_done_times_out() { + let mut d = fresh_irq_driver(); + assert_eq!(d.wait_done_irq(2), Err(DriverError::Timeout)); + } + + #[test] + fn irq_driver_wait_done_falls_back_to_done_bit() { + let mut d = fresh_irq_driver(); + 
d.handler_mut().driver_mut().mmio_mut().set_done(true); + assert_eq!(d.wait_done_irq(4), Ok(())); + } +} diff --git a/bootstrap/src/host/json_output.rs b/bootstrap/src/host/json_output.rs new file mode 100644 index 00000000..65ce1935 --- /dev/null +++ b/bootstrap/src/host/json_output.rs @@ -0,0 +1,58 @@ +use serde::Serialize; + +#[derive(Debug, Clone, Serialize)] +pub struct HostSmokeJson { + pub ok: bool, + pub writes: usize, + pub reads: usize, + pub layers: u32, + pub neurons: u32, + pub chunks: u32, + pub threshold: u32, + pub weight_addr: String, + pub irq_stat: String, +} + +#[derive(Debug, Clone, Serialize)] +pub struct HostPollVsIrqJson { + pub ok: bool, + pub poll_writes: usize, + pub poll_reads: usize, + pub irq_writes: usize, + pub irq_reads: usize, + pub writes_match: bool, + pub irq_stat_poll: String, + pub irq_stat_irq: String, +} + +#[derive(Debug, Clone, Serialize)] +pub struct HostInferenceJson { + pub ok: bool, + pub total_layers: u32, + pub layers_completed: u32, + pub error_layer: Option, + pub total_writes: usize, + pub total_reads: usize, +} + +#[derive(Debug, Clone, Serialize)] +pub struct HostPerfJson { + pub ok: bool, + pub layers: u32, + pub neurons: u32, + pub chunks: u32, + pub total_cycles: u32, + pub total_weight_words: u32, + pub total_weight_bytes: u32, + pub bram_utilization_pct: f64, + pub total_dma_beats: u32, + pub throughput_inf_per_sec: f64, + pub clock_mhz: f64, +} + +pub fn print_json(value: &T) -> anyhow::Result<()> { + let s = serde_json::to_string(value) + .map_err(|e| anyhow::anyhow!("JSON serialization failed: {}", e))?; + println!("{}", s); + Ok(()) +} diff --git a/bootstrap/src/host/mod.rs b/bootstrap/src/host/mod.rs index ad34d0e2..b63e5616 100644 --- a/bootstrap/src/host/mod.rs +++ b/bootstrap/src/host/mod.rs @@ -15,7 +15,15 @@ pub mod csr_map; pub mod driver; +pub mod engine; +pub mod irq; +pub mod json_output; pub mod mmio; +pub mod perf; pub use driver::{BitnetDriver, CsrSnapshot, DriverError}; +pub use 
/// Trits packed into one weight word.
pub const TRITS_PER_WORD: u32 = 27;
/// Bits used to encode one trit.
pub const BITS_PER_TRIT: u32 = 2;
/// Width of one weight word in bits (27 * 2 = 54).
pub const DATA_WIDTH: u32 = TRITS_PER_WORD * BITS_PER_TRIT;
/// Number of words in the BRAM used as the utilization denominator.
pub const BRAM_DEPTH: u32 = 4096;
/// Width of one DDR beat in bits.
pub const DDR_BEAT_BITS: u32 = 64;
/// Width of one DDR beat in bytes.
pub const DDR_BEAT_BYTES: u32 = DDR_BEAT_BITS / 8;
/// Whole weight words per DDR beat (integer division: 64 / 54 = 1).
pub const WORDS_PER_DDR_BEAT: u32 = DDR_BEAT_BITS / DATA_WIDTH;

/// Bytes counted per weight word by the model (integer division: 54 / 8 = 6).
const BYTES_PER_WORD: u32 = DATA_WIDTH / 8;

/// Engine dimensions the performance model is computed from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct EngineConfig {
    pub num_layers: u32,
    pub neurons: u32,
    pub chunks: u32,
}

/// Per-layer breakdown produced by [`EngineConfig::estimate`].
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct LayerEstimate {
    pub layer_index: u32,
    pub weight_words: u32,
    pub weight_bytes: u32,
    pub dma_prefetch_beats: u32,
    pub compute_cycles: u32,
    pub dma_drain_beats: u32,
    pub total_cycles: u32,
}

/// Whole-model estimate produced by [`EngineConfig::estimate`].
#[derive(Debug, Clone, PartialEq)]
pub struct PerformanceEstimate {
    pub config: EngineConfig,
    pub total_weight_words: u32,
    pub total_weight_bytes: u32,
    pub bram_utilization_pct: f64,
    pub total_dma_beats: u32,
    pub total_inference_cycles: u32,
    pub layers: Vec<LayerEstimate>,
}

/// All `u32` results saturate at `u32::MAX` instead of overflowing: `new`
/// accepts any non-zero dimensions, so products such as `neurons * chunks`
/// can exceed `u32`. Results for inputs that fit are exact.
impl EngineConfig {
    /// Builds a configuration, returning `None` when any dimension is zero.
    pub fn new(num_layers: u32, neurons: u32, chunks: u32) -> Option<Self> {
        if num_layers == 0 || neurons == 0 || chunks == 0 {
            return None;
        }
        Some(Self {
            num_layers,
            neurons,
            chunks,
        })
    }

    /// Weight words in one layer: `neurons * chunks`.
    pub fn weight_words_per_layer(&self) -> u32 {
        self.neurons.saturating_mul(self.chunks)
    }

    /// Weight words across all layers.
    pub fn total_weight_words(&self) -> u32 {
        self.weight_words_per_layer().saturating_mul(self.num_layers)
    }

    /// Weight bytes in one layer, counting `DATA_WIDTH / 8` bytes per word.
    pub fn weight_bytes_per_layer(&self) -> u32 {
        self.weight_words_per_layer().saturating_mul(BYTES_PER_WORD)
    }

    /// Weight bytes across all layers, counting `DATA_WIDTH / 8` bytes per word.
    pub fn total_weight_bytes(&self) -> u32 {
        self.total_weight_words().saturating_mul(BYTES_PER_WORD)
    }

    /// One layer's weight words as a percentage of [`BRAM_DEPTH`].
    ///
    /// Computed in `f64` from the raw dimensions, so it is not affected by
    /// `u32` saturation and may exceed 100.
    pub fn bram_utilization_pct(&self) -> f64 {
        let words = f64::from(self.neurons) * f64::from(self.chunks);
        (words / f64::from(BRAM_DEPTH)) * 100.0
    }

    /// DDR beats needed to move one layer's weights: the word count divided
    /// by [`WORDS_PER_DDR_BEAT`], rounded up.
    pub fn dma_beats_per_layer(&self) -> u32 {
        let words = self.weight_words_per_layer();
        // Quotient plus remainder flag: unlike `(words + d - 1) / d`, this
        // cannot overflow when `words` is near `u32::MAX`.
        words / WORDS_PER_DDR_BEAT + u32::from(words % WORDS_PER_DDR_BEAT != 0)
    }

    /// Compute cycles for one layer: `neurons * chunks`.
    pub fn compute_cycles_per_layer(&self) -> u32 {
        self.neurons.saturating_mul(self.chunks)
    }

    /// Cycles for one layer: prefetch beats + compute cycles + drain beats.
    pub fn cycles_per_layer(&self) -> u32 {
        let dma_beats = self.dma_beats_per_layer();
        dma_beats
            .saturating_add(self.compute_cycles_per_layer())
            .saturating_add(dma_beats)
    }

    /// Cycles for one full inference: per-layer cycles times `num_layers`.
    pub fn total_inference_cycles(&self) -> u32 {
        self.cycles_per_layer().saturating_mul(self.num_layers)
    }

    /// Inferences per second at `clock_mhz`.
    ///
    /// Returns `0.0` for a non-positive clock or a zero cycle count.
    pub fn throughput_inf_per_sec(&self, clock_mhz: f64) -> f64 {
        if clock_mhz <= 0.0 {
            return 0.0;
        }
        let cycles_per_inf = f64::from(self.total_inference_cycles());
        if cycles_per_inf == 0.0 {
            return 0.0;
        }
        (clock_mhz * 1e6) / cycles_per_inf
    }

    /// Builds the full estimate, with one identical [`LayerEstimate`] per
    /// layer (only `layer_index` differs).
    pub fn estimate(&self) -> PerformanceEstimate {
        // Every layer has the same shape, so compute the figures once.
        let weight_words = self.weight_words_per_layer();
        let weight_bytes = self.weight_bytes_per_layer();
        let dma_beats = self.dma_beats_per_layer();
        let compute_cycles = self.compute_cycles_per_layer();
        let total_cycles = self.cycles_per_layer();

        let layers: Vec<LayerEstimate> = (0..self.num_layers)
            .map(|layer_index| LayerEstimate {
                layer_index,
                weight_words,
                weight_bytes,
                dma_prefetch_beats: dma_beats,
                compute_cycles,
                dma_drain_beats: dma_beats,
                total_cycles,
            })
            .collect();

        // Saturating equivalent of summing prefetch + drain over all layers.
        let total_dma_beats = layers.iter().fold(0_u32, |acc, l| {
            acc.saturating_add(l.dma_prefetch_beats)
                .saturating_add(l.dma_drain_beats)
        });

        PerformanceEstimate {
            config: *self,
            total_weight_words: self.total_weight_words(),
            total_weight_bytes: self.total_weight_bytes(),
            bram_utilization_pct: self.bram_utilization_pct(),
            total_dma_beats,
            total_inference_cycles: self.total_inference_cycles(),
            layers,
        }
    }
}
EngineConfig::new(2, 16, 4).unwrap() + } + + #[test] + fn rejects_zero_layers() { + assert!(EngineConfig::new(0, 1, 1).is_none()); + } + + #[test] + fn rejects_zero_neurons() { + assert!(EngineConfig::new(1, 0, 1).is_none()); + } + + #[test] + fn rejects_zero_chunks() { + assert!(EngineConfig::new(1, 1, 0).is_none()); + } + + #[test] + fn weight_words_per_layer() { + assert_eq!(cfg().weight_words_per_layer(), 64); + } + + #[test] + fn total_weight_words() { + assert_eq!(cfg().total_weight_words(), 128); + } + + #[test] + fn weight_bytes_per_layer() { + assert_eq!(cfg().weight_bytes_per_layer(), 64 * (DATA_WIDTH / 8)); + } + + #[test] + fn bram_utilization_under_100() { + assert!(cfg().bram_utilization_pct() < 100.0); + } + + #[test] + fn bram_utilization_exact() { + let c = cfg(); + let expected = (c.weight_words_per_layer() as f64 / BRAM_DEPTH as f64) * 100.0; + assert!((c.bram_utilization_pct() - expected).abs() < 0.001); + } + + #[test] + fn compute_cycles_per_layer() { + assert_eq!(cfg().compute_cycles_per_layer(), 64); + } + + #[test] + fn cycles_per_layer_is_three_stages() { + let c = cfg(); + let dma = c.dma_beats_per_layer(); + let compute = c.compute_cycles_per_layer(); + assert_eq!(c.cycles_per_layer(), dma + compute + dma); + } + + #[test] + fn total_inference_cycles_scales_linearly() { + let c = cfg(); + assert_eq!( + c.total_inference_cycles(), + c.cycles_per_layer() * c.num_layers + ); + } + + #[test] + fn throughput_at_66_mhz() { + let t = cfg().throughput_inf_per_sec(66.0); + assert!(t > 0.0, "throughput should be positive: {t}"); + } + + #[test] + fn throughput_zero_clock_is_zero() { + assert_eq!(cfg().throughput_inf_per_sec(0.0), 0.0); + } + + #[test] + fn estimate_has_correct_layer_count() { + let e = cfg().estimate(); + assert_eq!(e.layers.len(), 2); + } + + #[test] + fn estimate_layer_indices_sequential() { + let e = cfg().estimate(); + for (i, l) in e.layers.iter().enumerate() { + assert_eq!(l.layer_index, i as u32); + } + } + + #[test] + fn 
estimate_total_weight_words_matches() { + let e = cfg().estimate(); + assert_eq!(e.total_weight_words, cfg().total_weight_words()); + } + + #[test] + fn large_config_does_not_overflow() { + let c = EngineConfig::new(100, 4096, 64).unwrap(); + let e = c.estimate(); + assert!(e.total_weight_words > 0); + assert!(e.total_inference_cycles > 0); + } + + #[test] + fn data_width_is_54() { + assert_eq!(DATA_WIDTH, 54); + } + + #[test] + fn bram_depth_is_4096() { + assert_eq!(BRAM_DEPTH, 4096); + } +} diff --git a/bootstrap/src/main.rs b/bootstrap/src/main.rs index 5249a998..9eea44b5 100644 --- a/bootstrap/src/main.rs +++ b/bootstrap/src/main.rs @@ -347,6 +347,114 @@ enum Commands { /// Maximum poll iterations before timeout (default: 16). #[arg(long, default_value_t = 16)] max_polls: u32, + + /// Emit structured JSON instead of human-readable output. + #[arg(long)] + json: bool, + }, + + /// Run a side-by-side poll-vs-IRQ comparison on MockMmio (Wave 40, R-HS-2). + /// + /// Executes the same configure -> start -> complete flow twice: once via + /// the poll-mode `wait_done` path (W39) and once via the interrupt-driven + /// `IrqDrivenDriver::wait_done_irq` path (W40). Prints a single comparison + /// line: `OK poll=Nw/Mr irq=Nw/Mr writes_match= irq_stat_poll=0x.. + /// irq_stat_irq=0x..`. + #[command(name = "host-poll-vs-irq")] + HostPollVsIrq { + /// Number of layers to program (default: 2). + #[arg(long, default_value_t = 2)] + num_layers: u32, + + /// Neurons per layer (default: 16). + #[arg(long, default_value_t = 16)] + neurons: u32, + + /// Chunks per neuron (default: 4). + #[arg(long, default_value_t = 4)] + chunks: u32, + + /// Signed threshold value (default: 1). + #[arg(long, default_value_t = 1)] + threshold: u32, + + /// 64-bit weight base address as decimal (default: 0). + #[arg(long, default_value_t = 0)] + weight_addr: u64, + + /// Maximum poll iterations before timeout (default: 16). 
+ #[arg(long, default_value_t = 16)] + max_polls: u32, + + /// Emit structured JSON instead of human-readable output. + #[arg(long)] + json: bool, + }, + + /// Run a multi-layer DMA-driven BitNet inference flow on MockMmio + /// (Wave 41, R-HS-3). + /// + /// Exercises the full configure -> DMA prefetch -> inference -> + /// DMA drain cycle per layer, using IrqDrivenDriver from W40. + /// Prints `OK layers=N completed=M writes=W reads=R`. + #[command(name = "host-inference")] + HostInference { + /// Number of layers to program (default: 2). + #[arg(long, default_value_t = 2)] + num_layers: u32, + + /// Neurons per layer (default: 16). + #[arg(long, default_value_t = 16)] + neurons: u32, + + /// Chunks per neuron (default: 4). + #[arg(long, default_value_t = 4)] + chunks: u32, + + /// Signed threshold value (default: 1). + #[arg(long, default_value_t = 1)] + threshold: u32, + + /// 64-bit weight base address as decimal (default: 0). + #[arg(long, default_value_t = 0)] + weight_addr: u64, + + /// Maximum IRQ-service rounds per stage (default: 16). + #[arg(long, default_value_t = 16)] + max_rounds: u32, + + /// Emit structured JSON instead of human-readable output. + #[arg(long)] + json: bool, + }, + + /// Estimate BitNet inference performance from engine configuration + /// (Wave 42, R-HS-4). + /// + /// Prints cycle counts, DMA beats, BRAM utilization, and throughput + /// estimates. No hardware required — pure arithmetic model. + #[command(name = "host-perf")] + HostPerf { + /// Number of layers (default: 2). + #[arg(long, default_value_t = 2)] + num_layers: u32, + + /// Neurons per layer (default: 16). + #[arg(long, default_value_t = 16)] + neurons: u32, + + /// Chunks per neuron (default: 4). + #[arg(long, default_value_t = 4)] + chunks: u32, + + /// Clock frequency in MHz for throughput estimate (default: 66.0, + /// matching STARTUPE2.CFGMCLK on Wukong V1). 
+ #[arg(long, default_value_t = 66.0)] + clock_mhz: f64, + + /// Emit structured JSON instead of human-readable output. + #[arg(long)] + json: bool, }, /// Emit a complete BitNet HLS bundle (Wave 38, R-SI-1). @@ -3019,6 +3127,7 @@ fn run_host_smoke( threshold: u32, weight_addr: u64, max_polls: u32, + json: bool, ) -> anyhow::Result<()> { use host::{BitnetDriver, MockMmio}; let mut driver = BitnetDriver::new(MockMmio::with_csrs_zeroed()); @@ -3027,8 +3136,6 @@ fn run_host_smoke( .map_err(|e| anyhow::anyhow!("configure failed: {:?}", e))?; driver.enable_irqs(host::csr_map::IRQ_ALL_MASK); driver.start(); - // Simulate hardware completing the inference immediately for the smoke - // test: latch `done` and the inference_done IRQ before polling. driver.mmio_mut().set_done(true); driver.mmio_mut().latch_irq(host::csr_map::IRQ_INFERENCE_DONE_MASK); driver @@ -3037,17 +3144,174 @@ fn run_host_smoke( let snap = driver.dump(); let w = driver.mmio().write_count(); let r = driver.mmio().read_count(); - println!( - "OK {}w/{}r layers={} neurons={} chunks={} threshold={} weight_addr=0x{:016x} irq_stat=0x{:08x}", - w, - r, - snap.num_layers, - snap.neurons, - snap.chunks, - snap.threshold, - snap.weight_addr_64(), - snap.irq_stat - ); + if json { + host::json_output::print_json(&host::json_output::HostSmokeJson { + ok: true, + writes: w, + reads: r, + layers: snap.num_layers, + neurons: snap.neurons, + chunks: snap.chunks, + threshold: snap.threshold, + weight_addr: format!("0x{:016x}", snap.weight_addr_64()), + irq_stat: format!("0x{:08x}", snap.irq_stat), + })?; + } else { + println!( + "OK {}w/{}r layers={} neurons={} chunks={} threshold={} weight_addr=0x{:016x} irq_stat=0x{:08x}", + w, r, snap.num_layers, snap.neurons, snap.chunks, snap.threshold, snap.weight_addr_64(), snap.irq_stat + ); + } + Ok(()) +} + +fn run_host_poll_vs_irq( + num_layers: u32, + neurons: u32, + chunks: u32, + threshold: u32, + weight_addr: u64, + max_polls: u32, + json: bool, +) -> anyhow::Result<()> { 
+ use host::{BitnetDriver, IrqDrivenDriver, MockMmio}; + let poll_writes; + let poll_reads; + let irq_stat_poll; + { + let mut driver = BitnetDriver::new(MockMmio::with_csrs_zeroed()); + driver + .configure(num_layers, neurons, chunks, threshold, weight_addr) + .map_err(|e| anyhow::anyhow!("poll configure failed: {:?}", e))?; + driver.enable_irqs(host::csr_map::IRQ_ALL_MASK); + driver.start(); + driver.mmio_mut().set_done(true); + driver.mmio_mut().latch_irq(host::csr_map::IRQ_INFERENCE_DONE_MASK); + driver + .wait_done(max_polls) + .map_err(|e| anyhow::anyhow!("poll wait_done failed: {:?}", e))?; + poll_writes = driver.mmio().write_count(); + poll_reads = driver.mmio().read_count(); + irq_stat_poll = driver.dump().irq_stat; + } + let irq_writes; + let irq_reads; + let irq_stat_irq; + { + let mut idd = IrqDrivenDriver::new(BitnetDriver::new(MockMmio::with_csrs_zeroed())); + idd.handler_mut() + .driver_mut() + .configure(num_layers, neurons, chunks, threshold, weight_addr) + .map_err(|e| anyhow::anyhow!("irq configure failed: {:?}", e))?; + idd.handler_mut().driver_mut().enable_irqs(host::csr_map::IRQ_ALL_MASK); + idd.handler_mut().driver_mut().start(); + idd.handler_mut().driver_mut().mmio_mut().set_done(true); + idd.handler_mut() + .driver_mut() + .mmio_mut() + .latch_irq(host::csr_map::IRQ_INFERENCE_DONE_MASK); + idd.wait_done_irq(max_polls) + .map_err(|e| anyhow::anyhow!("irq wait_done_irq failed: {:?}", e))?; + irq_writes = idd.handler().driver().mmio().write_count(); + irq_reads = idd.handler().driver().mmio().read_count(); + irq_stat_irq = idd.handler_mut().driver_mut().dump().irq_stat; + } + let writes_match = poll_writes == irq_writes; + if json { + host::json_output::print_json(&host::json_output::HostPollVsIrqJson { + ok: true, + poll_writes, + poll_reads, + irq_writes, + irq_reads, + writes_match, + irq_stat_poll: format!("0x{:08x}", irq_stat_poll), + irq_stat_irq: format!("0x{:08x}", irq_stat_irq), + })?; + } else { + println!( + "OK poll={}w/{}r 
irq={}w/{}r writes_match={} irq_stat_poll=0x{:08x} irq_stat_irq=0x{:08x}", + poll_writes, poll_reads, irq_writes, irq_reads, writes_match, irq_stat_poll, irq_stat_irq + ); + } + Ok(()) +} + +fn run_host_inference( + num_layers: u32, + neurons: u32, + chunks: u32, + threshold: u32, + weight_addr: u64, + max_rounds: u32, + json: bool, +) -> anyhow::Result<()> { + use host::{BitnetDriver, InferenceEngine, MockMmio}; + let mut engine = InferenceEngine::new(BitnetDriver::new(MockMmio::with_csrs_zeroed())); + engine + .configure(num_layers, neurons, chunks, threshold, weight_addr) + .map_err(|e| anyhow::anyhow!("configure failed: {:?}", e))?; + let report = engine + .run(max_rounds) + .map_err(|e| anyhow::anyhow!("inference failed: {:?}", e))?; + if json { + host::json_output::print_json(&host::json_output::HostInferenceJson { + ok: true, + total_layers: report.total_layers, + layers_completed: report.layers_completed, + error_layer: report.error_layer, + total_writes: report.total_writes, + total_reads: report.total_reads, + })?; + } else { + println!( + "OK layers={} completed={} writes={} reads={}", + report.total_layers, report.layers_completed, report.total_writes, report.total_reads + ); + } + Ok(()) +} + +fn run_host_perf( + num_layers: u32, + neurons: u32, + chunks: u32, + clock_mhz: f64, + json: bool, +) -> anyhow::Result<()> { + use host::perf::EngineConfig; + let cfg = EngineConfig::new(num_layers, neurons, chunks) + .ok_or_else(|| anyhow::anyhow!("invalid config: layers, neurons, and chunks must be > 0"))?; + let est = cfg.estimate(); + let throughput = cfg.throughput_inf_per_sec(clock_mhz); + if json { + host::json_output::print_json(&host::json_output::HostPerfJson { + ok: true, + layers: est.config.num_layers, + neurons: est.config.neurons, + chunks: est.config.chunks, + total_cycles: est.total_inference_cycles, + total_weight_words: est.total_weight_words, + total_weight_bytes: est.total_weight_bytes, + bram_utilization_pct: est.bram_utilization_pct, + 
total_dma_beats: est.total_dma_beats, + throughput_inf_per_sec: throughput, + clock_mhz, + })?; + } else { + println!( + "OK layers={} neurons={} chunks={} total_cycles={} total_weight_words={} bram_pct={:.1}% dma_beats={} throughput={:.1} inf/s @ {:.1} MHz", + est.config.num_layers, + est.config.neurons, + est.config.chunks, + est.total_inference_cycles, + est.total_weight_words, + est.bram_utilization_pct, + est.total_dma_beats, + throughput, + clock_mhz, + ); + } Ok(()) } @@ -7856,8 +8120,17 @@ async fn main() -> anyhow::Result<()> { Commands::GenBitnetBundle { top_name, axi_addr_width, axi_data_width, output_dir } => { run_gen_bitnet_bundle(&top_name, axi_addr_width, axi_data_width, &output_dir)? } - Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls } => { - run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls)? + Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => { + run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)? + } + Commands::HostPollVsIrq { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => { + run_host_poll_vs_irq(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)? + } + Commands::HostInference { num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json } => { + run_host_inference(num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json)? + } + Commands::HostPerf { num_layers, neurons, chunks, clock_mhz, json } => { + run_host_perf(num_layers, neurons, chunks, clock_mhz, json)? 
} Commands::Asm { input, output, format } => run_asm(&input, output.as_deref(), &format)?, Commands::GenTestbench { input, period_ns, max_cycles, output } => { @@ -8094,8 +8367,17 @@ fn main() -> anyhow::Result<()> { Commands::GenBitnetBundle { top_name, axi_addr_width, axi_data_width, output_dir } => { run_gen_bitnet_bundle(&top_name, axi_addr_width, axi_data_width, &output_dir)? } - Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls } => { - run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls)? + Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => { + run_host_smoke(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)? + } + Commands::HostPollVsIrq { num_layers, neurons, chunks, threshold, weight_addr, max_polls, json } => { + run_host_poll_vs_irq(num_layers, neurons, chunks, threshold, weight_addr, max_polls, json)? + } + Commands::HostInference { num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json } => { + run_host_inference(num_layers, neurons, chunks, threshold, weight_addr, max_rounds, json)? + } + Commands::HostPerf { num_layers, neurons, chunks, clock_mhz, json } => { + run_host_perf(num_layers, neurons, chunks, clock_mhz, json)? 
} Commands::Asm { input, output, format } => run_asm(&input, output.as_deref(), &format)?, Commands::GenTestbench { input, period_ns, max_cycles, output } => { diff --git a/bootstrap/tests/host_engine.rs b/bootstrap/tests/host_engine.rs new file mode 100644 index 00000000..47547aa5 --- /dev/null +++ b/bootstrap/tests/host_engine.rs @@ -0,0 +1,162 @@ +use std::process::Command; + +fn bin() -> &'static str { + env!("CARGO_BIN_EXE_t27c") +} + +fn run(args: &[&str]) -> (bool, String, String) { + let out = Command::new(bin()) + .args(args) + .output() + .expect("failed to spawn t27c"); + ( + out.status.success(), + String::from_utf8_lossy(&out.stdout).to_string(), + String::from_utf8_lossy(&out.stderr).to_string(), + ) +} + +#[test] +fn inference_default_succeeds() { + let (ok, stdout, _) = run(&["host-inference"]); + assert!(ok, "default should succeed"); + assert!(stdout.starts_with("OK "), "stdout = {stdout}"); +} + +#[test] +fn inference_prints_layer_count() { + let (ok, stdout, _) = run(&["host-inference"]); + assert!(ok); + assert!(stdout.contains("layers=2"), "stdout = {stdout}"); +} + +#[test] +fn inference_prints_completed_count() { + let (ok, stdout, _) = run(&["host-inference"]); + assert!(ok); + assert!(stdout.contains("completed=2"), "stdout = {stdout}"); +} + +#[test] +fn inference_prints_writes() { + let (ok, stdout, _) = run(&["host-inference"]); + assert!(ok); + assert!(stdout.contains("writes="), "stdout = {stdout}"); +} + +#[test] +fn inference_prints_reads() { + let (ok, stdout, _) = run(&["host-inference"]); + assert!(ok); + assert!(stdout.contains("reads="), "stdout = {stdout}"); +} + +#[test] +fn inference_single_layer() { + let (ok, stdout, _) = run(&["host-inference", "--num-layers", "1"]); + assert!(ok); + assert!(stdout.contains("layers=1 completed=1"), "stdout = {stdout}"); +} + +#[test] +fn inference_three_layers() { + let (ok, stdout, _) = run(&["host-inference", "--num-layers", "3"]); + assert!(ok); + assert!(stdout.contains("layers=3 
completed=3"), "stdout = {stdout}"); +} + +#[test] +fn inference_custom_neurons() { + let (ok, stdout, _) = run(&["host-inference", "--neurons", "128"]); + assert!(ok); +} + +#[test] +fn inference_custom_chunks() { + let (ok, stdout, _) = run(&["host-inference", "--chunks", "32"]); + assert!(ok); +} + +#[test] +fn inference_custom_threshold() { + let (ok, stdout, _) = run(&["host-inference", "--threshold", "42"]); + assert!(ok); +} + +#[test] +fn inference_custom_weight_addr() { + let (ok, stdout, _) = run(&["host-inference", "--weight-addr", "1099511627776"]); + assert!(ok); +} + +#[test] +fn inference_zero_layers_fails() { + let (ok, _, stderr) = run(&["host-inference", "--num-layers", "0"]); + assert!(!ok); + assert!(stderr.contains("configure") || stderr.contains("InvalidConfig")); +} + +#[test] +fn inference_zero_neurons_fails() { + let (ok, _, _stderr) = run(&["host-inference", "--neurons", "0"]); + assert!(!ok); +} + +#[test] +fn inference_zero_chunks_fails() { + let (ok, _, _stderr) = run(&["host-inference", "--chunks", "0"]); + assert!(!ok); +} + +#[test] +fn inference_deterministic() { + let (ok1, s1, _) = run(&["host-inference"]); + let (ok2, s2, _) = run(&["host-inference"]); + assert!(ok1 && ok2); + assert_eq!(s1, s2, "should be deterministic"); +} + +#[test] +fn inference_single_line_output() { + let (ok, stdout, _) = run(&["host-inference"]); + assert!(ok); + let trimmed = stdout.trim_end_matches('\n'); + assert!(!trimmed.contains('\n'), "expected single line, got {stdout}"); +} + +#[test] +fn inference_no_stderr_on_success() { + let (ok, _, stderr) = run(&["host-inference"]); + assert!(ok); + assert!(stderr.trim().is_empty(), "stderr should be empty: {stderr}"); +} + +#[test] +fn inference_combined_overrides() { + let (ok, stdout, _) = run(&[ + "host-inference", + "--num-layers", "3", + "--neurons", "64", + "--chunks", "8", + "--threshold", "42", + "--weight-addr", "1024", + ]); + assert!(ok); + assert!(stdout.contains("layers=3 completed=3")); +} + 
+#[test] +fn inference_help_lists_all_flags() { + let (ok, stdout, _) = run(&["host-inference", "--help"]); + assert!(ok); + for flag in ["--num-layers", "--neurons", "--chunks", "--threshold", "--weight-addr", "--max-rounds"] { + assert!(stdout.contains(flag), "missing {flag} in help: {stdout}"); + } +} + +#[test] +fn inference_help_mentions_wave_41() { + let (ok, stdout, _) = run(&["host-inference", "--help"]); + assert!(ok); + assert!(stdout.contains("Wave 41") || stdout.contains("R-HS-3"), "expected Wave 41 / R-HS-3: {stdout}"); +} diff --git a/bootstrap/tests/host_irq.rs b/bootstrap/tests/host_irq.rs new file mode 100644 index 00000000..7f2612c3 --- /dev/null +++ b/bootstrap/tests/host_irq.rs @@ -0,0 +1,169 @@ +use std::process::Command; + +fn bin() -> &'static str { + env!("CARGO_BIN_EXE_t27c") +} + +fn run(args: &[&str]) -> (bool, String, String) { + let out = Command::new(bin()) + .args(args) + .output() + .expect("failed to spawn t27c"); + ( + out.status.success(), + String::from_utf8_lossy(&out.stdout).to_string(), + String::from_utf8_lossy(&out.stderr).to_string(), + ) +} + +#[test] +fn poll_vs_irq_default_succeeds() { + let (ok, stdout, _) = run(&["host-poll-vs-irq"]); + assert!(ok, "default should succeed"); + assert!(stdout.starts_with("OK "), "stdout = {stdout}"); +} + +#[test] +fn poll_vs_irq_prints_poll_metrics() { + let (ok, stdout, _) = run(&["host-poll-vs-irq"]); + assert!(ok); + assert!(stdout.contains("poll="), "stdout = {stdout}"); +} + +#[test] +fn poll_vs_irq_prints_irq_metrics() { + let (ok, stdout, _) = run(&["host-poll-vs-irq"]); + assert!(ok); + assert!(stdout.contains("irq="), "stdout = {stdout}"); +} + +#[test] +fn poll_vs_irq_writes_match_field_present() { + let (ok, stdout, _) = run(&["host-poll-vs-irq"]); + assert!(ok); + assert!(stdout.contains("writes_match="), "stdout = {stdout}"); +} + +#[test] +fn poll_vs_irq_prints_irq_stat_poll() { + let (ok, stdout, _) = run(&["host-poll-vs-irq"]); + assert!(ok); + 
assert!(stdout.contains("irq_stat_poll=0x"), "stdout = {stdout}"); +} + +#[test] +fn poll_vs_irq_prints_irq_stat_irq() { + let (ok, stdout, _) = run(&["host-poll-vs-irq"]); + assert!(ok); + assert!(stdout.contains("irq_stat_irq=0x"), "stdout = {stdout}"); +} + +#[test] +fn poll_vs_irq_custom_layers() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--num-layers", "5"]); + assert!(ok); + assert!(stdout.starts_with("OK ")); +} + +#[test] +fn poll_vs_irq_custom_neurons() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--neurons", "128"]); + assert!(ok); +} + +#[test] +fn poll_vs_irq_custom_chunks() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--chunks", "32"]); + assert!(ok); +} + +#[test] +fn poll_vs_irq_custom_threshold() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--threshold", "7"]); + assert!(ok); +} + +#[test] +fn poll_vs_irq_custom_weight_addr() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--weight-addr", "1099511627776"]); + assert!(ok); +} + +#[test] +fn poll_vs_irq_zero_layers_fails() { + let (ok, _, stderr) = run(&["host-poll-vs-irq", "--num-layers", "0"]); + assert!(!ok); + assert!(stderr.contains("configure") || stderr.contains("InvalidConfig")); +} + +#[test] +fn poll_vs_irq_zero_neurons_fails() { + let (ok, _, _stderr) = run(&["host-poll-vs-irq", "--neurons", "0"]); + assert!(!ok); +} + +#[test] +fn poll_vs_irq_zero_chunks_fails() { + let (ok, _, _stderr) = run(&["host-poll-vs-irq", "--chunks", "0"]); + assert!(!ok); +} + +#[test] +fn poll_vs_irq_max_polls_one_succeeds() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--max-polls", "1"]); + assert!(ok, "done is preset, 1 poll is enough"); + assert!(stdout.starts_with("OK ")); +} + +#[test] +fn poll_vs_irq_deterministic() { + let (ok1, s1, _) = run(&["host-poll-vs-irq"]); + let (ok2, s2, _) = run(&["host-poll-vs-irq"]); + assert!(ok1 && ok2); + assert_eq!(s1, s2, "should be deterministic"); +} + +#[test] +fn poll_vs_irq_single_line_output() { + let (ok, stdout, _) = 
run(&["host-poll-vs-irq"]); + assert!(ok); + let trimmed = stdout.trim_end_matches('\n'); + assert!(!trimmed.contains('\n'), "expected single line, got {stdout}"); +} + +#[test] +fn poll_vs_irq_no_stderr_on_success() { + let (ok, _, stderr) = run(&["host-poll-vs-irq"]); + assert!(ok); + assert!(stderr.trim().is_empty(), "stderr should be empty: {stderr}"); +} + +#[test] +fn poll_vs_irq_combined_overrides() { + let (ok, stdout, _) = run(&[ + "host-poll-vs-irq", + "--num-layers", "3", + "--neurons", "64", + "--chunks", "8", + "--threshold", "42", + "--weight-addr", "1024", + ]); + assert!(ok); + assert!(stdout.contains("writes_match=")); +} + +#[test] +fn poll_vs_irq_help_lists_all_flags() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--help"]); + assert!(ok); + for flag in ["--num-layers", "--neurons", "--chunks", "--threshold", "--weight-addr", "--max-polls"] { + assert!(stdout.contains(flag), "missing {flag} in help: {stdout}"); + } +} + +#[test] +fn poll_vs_irq_help_mentions_wave_40() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--help"]); + assert!(ok); + assert!(stdout.contains("Wave 40") || stdout.contains("R-HS-2"), "expected Wave 40 / R-HS-2: {stdout}"); +} diff --git a/bootstrap/tests/host_json.rs b/bootstrap/tests/host_json.rs new file mode 100644 index 00000000..1f4179a6 --- /dev/null +++ b/bootstrap/tests/host_json.rs @@ -0,0 +1,235 @@ +use std::process::Command; + +fn bin() -> &'static str { + env!("CARGO_BIN_EXE_t27c") +} + +fn run(args: &[&str]) -> (bool, String, String) { + let out = Command::new(bin()) + .args(args) + .output() + .expect("failed to spawn t27c"); + ( + out.status.success(), + String::from_utf8_lossy(&out.stdout).to_string(), + String::from_utf8_lossy(&out.stderr).to_string(), + ) +} + +fn parse_json(s: &str) -> serde_json::Value { + let trimmed = s.trim(); + serde_json::from_str(trimmed).unwrap_or_else(|e| panic!("invalid JSON: {e}\ninput: {trimmed}")) +} + +// -- host-smoke --json -- + +#[test] +fn 
smoke_json_is_valid() { + let (ok, stdout, _) = run(&["host-smoke", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["ok"], true); +} + +#[test] +fn smoke_json_has_writes_reads() { + let (ok, stdout, _) = run(&["host-smoke", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["writes"].is_number()); + assert!(v["reads"].is_number()); +} + +#[test] +fn smoke_json_has_config_fields() { + let (ok, stdout, _) = run(&["host-smoke", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["layers"], 2); + assert_eq!(v["neurons"], 16); + assert_eq!(v["chunks"], 4); + assert_eq!(v["threshold"], 1); +} + +#[test] +fn smoke_json_weight_addr_is_string() { + let (ok, stdout, _) = run(&["host-smoke", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["weight_addr"].is_string()); + assert!(v["weight_addr"].as_str().unwrap().starts_with("0x")); +} + +#[test] +fn smoke_json_irq_stat_is_string() { + let (ok, stdout, _) = run(&["host-smoke", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["irq_stat"].is_string()); +} + +// -- host-poll-vs-irq --json -- + +#[test] +fn poll_vs_irq_json_is_valid() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["ok"], true); +} + +#[test] +fn poll_vs_irq_json_has_poll_and_irq_counts() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["poll_writes"].is_number()); + assert!(v["poll_reads"].is_number()); + assert!(v["irq_writes"].is_number()); + assert!(v["irq_reads"].is_number()); +} + +#[test] +fn poll_vs_irq_json_writes_match_is_bool() { + let (ok, stdout, _) = run(&["host-poll-vs-irq", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["writes_match"].is_boolean()); +} + +// -- host-inference --json -- + +#[test] +fn inference_json_is_valid() { + let (ok, stdout, _) = 
run(&["host-inference", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["ok"], true); +} + +#[test] +fn inference_json_has_layers_completed() { + let (ok, stdout, _) = run(&["host-inference", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["total_layers"], 2); + assert_eq!(v["layers_completed"], 2); +} + +#[test] +fn inference_json_has_writes_reads() { + let (ok, stdout, _) = run(&["host-inference", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["total_writes"].is_number()); + assert!(v["total_reads"].is_number()); +} + +#[test] +fn inference_json_error_layer_is_null_on_success() { + let (ok, stdout, _) = run(&["host-inference", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["error_layer"].is_null()); +} + +#[test] +fn inference_json_single_layer() { + let (ok, stdout, _) = run(&["host-inference", "--json", "--num-layers", "1"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["total_layers"], 1); + assert_eq!(v["layers_completed"], 1); +} + +// -- host-perf --json -- + +#[test] +fn perf_json_is_valid() { + let (ok, stdout, _) = run(&["host-perf", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["ok"], true); +} + +#[test] +fn perf_json_has_config() { + let (ok, stdout, _) = run(&["host-perf", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["layers"], 2); + assert_eq!(v["neurons"], 16); + assert_eq!(v["chunks"], 4); +} + +#[test] +fn perf_json_has_cycles_and_dma() { + let (ok, stdout, _) = run(&["host-perf", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["total_cycles"].is_number()); + assert!(v["total_dma_beats"].is_number()); + assert!(v["total_weight_words"].is_number()); +} + +#[test] +fn perf_json_has_bram_pct() { + let (ok, stdout, _) = run(&["host-perf", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["bram_utilization_pct"].is_number()); +} + +#[test] +fn 
perf_json_has_throughput() { + let (ok, stdout, _) = run(&["host-perf", "--json"]); + assert!(ok); + let v = parse_json(&stdout); + assert!(v["throughput_inf_per_sec"].is_number()); + assert!(v["clock_mhz"].is_number()); +} + +#[test] +fn perf_json_custom_clock() { + let (ok, stdout, _) = run(&["host-perf", "--json", "--clock-mhz", "100.0"]); + assert!(ok); + let v = parse_json(&stdout); + assert_eq!(v["clock_mhz"], 100.0); +} + +// -- cross-command: without --json, output is NOT JSON -- + +#[test] +fn smoke_without_json_is_not_json_object() { + let (ok, stdout, _) = run(&["host-smoke"]); + assert!(ok); + let trimmed = stdout.trim(); + assert!(trimmed.starts_with("OK "), "expected human-readable: {trimmed}"); + assert!(!trimmed.starts_with("{"), "should not be JSON: {trimmed}"); +} + +#[test] +fn perf_without_json_is_not_json_object() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + let trimmed = stdout.trim(); + assert!(trimmed.starts_with("OK "), "expected human-readable: {trimmed}"); +} + +// -- determinism -- + +#[test] +fn smoke_json_deterministic() { + let (ok1, s1, _) = run(&["host-smoke", "--json"]); + let (ok2, s2, _) = run(&["host-smoke", "--json"]); + assert!(ok1 && ok2); + assert_eq!(s1, s2); +} + +#[test] +fn perf_json_deterministic() { + let (ok1, s1, _) = run(&["host-perf", "--json"]); + let (ok2, s2, _) = run(&["host-perf", "--json"]); + assert!(ok1 && ok2); + assert_eq!(s1, s2); +} diff --git a/bootstrap/tests/host_perf.rs b/bootstrap/tests/host_perf.rs new file mode 100644 index 00000000..002d81f0 --- /dev/null +++ b/bootstrap/tests/host_perf.rs @@ -0,0 +1,189 @@ +use std::process::Command; + +fn bin() -> &'static str { + env!("CARGO_BIN_EXE_t27c") +} + +fn run(args: &[&str]) -> (bool, String, String) { + let out = Command::new(bin()) + .args(args) + .output() + .expect("failed to spawn t27c"); + ( + out.status.success(), + String::from_utf8_lossy(&out.stdout).to_string(), + String::from_utf8_lossy(&out.stderr).to_string(), + ) +} + 
+#[test] +fn perf_default_succeeds() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok, "default should succeed"); + assert!(stdout.starts_with("OK "), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_layer_count() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("layers=2"), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_neuron_count() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("neurons=16"), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_chunk_count() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("chunks=4"), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_total_cycles() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("total_cycles="), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_weight_words() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("total_weight_words="), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_bram_pct() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("bram_pct="), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_dma_beats() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("dma_beats="), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_throughput() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("throughput="), "stdout = {stdout}"); + assert!(stdout.contains("inf/s"), "stdout = {stdout}"); +} + +#[test] +fn perf_prints_clock_freq() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + assert!(stdout.contains("MHz"), "stdout = {stdout}"); +} + +#[test] +fn perf_custom_layers() { + let (ok, stdout, _) = run(&["host-perf", "--num-layers", "5"]); + assert!(ok); + assert!(stdout.contains("layers=5"), "stdout = {stdout}"); +} + +#[test] +fn perf_custom_neurons() { + let (ok, 
stdout, _) = run(&["host-perf", "--neurons", "128"]); + assert!(ok); + assert!(stdout.contains("neurons=128"), "stdout = {stdout}"); +} + +#[test] +fn perf_custom_chunks() { + let (ok, stdout, _) = run(&["host-perf", "--chunks", "32"]); + assert!(ok); + assert!(stdout.contains("chunks=32"), "stdout = {stdout}"); +} + +#[test] +fn perf_custom_clock() { + let (ok, stdout, _) = run(&["host-perf", "--clock-mhz", "100.0"]); + assert!(ok); + assert!(stdout.contains("@ 100.0 MHz"), "stdout = {stdout}"); +} + +#[test] +fn perf_zero_layers_fails() { + let (ok, _, stderr) = run(&["host-perf", "--num-layers", "0"]); + assert!(!ok); + assert!(stderr.contains("invalid config"), "stderr = {stderr}"); +} + +#[test] +fn perf_zero_neurons_fails() { + let (ok, _, _stderr) = run(&["host-perf", "--neurons", "0"]); + assert!(!ok); +} + +#[test] +fn perf_zero_chunks_fails() { + let (ok, _, _stderr) = run(&["host-perf", "--chunks", "0"]); + assert!(!ok); +} + +#[test] +fn perf_deterministic() { + let (ok1, s1, _) = run(&["host-perf"]); + let (ok2, s2, _) = run(&["host-perf"]); + assert!(ok1 && ok2); + assert_eq!(s1, s2, "should be deterministic"); +} + +#[test] +fn perf_single_line_output() { + let (ok, stdout, _) = run(&["host-perf"]); + assert!(ok); + let trimmed = stdout.trim_end_matches('\n'); + assert!(!trimmed.contains('\n'), "expected single line, got {stdout}"); +} + +#[test] +fn perf_no_stderr_on_success() { + let (ok, _, stderr) = run(&["host-perf"]); + assert!(ok); + assert!(stderr.trim().is_empty(), "stderr should be empty: {stderr}"); +} + +#[test] +fn perf_help_lists_all_flags() { + let (ok, stdout, _) = run(&["host-perf", "--help"]); + assert!(ok); + for flag in ["--num-layers", "--neurons", "--chunks", "--clock-mhz"] { + assert!(stdout.contains(flag), "missing {flag} in help: {stdout}"); + } +} + +#[test] +fn perf_help_mentions_wave_42() { + let (ok, stdout, _) = run(&["host-perf", "--help"]); + assert!(ok); + assert!(stdout.contains("Wave 42") || 
stdout.contains("R-HS-4"), "expected Wave 42 / R-HS-4: {stdout}"); +} + +#[test] +fn perf_cycles_increase_with_layers() { + let (ok1, s1, _) = run(&["host-perf", "--num-layers", "1"]); + let (ok2, s2, _) = run(&["host-perf", "--num-layers", "4"]); + assert!(ok1 && ok2); + let c1: u64 = extract_total_cycles(&s1); + let c2: u64 = extract_total_cycles(&s2); + assert!(c2 > c1, "4-layer cycles ({c2}) should exceed 1-layer ({c1})"); +} + +fn extract_total_cycles(s: &str) -> u64 { + let part = s.split("total_cycles=").nth(1).unwrap_or(""); + part.split_whitespace().next().unwrap_or("0").parse().unwrap_or(0) +} diff --git a/docs/NOW.md b/docs/NOW.md index 14aae767..52f7ccbb 100644 --- a/docs/NOW.md +++ b/docs/NOW.md @@ -2,6 +2,30 @@ Last updated: 2026-05-23 +## wave-43 -- t27c --json flag for host CLI commands (R-HS-5, Closes #795) + +- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/json_output.rs` (`HostSmokeJson`, `HostPollVsIrqJson`, `HostInferenceJson`, `HostPerfJson` structs, `print_json` helper); updated `bootstrap/src/host/mod.rs`; added `--json` flag to all 4 host commands in `main.rs`; new test file `bootstrap/tests/host_json.rs` (23 integration tests). +- **Why** (R-HS-5): host commands emit human-readable single-line output by default. `--json` enables structured JSON for CI pipelines, trios-bridge, and downstream tooling. Additive — default output unchanged. +- **Tests**: 23 new integration tests. All pass. Zero regressions. + +## wave-42 -- t27c host-perf -- performance model and cycle estimator (R-HS-4, Closes #791) + +- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/perf.rs` (`EngineConfig`, `PerformanceEstimate`, `LayerEstimate`; cycle/DMA/BRAM/throughput estimation; 19 inline unit tests); updated `bootstrap/src/host/mod.rs`; new CLI `Commands::HostPerf` + `run_host_perf()`; new test file `bootstrap/tests/host_perf.rs` (23 integration tests). +- **Why** (R-HS-4): W41 gave us the full inference engine. 
W42 adds the analytical performance model: given an engine config (layers, neurons, chunks), estimate per-layer DMA beats, compute cycles, BRAM utilization, total inference cycles, and throughput at a given clock frequency. Pure arithmetic, no hardware dependency. Essential for FPGA bring-up (compare estimated vs actual CYCLES counter from W36f). +- **Tests**: 42 new (19 inline + 23 integration). All pass. Zero regressions. + +## wave-41 -- t27c host-inference -- DMA-driven multi-layer BitNet inference flow (R-HS-3, Closes #789) + +- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/engine.rs` (`InferenceEngine`, `InferenceReport`, per-layer DMA prefetch → inference → DMA drain cycle; 16 inline unit tests); updated `bootstrap/src/host/irq.rs` (`wait_irq_mask` generic IRQ wait, refactored from `wait_done_irq`); updated `bootstrap/src/host/mod.rs`; new CLI `Commands::HostInference` + `run_host_inference()`; new test file `bootstrap/tests/host_engine.rs` (20 integration tests). +- **Why** (R-HS-3): W40 gave us IRQ-driven single-completion. W41 orchestrates the full multi-layer BitNet inference flow: for each layer, DMA prefetch weights (wait DmaDone IRQ), start inference (wait InferenceDone IRQ), DMA drain output (wait DmaDone IRQ). Uses `wait_irq_mask()` — a new generic IRQ-wait method on `IrqDrivenDriver` that waits for any mask, not just inference-done. +- **Tests**: 36 new (16 inline + 20 integration). All pass. Zero regressions. + +## wave-40 -- t27c host-poll-vs-irq -- IRQ-handler harness + poll-vs-IRQ comparison (R-HS-2, Closes #786) + +- **WHERE** (bootstrap-only, additive): new file `bootstrap/src/host/irq.rs` (`IrqSource` enum, `IrqHandler` callback registry, `IrqDrivenDriver` with `wait_done_irq`; 11 inline unit tests); updated `bootstrap/src/host/mod.rs`; new CLI `Commands::HostPollVsIrq` + `run_host_poll_vs_irq()`; new test file `bootstrap/tests/host_irq.rs` (21 integration tests). +- **Why** (R-HS-2): W39 added a poll-mode driver.
W40 adds an interrupt-driven completion path with callback dispatch and a side-by-side poll-vs-IRQ comparison on `MockMmio`. +- **Tests**: 32 new (11 inline + 21 integration). All pass. + ## wave-39 -- t27c host-side Rust driver module: BitNet AXI-Lite CSR aperture (R-HS-1, Closes #784) - **WHERE** (bootstrap-only, additive): new directory `bootstrap/src/host/` with four files -- `mod.rs` (re-exports), `csr_map.rs` (10 CSR offset constants + status/IRQ bit masks + 10 inline unit tests), `mmio.rs` (`Mmio` trait + `MockMmio` deterministic BTreeMap backend + transaction log + 10 inline unit tests), `driver.rs` (`BitnetDriver` orchestrator with configure / start / poll / IRQ / dump methods + `CsrSnapshot` struct + `DriverError` enum + 11 inline unit tests). One new `mod host;` declaration in `bootstrap/src/main.rs`. One new CLI subcommand `Commands::HostSmoke { num_layers, neurons, chunks, threshold, weight_addr, max_polls }` registered in the `Commands` enum and dispatched in both HTTP-server and CLI match arms via `run_host_smoke(...)`. **Zero** edits under `gen/`, `coq/`, `trios-coq/`, `proofs/`, `specs/`, `conformance/`, `architecture/`, `rings/`, root `Cargo.toml`. Doc-only update to this file. New test file `bootstrap/tests/host_driver.rs` (25 integration tests via `CARGO_BIN_EXE_t27c`).