diff --git a/Cargo.lock b/Cargo.lock index f846386fd..257864205 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -596,7 +596,7 @@ dependencies = [ "objc2-foundation 0.3.1", "parking_lot 0.12.3", "percent-encoding", - "windows-sys 0.59.0", + "windows-sys 0.52.0", "wl-clipboard-rs", "x11rb", ] @@ -1759,7 +1759,7 @@ dependencies = [ "bitflags 2.9.1", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.11.0", "lazy_static", "lazycell", "log", @@ -2669,7 +2669,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" dependencies = [ "lazy_static", - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] [[package]] @@ -2678,7 +2678,7 @@ version = "3.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fde0e0ec90c9dfb3b4b1a0891a7dcd0e2bffde2f7efed5fe7c9bb00e5bfb915e" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] [[package]] @@ -4092,7 +4092,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -7166,7 +7166,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -7447,8 +7447,9 @@ dependencies = [ [[package]] name = "knf-rs" -version = "0.3.0-beta.0" -source = "git+https://github.com/thewh1teagle/pyannote-rs?rev=e3abad6#e3abad6ba8bd505bb6dbd7aca16ff239fde85e21" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "342e12f10c01b856e5f9f8fc2bb7fbc6aa85c84eaa3d4636ad6786fcb3c8b9f8" dependencies = [ "eyre", "knf-rs-sys", @@ -7457,8 +7458,9 @@ dependencies = [ [[package]] name = "knf-rs-sys" -version = "0.3.0-beta.0" -source = "git+https://github.com/thewh1teagle/pyannote-rs?rev=e3abad6#e3abad6ba8bd505bb6dbd7aca16ff239fde85e21" +version 
= "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796636ff383658023a3cd4dca321be53966afdf155041807895d573fb476d831" dependencies = [ "bindgen 0.69.5", "cmake", @@ -7619,7 +7621,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a793df0d7afeac54f95b471d3af7f0d4fb975699f972341a4b76988d49cdf0c" dependencies = [ "cfg-if", - "windows-targets 0.53.0", + "windows-targets 0.48.5", ] [[package]] @@ -10284,7 +10286,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.11.0", "proc-macro2", "quote", "syn 2.0.101", @@ -10448,7 +10450,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -11319,7 +11321,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -11332,7 +11334,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.9.4", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -13746,17 +13748,21 @@ dependencies = [ name = "tauri-plugin-local-stt" version = "0.1.0" dependencies = [ + "audio-utils", "axum 0.8.4", "bytes", "chunker", "data", "db-user", + "dirs 6.0.0", "file", "futures-util", + "hound", "kalosm-common", "kalosm-sound", "language", "listener-interface", + "pyannote", "reqwest 0.12.15", "rodio", "serde", @@ -14313,7 +14319,7 @@ dependencies = [ "getrandom 0.3.3", "once_cell", "rustix 1.0.7", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -16163,7 +16169,7 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.48.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 404b0d374..051e0d343 100644 
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -50,6 +50,7 @@ hypr-notification2 = { path = "crates/notification2", package = "notification2"
 hypr-notion = { path = "crates/notion", package = "notion" }
 hypr-onnx = { path = "crates/onnx", package = "onnx" }
 hypr-openai = { path = "crates/openai", package = "openai" }
+hypr-pyannote = { path = "crates/pyannote", package = "pyannote" }
 hypr-s3 = { path = "crates/s3", package = "s3" }
 hypr-slack = { path = "crates/slack", package = "slack" }
 hypr-stt = { path = "crates/stt", package = "stt", features = ["realtime", "recorded"] }
diff --git a/crates/audio-utils/src/lib.rs b/crates/audio-utils/src/lib.rs
index 697b089f1..551c5727b 100644
--- a/crates/audio-utils/src/lib.rs
+++ b/crates/audio-utils/src/lib.rs
@@ -26,3 +26,23 @@ pub trait AudioFormatExt: AsyncSource {
         })
     }
 }
+
+/// Converts 16-bit PCM samples to `f32` by dividing each sample by 32768.
+pub fn i16_to_f32_samples(samples: &[i16]) -> Vec<f32> {
+    samples
+        .iter()
+        .map(|&sample| f32::from(sample) / 32768.0)
+        .collect()
+}
+
+/// Converts `f32` samples to 16-bit PCM by scaling by 32768 and clamping to the `i16` range.
+pub fn f32_to_i16_samples(samples: &[f32]) -> Vec<i16> {
+    samples
+        .iter()
+        .map(|&sample| {
+            // Clamp to bounds `i16` can represent rather than relying on the saturating cast.
+            let scaled = (sample * 32768.0).clamp(f32::from(i16::MIN), f32::from(i16::MAX));
+            scaled as i16
+        })
+        .collect()
+}
diff --git a/crates/pyannote/Cargo.toml b/crates/pyannote/Cargo.toml
index 73e2b39b3..ca34008e5 100644
--- a/crates/pyannote/Cargo.toml
+++ b/crates/pyannote/Cargo.toml
@@ -4,7 +4,7 @@ version = "0.1.0"
 edition = "2021"
 
 [features]
-default = ["local"]
+default = []
 cloud = ["reqwest", "url"]
 local = ["hypr-onnx", "knf-rs"]
 knf-rs = ["dep:knf-rs"]
@@ -14,7 +14,7 @@ reqwest = { workspace = true, features = ["json"], optional = true }
 url = { workspace = true, optional = true }
 
 hypr-onnx = { workspace = true, optional = true }
-knf-rs = { git = "https://github.com/thewh1teagle/pyannote-rs", rev = "e3abad6", package = "knf-rs", optional = true }
+knf-rs = { version = "0.2.9", optional = true }
 
 anyhow = { workspace = true }
 thiserror = { workspace = true }
diff --git a/plugins/local-stt/Cargo.toml b/plugins/local-stt/Cargo.toml
index
8d2105af6..1a751be3a 100644
--- a/plugins/local-stt/Cargo.toml
+++ b/plugins/local-stt/Cargo.toml
@@ -24,10 +24,12 @@ specta-typescript = { workspace = true }
 tracing = { workspace = true }
 
 [dependencies]
+hypr-audio-utils = { workspace = true }
 hypr-chunker = { workspace = true }
 hypr-db-user = { workspace = true }
 hypr-file = { workspace = true }
 hypr-listener-interface = { workspace = true }
+hypr-pyannote = { workspace = true, features = [] }
 hypr-whisper = { workspace = true, features = ["local"] }
 hypr-ws-utils = { workspace = true }
 
@@ -36,6 +38,8 @@ tauri-plugin-store = { workspace = true }
 tauri-plugin-store2 = { workspace = true }
 tauri-specta = { workspace = true, features = ["derive", "typescript"] }
 
+dirs = { workspace = true }
+hound = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 specta = { workspace = true }
diff --git a/plugins/local-stt/src/ext.rs b/plugins/local-stt/src/ext.rs
index 9d12697ff..e65749a0e 100644
--- a/plugins/local-stt/src/ext.rs
+++ b/plugins/local-stt/src/ext.rs
@@ -4,6 +4,7 @@ use tauri::{ipc::Channel, Manager, Runtime};
 use tauri_plugin_store2::StorePluginExt;
 
 use hypr_file::{download_file_with_callback, DownloadProgress};
+use hypr_listener_interface::Word;
 
 pub trait LocalSttPluginExt {
     fn local_stt_store(&self) -> tauri_plugin_store2::ScopedStore;
@@ -14,6 +15,16 @@ pub trait LocalSttPluginExt {
     fn get_current_model(&self) -> Result;
     fn set_current_model(&self, model: crate::SupportedModel) -> Result<(), crate::Error>;
 
+    /// Reads the WAV file at `audio_path` and builds a Whisper model from `model_path`.
+    ///
+    /// Segmentation and transcription are disabled for now (see the TODO in the
+    /// implementation), so the returned list of words is currently always empty.
+    fn process_wav(
+        &self,
+        model_path: impl AsRef<std::path::Path>,
+        audio_path: impl AsRef<std::path::Path>,
+    ) -> impl Future<Output = Result<Vec<Word>, crate::Error>>;
+
     fn download_model(
         &self,
         model: crate::SupportedModel,
@@ -153,6 +164,54 @@ impl> LocalSttPluginExt for T {
         Ok(())
     }
 
+    #[tracing::instrument(skip_all)]
+    async fn process_wav(
+        &self,
+        model_path: impl AsRef<std::path::Path>,
+        audio_path: impl AsRef<std::path::Path>,
+    ) -> Result<Vec<Word>, crate::Error> {
+        let mut wav = hound::WavReader::open(audio_path.as_ref()).unwrap();
+        let samples = wav.samples::<i16>().collect::<Result<Vec<_>, _>>().unwrap();
+
+        let mut model = hypr_whisper::local::Whisper::builder()
+            .model_path(model_path.as_ref().to_str().unwrap())
+            .language(hypr_whisper::Language::En)
+            .static_prompt("")
+            .dynamic_prompt("")
+            .build();
+
+        // TODO: segmentation and transcription stay disabled pending the issue below.
+        // https://github.com/thewh1teagle/pyannote-rs/issues/13
+
+        // let mut segmenter = hypr_pyannote::local::segmentation::Segmenter::new(16000).unwrap();
+        // let segments = segmenter.process(&samples, 16000).unwrap();
+
+        let mut words = Vec::new();
+
+        // for segment in segments {
+        //     let audio_f32 = hypr_audio_utils::i16_to_f32_samples(&segment.samples);
+
+        //     let whisper_segments = model.transcribe(&audio_f32).unwrap();
+
+        //     for whisper_segment in whisper_segments {
+        //         let start_sec: f64 = segment.start + (whisper_segment.start() as f64);
+        //         let end_sec: f64 = segment.start + (whisper_segment.end() as f64);
+        //         let start_ms = (start_sec * 1000.0) as u64;
+        //         let end_ms = (end_sec * 1000.0) as u64;
+
+        //         words.push(Word {
+        //             text: whisper_segment.text().to_string(),
+        //             speaker: None,
+        //             confidence: Some(whisper_segment.confidence()),
+        //             start_ms: Some(start_ms),
+        //             end_ms: Some(end_ms),
+        //         });
+        //     }
+        // }
+
+        Ok(words)
+    }
+
     #[tracing::instrument(skip_all)]
     async fn is_model_downloading(&self, model: &crate::SupportedModel) -> bool {
         let state = self.state::();
diff --git a/plugins/local-stt/src/lib.rs b/plugins/local-stt/src/lib.rs
index 59889b284..493c0a27e 100644
--- a/plugins/local-stt/src/lib.rs
+++ b/plugins/local-stt/src/lib.rs
@@ -118,4 +118,23 @@ mod test {
 
         app.stop_server().await.unwrap();
     }
+
+    #[tokio::test]
+    #[ignore]
+    // cargo test test_local_stt2 -p tauri-plugin-local-stt -- --ignored --nocapture
+    async fn test_local_stt2() {
+        let app = create_app(tauri::test::mock_builder());
+
+        let model_path = dirs::data_dir()
+            .unwrap()
+            .join("com.hyprnote.dev")
+            .join("ggml-tiny.en-q8_0.bin");
+
+        let words = app
+            .process_wav(model_path, hypr_data::english_1::AUDIO_PATH)
+            .await
+            .unwrap();
+
+        println!("{:?}", words);
+    }
 }