Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions apps/app/server/src/native/listen/realtime.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@ async fn websocket(socket: WebSocket, state: STTState, params: ListenParams) {
let audio = Bytes::from(data);
Ok::<Option<(Bytes, _)>, axum::Error>(Some((audio, ws_receiver)))
}
ListenInputChunk::DualAudio { .. } => {
todo!()
}
ListenInputChunk::End => Ok::<Option<(Bytes, _)>, axum::Error>(None),
}
}
Expand Down
26 changes: 23 additions & 3 deletions crates/audio-utils/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ use kalosm_sound::AsyncSource;
mod error;
pub use error::*;

/// Scale factor used to convert between `f32` samples and 16-bit PCM.
///
/// `f32` samples are multiplied by this value before being cast to `i16`,
/// and `i16` samples are divided by it when converted back to `f32`.
/// 32768 is 2^15, the magnitude of `i16::MIN`.
const I16_SCALE: f32 = 32768.0;

impl<T: AsyncSource> AudioFormatExt for T {}

pub trait AudioFormatExt: AsyncSource {
Expand All @@ -21,7 +23,7 @@ pub trait AudioFormatExt: AsyncSource {

let mut buf = BytesMut::with_capacity(n);
for sample in chunk {
let scaled = (sample * 32768.0).clamp(-32768.0, 32768.0);
let scaled = (sample * I16_SCALE).clamp(-I16_SCALE, I16_SCALE);
buf.put_i16_le(scaled as i16);
}
buf.freeze()
Expand All @@ -32,20 +34,38 @@ pub trait AudioFormatExt: AsyncSource {
pub fn i16_to_f32_samples(samples: &[i16]) -> Vec<f32> {
samples
.iter()
.map(|&sample| sample as f32 / 32768.0)
.map(|&sample| sample as f32 / I16_SCALE)
.collect()
}

pub fn f32_to_i16_samples(samples: &[f32]) -> Vec<i16> {
samples
.iter()
.map(|&sample| {
let scaled = (sample * 32768.0).clamp(-32768.0, 32768.0);
let scaled = (sample * I16_SCALE).clamp(-I16_SCALE, I16_SCALE);
scaled as i16
})
.collect()
}

/// Encodes `f32` samples as little-endian 16-bit PCM bytes.
///
/// Each sample is multiplied by `I16_SCALE` and cast to `i16`; the two
/// little-endian bytes of every resulting value are appended in input order,
/// so the output holds `chunk.len() * 2` bytes.
///
/// The float-to-integer `as` cast saturates at the `i16` bounds and maps
/// `NaN` to `0`, so out-of-range samples do not wrap around.
pub fn f32_to_i16_bytes(chunk: Vec<f32>) -> bytes::Bytes {
    // Two output bytes per input sample; reserve once up front.
    let mut pcm = Vec::with_capacity(chunk.len() * 2);
    pcm.extend(
        chunk
            .into_iter()
            .flat_map(|sample| ((sample * I16_SCALE) as i16).to_le_bytes()),
    );
    bytes::Bytes::from(pcm)
}

/// Decodes little-endian 16-bit PCM bytes into `f32` samples.
///
/// The input is consumed two bytes at a time; each pair is read as a
/// little-endian `i16` and divided by `I16_SCALE`. A trailing byte that does
/// not form a complete pair is ignored.
pub fn bytes_to_f32_samples(data: &[u8]) -> Vec<f32> {
    let mut samples = Vec::with_capacity(data.len() / 2);
    for pair in data.chunks_exact(2) {
        let raw = i16::from_le_bytes([pair[0], pair[1]]);
        // i16 -> f32 is lossless, so `From` yields the same value as an `as` cast.
        samples.push(f32::from(raw) / I16_SCALE);
    }
    samples
}

pub fn resample_audio<S, T>(source: S, to_rate: u32) -> Result<Vec<f32>, crate::Error>
where
S: rodio::Source<Item = T> + Iterator<Item = T>,
Expand Down
1 change: 1 addition & 0 deletions crates/stt/src/realtime/clova.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ impl<S, E> RealtimeSpeechToText<S, E> for hypr_clova::realtime::Client {
end_ms: Some(r.transcription.end_timestamp * 1000),
confidence: Some(r.transcription.confidence as f32),
}],
..Default::default()
})),
clova::StreamResponse::Config(_) => None,
},
Expand Down
5 changes: 4 additions & 1 deletion crates/stt/src/realtime/deepgram.rs
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,10 @@ impl<S, E> RealtimeSpeechToText<S, E> for crate::deepgram::DeepgramClient {
})
.collect();

Some(Ok(ListenOutputChunk { words }))
Some(Ok(ListenOutputChunk {
words,
..Default::default()
}))
}
}
_ => None,
Expand Down
1 change: 1 addition & 0 deletions crates/stt/src/realtime/whisper.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ impl<S, E> RealtimeSpeechToText<S, E> for WhisperClient {
start_ms: None,
confidence: None,
}],
..Default::default()
})
});

Expand Down
3 changes: 2 additions & 1 deletion crates/whisper-cloud/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,10 +87,11 @@ impl WhisperClient {
}

impl WebSocketIO for WhisperClient {
type Data = bytes::Bytes;
type Input = bytes::Bytes;
type Output = WhisperOutput;

fn to_input(data: bytes::Bytes) -> Self::Input {
fn to_input(data: Self::Data) -> Self::Input {
data
}

Expand Down
6 changes: 3 additions & 3 deletions crates/whisper-local/src/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ pub struct Segment {
pub start: f32,
pub end: f32,
pub confidence: f32,
pub metadata: Option<serde_json::Value>,
pub meta: Option<serde_json::Value>,
}

impl Segment {
Expand All @@ -261,8 +261,8 @@ impl Segment {
self.confidence
}

pub fn metadata(&self) -> Option<&serde_json::Value> {
self.metadata.as_ref()
pub fn meta(&self) -> Option<serde_json::Value> {
self.meta.clone()
}

pub fn trim(&mut self) {
Expand Down
22 changes: 11 additions & 11 deletions crates/whisper-local/src/stream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,22 @@ pub struct TranscriptionTask<S, T> {

pub trait AudioChunk: Send + 'static {
fn samples(&self) -> &[f32];
fn metadata(&self) -> Option<&serde_json::Value>;
fn meta(&self) -> Option<serde_json::Value>;
}

#[derive(Default)]
pub struct SimpleAudioChunk {
pub samples: Vec<f32>,
pub metadata: Option<serde_json::Value>,
pub meta: Option<serde_json::Value>,
}

impl AudioChunk for SimpleAudioChunk {
fn samples(&self) -> &[f32] {
&self.samples
}

fn metadata(&self) -> Option<&serde_json::Value> {
self.metadata.as_ref()
fn meta(&self) -> Option<serde_json::Value> {
self.meta.clone()
}
}

Expand Down Expand Up @@ -153,13 +153,14 @@ where

match this.stream.poll_next_unpin(cx) {
Poll::Ready(Some(chunk)) => {
let meta = chunk.meta();
let samples = chunk.samples();
let metadata = chunk.metadata();

match process_transcription(
&mut this.whisper,
samples,
&mut this.current_segment_task,
metadata,
meta,
) {
Poll::Ready(result) => return Poll::Ready(result),
Poll::Pending => continue,
Expand All @@ -176,7 +177,7 @@ fn process_transcription<'a>(
whisper: &'a mut Whisper,
samples: &'a [f32],
current_segment_task: &'a mut Option<Pin<Box<dyn Stream<Item = Segment> + Send>>>,
metadata: Option<&serde_json::Value>,
meta: Option<serde_json::Value>,
) -> Poll<Option<Segment>> {
if !samples.is_empty() {
match whisper.transcribe(samples) {
Expand All @@ -187,11 +188,10 @@ fn process_transcription<'a>(
Poll::Ready(None)
}
Ok(mut segments) => {
if let Some(meta) = metadata {
for segment in &mut segments {
segment.metadata = Some(meta.clone());
}
for segment in &mut segments {
segment.meta = meta.clone();
}

*current_segment_task = Some(Box::pin(futures_util::stream::iter(segments)));
Poll::Pending
}
Expand Down
5 changes: 4 additions & 1 deletion crates/ws-utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@ version = "0.1.0"
edition = "2021"

[dependencies]
hypr-audio-utils = { workspace = true }
hypr-listener-interface = { workspace = true }

axum = { workspace = true, features = ["ws"] }
futures-util = { workspace = true }
kalosm-sound = { workspace = true, default-features = false }
serde_json = { workspace = true }

futures-util = { workspace = true }
tokio = { workspace = true }
Loading