Skip to content

Commit 2820a98

Browse files
authored
🤖 feat: voice input mode with OpenAI transcription (#836)
Adds voice dictation capability to the chat input using OpenAI's gpt-4o-transcribe model.

## Features

- **Voice recording** via MediaRecorder API (webm/opus format)
- **OpenAI transcription** via backend IPC (API key stays server-side)
- **Recording overlay** replaces textarea with animated waveform visualization
- **Multiple shortcuts**:
  - Space on empty input → start recording
  - Space during recording → stop and send immediately
  - Ctrl+D / Cmd+D → toggle recording anytime
  - Escape → cancel recording (discard audio)
- **Global keybinds** during recording work regardless of focus
- **User education** when OpenAI key not configured (disabled button with tooltip)

## UI States

| State | Appearance |
|-------|------------|
| Idle | Subtle gray mic icon in textarea corner |
| Recording | Blue border overlay with animated waveform |
| Transcribing | Amber border overlay, waiting for API |
| No API Key | Disabled mic with explanatory tooltip |

## Implementation

- `useVoiceInput` hook with clean state enum (`idle` / `recording` / `transcribing`)
- `VoiceInputButton` floating component
- `WaveformBars` reusable animated component
- IPC channel `voice:transcribe` for backend API calls
- Hidden on mobile (native keyboards have built-in dictation)

## Files Changed

- `src/browser/hooks/useVoiceInput.ts` - Core hook
- `src/browser/components/ChatInput/VoiceInputButton.tsx` - Button component
- `src/browser/components/ChatInput/WaveformBars.tsx` - Animation component
- `src/browser/components/ChatInput/index.tsx` - Integration
- `src/node/services/ipcMain.ts` - Backend transcription handler
- `src/common/constants/ipc-constants.ts` - IPC channel
- `src/common/types/ipc.ts` - Type definitions

---

_Generated with `mux`_
1 parent 372f0d9 commit 2820a98

File tree

20 files changed

+651
-27
lines changed

20 files changed

+651
-27
lines changed

bun.lock

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
"minimist": "^1.2.8",
4343
"motion": "^12.23.24",
4444
"ollama-ai-provider-v2": "^1.5.4",
45+
"openai": "^6.9.1",
4546
"rehype-harden": "^1.1.5",
4647
"shescape": "^2.1.6",
4748
"source-map-support": "^0.5.21",
@@ -2688,6 +2689,8 @@
26882689

26892690
"oniguruma-to-es": ["oniguruma-to-es@4.3.3", "", { "dependencies": { "oniguruma-parser": "^0.12.1", "regex": "^6.0.1", "regex-recursion": "^6.0.2" } }, "sha512-rPiZhzC3wXwE59YQMRDodUwwT9FZ9nNBwQQfsd1wfdtlKEyCdRV0avrTcSZ5xlIvGRVPd/cx6ZN45ECmS39xvg=="],
26902691

2692+
"openai": ["openai@6.9.1", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-vQ5Rlt0ZgB3/BNmTa7bIijYFhz3YBceAA3Z4JuoMSBftBF9YqFHIEhZakSs+O/Ad7EaoEimZvHxD5ylRjN11Lg=="],
2693+
26912694
"optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="],
26922695

26932696
"ora": ["ora@5.4.1", "", { "dependencies": { "bl": "^4.1.0", "chalk": "^4.1.0", "cli-cursor": "^3.1.0", "cli-spinners": "^2.5.0", "is-interactive": "^1.0.0", "is-unicode-supported": "^0.1.0", "log-symbols": "^4.1.0", "strip-ansi": "^6.0.0", "wcwidth": "^1.0.1" } }, "sha512-5b6Y85tPxZZ7QytO+BQzysW31HJku27cRIlkbAXaNx+BdcVi+LlRFmVXzeF6a7JCwJpyw5c4b+YSVImQIrBpuQ=="],

package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
"minimist": "^1.2.8",
8484
"motion": "^12.23.24",
8585
"ollama-ai-provider-v2": "^1.5.4",
86+
"openai": "^6.9.1",
8687
"rehype-harden": "^1.1.5",
8788
"shescape": "^2.1.6",
8889
"source-map-support": "^0.5.21",

src/browser/api.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,9 @@ const webApi: IPCApi = {
361361
},
362362
closeWindow: (workspaceId) => invokeIPC(IPC_CHANNELS.TERMINAL_WINDOW_CLOSE, workspaceId),
363363
},
364+
voice: {
365+
transcribe: (audioBase64) => invokeIPC(IPC_CHANNELS.VOICE_TRANSCRIBE, audioBase64),
366+
},
364367
update: {
365368
check: () => invokeIPC(IPC_CHANNELS.UPDATE_CHECK),
366369
download: () => invokeIPC(IPC_CHANNELS.UPDATE_DOWNLOAD),
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/**
2+
* Voice input button - floats inside the chat input textarea.
3+
* Minimal footprint: just an icon that changes color based on state.
4+
*/
5+
6+
import React from "react";
7+
import { Mic, Loader2 } from "lucide-react";
8+
import { TooltipWrapper, Tooltip } from "../Tooltip";
9+
import { formatKeybind, KEYBINDS } from "@/browser/utils/ui/keybinds";
10+
import { cn } from "@/common/lib/utils";
11+
import type { VoiceInputState } from "@/browser/hooks/useVoiceInput";
12+
13+
interface VoiceInputButtonProps {
14+
state: VoiceInputState;
15+
isApiKeySet: boolean;
16+
shouldShowUI: boolean;
17+
onToggle: () => void;
18+
disabled?: boolean;
19+
}
20+
21+
const STATE_CONFIG: Record<VoiceInputState, { label: string; colorClass: string }> = {
22+
idle: { label: "Voice input", colorClass: "text-muted/50 hover:text-muted" },
23+
recording: { label: "Stop recording", colorClass: "text-blue-500 animate-pulse" },
24+
transcribing: { label: "Transcribing...", colorClass: "text-amber-500" },
25+
};
26+
27+
export const VoiceInputButton: React.FC<VoiceInputButtonProps> = (props) => {
28+
if (!props.shouldShowUI) return null;
29+
30+
const needsApiKey = !props.isApiKeySet;
31+
const { label, colorClass } = needsApiKey
32+
? { label: "Voice input (requires OpenAI API key)", colorClass: "text-muted/50" }
33+
: STATE_CONFIG[props.state];
34+
35+
const Icon = props.state === "transcribing" ? Loader2 : Mic;
36+
const isTranscribing = props.state === "transcribing";
37+
38+
return (
39+
<TooltipWrapper inline>
40+
<button
41+
type="button"
42+
onClick={props.onToggle}
43+
disabled={(props.disabled ?? false) || isTranscribing || needsApiKey}
44+
aria-label={label}
45+
aria-pressed={props.state === "recording"}
46+
className={cn(
47+
"inline-flex items-center justify-center rounded p-0.5 transition-colors duration-150",
48+
"disabled:cursor-not-allowed disabled:opacity-40",
49+
colorClass
50+
)}
51+
>
52+
<Icon className={cn("h-4 w-4", isTranscribing && "animate-spin")} strokeWidth={1.5} />
53+
</button>
54+
<Tooltip className="tooltip" align="right">
55+
{needsApiKey ? (
56+
<>
57+
Voice input requires OpenAI API key.
58+
<br />
59+
Configure in Settings → Providers.
60+
</>
61+
) : (
62+
<>
63+
<strong>Voice input</strong> — press space on empty input
64+
<br />
65+
or {formatKeybind(KEYBINDS.TOGGLE_VOICE_INPUT)} anytime
66+
<br />
67+
<br />
68+
While recording: space sends, esc cancels
69+
</>
70+
)}
71+
</Tooltip>
72+
</TooltipWrapper>
73+
);
74+
};
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/**
2+
* Animated waveform bars for voice recording UI.
3+
* Shows 5 bars with staggered pulse animation.
4+
*/
5+
6+
import { cn } from "@/common/lib/utils";
7+
8+
interface WaveformBarsProps {
9+
/** Color class for the bars (e.g., "bg-blue-500") */
10+
colorClass: string;
11+
/** Whether to mirror the animation (for right-side waveform) */
12+
mirrored?: boolean;
13+
}
14+
15+
export const WaveformBars: React.FC<WaveformBarsProps> = (props) => {
16+
const indices = props.mirrored ? [4, 3, 2, 1, 0] : [0, 1, 2, 3, 4];
17+
18+
return (
19+
<div className="flex items-center gap-1">
20+
{indices.map((i, displayIndex) => (
21+
<div
22+
key={displayIndex}
23+
className={cn("w-1 rounded-full", props.colorClass)}
24+
style={{
25+
height: `${12 + Math.sin(i * 0.8) * 8}px`,
26+
animation: `pulse 0.8s ease-in-out ${i * 0.1}s infinite alternate`,
27+
}}
28+
/>
29+
))}
30+
</div>
31+
);
32+
};

0 commit comments

Comments
 (0)