Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/core/src/elevenlabs/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ export {
ElevenLabsError,
} from "./client.js";
export type { ElevenLabsVoice, SynthesizeOptions } from "./client.js";
export { generateSoundEffect, clampSfxDuration, SFX_BOUNDS } from "./sfx.js";
export type { GenerateSfxOptions, GenerateSfxResult } from "./sfx.js";
104 changes: 104 additions & 0 deletions packages/core/src/elevenlabs/sfx.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/**
* ElevenLabs Sound Generation client.
*
* Mirrors the existing `synthesize()` shape for voice — single function call
* that returns audio bytes + chosen format. Errors surface via the same
* `ElevenLabsError` so callers get a uniform try/catch shape across
* voice / sfx / music.
*/

import { ElevenLabsError } from "./client.js";

/** Root URL of the ElevenLabs v1 REST API; endpoint paths (e.g. `/sound-generation`) are appended to it. */
const API_BASE = "https://api.elevenlabs.io/v1";

/**
 * Optional tuning knobs for `generateSoundEffect`. Every field may be
 * omitted; numeric fields that are not supplied are left out of the request.
 */
export interface GenerateSfxOptions {
/**
 * Target length in seconds, 0.5..22. Out-of-range values are clamped into
 * that range by `generateSoundEffect` before the request is sent.
 * ElevenLabs treats this as a target — actual ±20%.
 */
durationSeconds?: number;
/**
 * 0..1 (out-of-range values are clamped by `generateSoundEffect`).
 * Higher = stick closer to the prompt; lower = more creative variation.
 * This module applies no default of its own: when omitted, the field is
 * not sent. The original note cites 0.3 as ElevenLabs's own default for
 * the SFX playground — NOTE(review): not verifiable from this file.
 */
promptInfluence?: number;
/**
 * Output format. Defaults to "mp3_44100_128" when omitted, which is the
 * cheapest acceptable quality for SFX.
 */
outputFormat?: "mp3_44100_128" | "mp3_44100_192";
}

/** What `generateSoundEffect` resolves with on a successful (2xx) response. */
export interface GenerateSfxResult {
/** The raw response body, copied into a byte array. */
bytes: Uint8Array;
/**
 * The format that was requested (or the "mp3_44100_128" default). This is
 * echoed from the caller's options; it is not read back from the response.
 */
format: NonNullable<GenerateSfxOptions["outputFormat"]>;
}

/**
 * Generate one sound effect from a text prompt. Returns mp3 bytes — caller
 * writes them to disk.
 *
 *   const { bytes } = await generateSoundEffect(apiKey, "snap zoom whoosh", { durationSeconds: 1.5 });
 *   fs.writeFileSync("sfx.mp3", bytes);
 *
 * Cost is billed per generation, not per duration — short SFX cost the same
 * as long ones up to the 22-second cap. Surface the count in the cost log.
 *
 * @param apiKey - ElevenLabs API key, sent as the `xi-api-key` header.
 * @param prompt - Description of the sound. Trimmed before use; must be
 *   non-empty and at most 1000 characters after trimming.
 * @param opts - Optional duration / prompt-influence / output-format tuning.
 *   Out-of-range numbers are clamped; NaN is treated as "not provided".
 * @returns The audio bytes together with the format that was requested.
 * @throws ElevenLabsError when the prompt is missing or too long (no status
 *   code), or when the API answers with a non-2xx status (status code set,
 *   message carries up to 500 characters of the response body).
 *   Failures raised by `fetch` itself propagate unchanged.
 */
export async function generateSoundEffect(
  apiKey: string,
  prompt: string,
  opts: GenerateSfxOptions = {},
): Promise<GenerateSfxResult> {
  // Guard against non-string input from untyped callers so the failure is
  // the uniform ElevenLabsError rather than a TypeError from `.trim()`.
  const trimmed = typeof prompt === "string" ? prompt.trim() : "";
  if (!trimmed) {
    throw new ElevenLabsError("generateSoundEffect: prompt is required");
  }
  if (trimmed.length > 1000) {
    throw new ElevenLabsError("generateSoundEffect: prompt too long (max 1000 chars)");
  }

  const format = opts.outputFormat ?? "mp3_44100_128";
  const body: Record<string, unknown> = { text: trimmed };

  // `typeof NaN === "number"`, and a clamped NaN would serialize as an
  // explicit `null`; treat NaN as "not provided" and omit the field instead.
  const { durationSeconds, promptInfluence } = opts;
  if (typeof durationSeconds === "number" && !Number.isNaN(durationSeconds)) {
    // ElevenLabs caps at 0.5..22 — clamp here so a misconfigured caller
    // doesn't get a 422 round-trip. (Same limits as SFX_BOUNDS.)
    body.duration_seconds = Math.max(0.5, Math.min(22, durationSeconds));
  }
  if (typeof promptInfluence === "number" && !Number.isNaN(promptInfluence)) {
    body.prompt_influence = Math.max(0, Math.min(1, promptInfluence));
  }

  // The sound-generation endpoint takes `output_format` as a query-string
  // parameter rather than a JSON body field, so send it where the API reads
  // it — otherwise the returned `format` may not match the bytes.
  const url = `${API_BASE}/sound-generation?output_format=${encodeURIComponent(format)}`;
  const res = await fetch(url, {
    method: "POST",
    headers: {
      "xi-api-key": apiKey,
      "Content-Type": "application/json",
      Accept: "audio/mpeg",
    },
    body: JSON.stringify(body),
  });

  if (!res.ok) {
    let detail = "";
    try {
      const text = await res.text();
      // Keep error messages bounded; API error payloads can be large.
      detail = text.length > 500 ? text.slice(0, 500) + "…" : text;
    } catch {
      /* best effort: the status line alone is still a useful error */
    }
    throw new ElevenLabsError(
      `generateSoundEffect: ${res.status} ${res.statusText}${detail ? ` — ${detail}` : ""}`,
      res.status,
    );
  }

  const bytes = new Uint8Array(await res.arrayBuffer());
  return { bytes, format };
}

/**
 * Limits used when clamping / validating values before they hit the wire —
 * exported so the studio API route can reuse the same bounds when
 * validating user input. `clampSfxDuration` reads the duration limits from
 * here; the numbers match the literals used inside `generateSoundEffect`.
 */
export const SFX_BOUNDS = {
/** Shortest duration, in seconds, that a request is clamped up to. */
durationMin: 0.5,
/** Longest duration, in seconds, that a request is clamped down to. */
durationMax: 22,
/** Maximum prompt length, in characters, after trimming. */
promptMaxChars: 1000,
} as const;

/**
 * Force a requested duration into the supported SFX range.
 *
 * @param durationSeconds - Requested length in seconds.
 * @returns The value limited to `SFX_BOUNDS.durationMin..durationMax`, or
 *   a fallback of 2 seconds when the input is NaN or ±Infinity.
 */
export function clampSfxDuration(durationSeconds: number): number {
  if (!Number.isFinite(durationSeconds)) {
    return 2;
  }
  const { durationMin, durationMax } = SFX_BOUNDS;
  if (durationSeconds < durationMin) {
    return durationMin;
  }
  if (durationSeconds > durationMax) {
    return durationMax;
  }
  return durationSeconds;
}
28 changes: 28 additions & 0 deletions packages/core/src/script/assemble.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { getLoadedThemeByName } from "./themes/index.js";
import type { PlannedScene, PlannedScript, SceneTransition } from "./types.js";
import type { ImageEntry, ImageManifest } from "../images/index.js";
import type { VisualDirectionPlan } from "./visualDirector.js";
import { readSfxManifest, resolveSfxStartForScene, type SfxEntry } from "./sfx/manifest.js";

export interface AssembleOptions {
projectDir: string;
Expand Down Expand Up @@ -75,6 +76,17 @@ export function assembleMaster(planned: PlannedScript, opts: AssembleOptions): A
let cursor = 0;
const sceneFragments: string[] = [];
const audioTags: string[] = [];

// SFX manifest: read once, group entries by sceneId so each scene's loop
// iteration can emit them at the right cursor position. The manifest is
// optional — projects without it just skip the SFX lane.
const sfxManifest = readSfxManifest(opts.projectDir);
const sfxBySceneId = new Map<string, SfxEntry[]>();
for (const entry of sfxManifest.entries) {
const list = sfxBySceneId.get(entry.sceneId) ?? [];
list.push(entry);
sfxBySceneId.set(entry.sceneId, list);
}
const sceneVisibility: Array<{
id: string;
start: number;
Expand Down Expand Up @@ -182,6 +194,22 @@ export function assembleMaster(planned: PlannedScript, opts: AssembleOptions): A
` <audio id="hf-vo-${scene.id}" src="${escapeAttr(scene.audio.path)}" data-start="${audioStart.toFixed(2)}" data-duration="${audioDur.toFixed(2)}" data-track-index="1" data-timeline-group="voiceover" data-timeline-label="Voiceover" preload="auto"></audio>`,
);
}

// SFX entries land on track 3 with the same audio timing rules as the
// voiceover. Each entry's start time is computed from its anchor (scene-
// start / accent-word / scene-end) — see resolveSfxStartForScene for
// the math. Volume scales the runtime mixer when supplied; the producer
// package consumes data-volume-db at render time.
const sceneSfx = sfxBySceneId.get(scene.id) ?? [];
for (const entry of sceneSfx) {
const start = resolveSfxStartForScene(entry, scene, cursor, sceneTotal);
const volumeAttr =
typeof entry.volumeDb === "number" ? ` data-volume-db="${entry.volumeDb.toFixed(1)}"` : "";
const labelAttr = entry.label ? ` data-timeline-label="${escapeAttr(entry.label)}"` : "";
audioTags.push(
` <audio id="hf-sfx-${scene.id}-${entry.id}" src="${escapeAttr(entry.path)}" data-start="${start.toFixed(2)}" data-duration="${entry.durationSeconds.toFixed(2)}" data-track-index="3" data-timeline-group="sfx"${labelAttr}${volumeAttr} preload="auto"></audio>`,
);
}
const transitionIn: SceneTransition =
scene.transition ?? defaultTransitionForTemplate(scene.template);
const transitionInMs = (TRANSITION_DURATIONS[transitionIn] ?? 0) * 1000;
Expand Down
145 changes: 145 additions & 0 deletions packages/core/src/script/sfx/manifest.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import { describe, it, expect } from "vitest";
import { resolveSfxStart, type SfxEntry } from "./manifest";

/** Entry fields every case starts from; each test overrides the anchor-specific parts. */
const baseEntry: Pick<SfxEntry, "anchor" | "accentWordIndex" | "durationSeconds"> = {
  anchor: "scene-start",
  durationSeconds: 1.5,
};

describe("resolveSfxStart", () => {
  // Shared fixture: a 6s scene opening at t=10 whose 5s narration begins
  // after a 0.15s lead-in. Cases that use it only add the word count.
  const narratedScene = {
    sceneStart: 10,
    sceneDuration: 6,
    audioStartOffset: 0.15,
    voiceDurationSeconds: 5,
  };

  it("scene-start anchor returns sceneStart + audioStartOffset", () => {
    const start = resolveSfxStart({
      ...narratedScene,
      voiceWordCount: 12,
      entry: { ...baseEntry, anchor: "scene-start" },
    });
    expect(start).toBeCloseTo(10.15, 3);
  });

  it("scene-start with zero lead-in returns sceneStart exactly", () => {
    const start = resolveSfxStart({
      sceneStart: 30,
      sceneDuration: 4,
      audioStartOffset: 0,
      voiceDurationSeconds: 3,
      voiceWordCount: 8,
      entry: { ...baseEntry, anchor: "scene-start" },
    });
    expect(start).toBe(30);
  });

  it("scene-end anchor places SFX so it finishes at scene end", () => {
    // Scene ends at 10 + 6 = 16; a 1.5s effect must therefore begin at 14.5.
    const start = resolveSfxStart({
      ...narratedScene,
      voiceWordCount: 12,
      entry: { ...baseEntry, anchor: "scene-end", durationSeconds: 1.5 },
    });
    expect(start).toBe(14.5);
  });

  it("scene-end with SFX longer than scene clamps to sceneStart (won't precede the scene)", () => {
    const start = resolveSfxStart({
      sceneStart: 50,
      sceneDuration: 2,
      audioStartOffset: 0,
      voiceDurationSeconds: 1.5,
      voiceWordCount: 4,
      entry: { ...baseEntry, anchor: "scene-end", durationSeconds: 5 },
    });
    expect(start).toBe(50);
  });

  it("accent-word anchor interpolates by word index", () => {
    // 5s of narration over 4 words → 1.25s per word; index 2 lands 2.5s in.
    // 10 (scene start) + 0.15 (lead-in) + 2.5 = 12.65.
    const start = resolveSfxStart({
      ...narratedScene,
      voiceWordCount: 4,
      entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 2 },
    });
    expect(start).toBeCloseTo(12.65, 3);
  });

  it("accent-word with index 0 = scene-start + offset (the first word fires immediately)", () => {
    const start = resolveSfxStart({
      ...narratedScene,
      voiceWordCount: 5,
      entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 0 },
    });
    expect(start).toBeCloseTo(10.15, 3);
  });

  it("accent-word with index past the word count clamps to last word", () => {
    // 5 words, index 99 → treated as index 4 (the last word).
    // 5s / 5 words = 1s per word → 4s offset; 10 + 0.15 + 4 = 14.15.
    const start = resolveSfxStart({
      ...narratedScene,
      voiceWordCount: 5,
      entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 99 },
    });
    expect(start).toBeCloseTo(14.15, 3);
  });

  it("accent-word falls back to scene-start when narration is empty", () => {
    const start = resolveSfxStart({
      sceneStart: 8,
      sceneDuration: 4,
      audioStartOffset: 0.2,
      voiceDurationSeconds: 0,
      voiceWordCount: 0,
      entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: 3 },
    });
    expect(start).toBeCloseTo(8.2, 3);
  });

  it("accent-word with negative index clamps to 0", () => {
    const start = resolveSfxStart({
      sceneStart: 10,
      sceneDuration: 6,
      audioStartOffset: 0,
      voiceDurationSeconds: 4,
      voiceWordCount: 4,
      entry: { ...baseEntry, anchor: "accent-word", accentWordIndex: -5 },
    });
    expect(start).toBe(10);
  });

  it("never returns a value before sceneStart", () => {
    // Defensive case: a scene-end effect vastly longer than a tiny scene.
    const start = resolveSfxStart({
      sceneStart: 100,
      sceneDuration: 0.1,
      audioStartOffset: 0,
      voiceDurationSeconds: 0.05,
      voiceWordCount: 1,
      entry: { ...baseEntry, anchor: "scene-end", durationSeconds: 99 },
    });
    expect(start).toBe(100);
  });
});
Loading