diff --git a/src/common/utils/ai/cacheStrategy.test.ts b/src/common/utils/ai/cacheStrategy.test.ts index 24a25e64a..194d505b0 100644 --- a/src/common/utils/ai/cacheStrategy.test.ts +++ b/src/common/utils/ai/cacheStrategy.test.ts @@ -41,13 +41,22 @@ describe("cacheStrategy", () => { expect(result).toEqual(messages); }); - it("should not modify messages if less than 2 messages", () => { + it("should add cache control to single message for Anthropic models", () => { const messages: ModelMessage[] = [{ role: "user", content: "Hello" }]; const result = applyCacheControl(messages, "anthropic:claude-3-5-sonnet"); - expect(result).toEqual(messages); + expect(result[0]).toEqual({ + ...messages[0], + providerOptions: { + anthropic: { + cacheControl: { + type: "ephemeral", + }, + }, + }, + }); }); - it("should add cache control to second-to-last message for Anthropic models", () => { + it("should add cache control to last message for Anthropic models", () => { const messages: ModelMessage[] = [ { role: "user", content: "Hello" }, { role: "assistant", content: "Hi there!" 
}, @@ -56,9 +65,10 @@ describe("cacheStrategy", () => { const result = applyCacheControl(messages, "anthropic:claude-3-5-sonnet"); expect(result[0]).toEqual(messages[0]); // First message unchanged - expect(result[1]).toEqual({ - // Second message has cache control - ...messages[1], + expect(result[1]).toEqual(messages[1]); // Second message unchanged + expect(result[2]).toEqual({ + // Last message has cache control + ...messages[2], providerOptions: { anthropic: { cacheControl: { @@ -67,7 +77,6 @@ describe("cacheStrategy", () => { }, }, }); - expect(result[2]).toEqual(messages[2]); // Last message unchanged }); it("should work with exactly 2 messages", () => { @@ -77,9 +86,10 @@ describe("cacheStrategy", () => { ]; const result = applyCacheControl(messages, "anthropic:claude-3-5-sonnet"); - expect(result[0]).toEqual({ - // First message gets cache control - ...messages[0], + expect(result[0]).toEqual(messages[0]); // First message unchanged + expect(result[1]).toEqual({ + // Last message gets cache control + ...messages[1], providerOptions: { anthropic: { cacheControl: { @@ -88,7 +98,6 @@ describe("cacheStrategy", () => { }, }, }); - expect(result[1]).toEqual(messages[1]); // Last message unchanged }); it("should add cache control to last content part for array content", () => { @@ -108,17 +117,24 @@ describe("cacheStrategy", () => { { type: "text", text: "How can I help?" 
}, ], }, - { role: "user", content: "Final question" }, + { + role: "user", + content: [ + { type: "text", text: "Final" }, + { type: "text", text: "question" }, + ], + }, ]; const result = applyCacheControl(messages, "anthropic:claude-3-5-sonnet"); expect(result[0]).toEqual(messages[0]); // First message unchanged + expect(result[1]).toEqual(messages[1]); // Second message unchanged - // Second message (array content): cache control on LAST content part only - const secondMsg = result[1]; - expect(secondMsg.role).toBe("assistant"); - expect(Array.isArray(secondMsg.content)).toBe(true); - const content = secondMsg.content as Array<{ + // Last message (array content): cache control on LAST content part only + const lastMsg = result[2]; + expect(lastMsg.role).toBe("user"); + expect(Array.isArray(lastMsg.content)).toBe(true); + const content = lastMsg.content as Array<{ type: string; text: string; providerOptions?: unknown; @@ -127,8 +143,6 @@ describe("cacheStrategy", () => { expect(content[1].providerOptions).toEqual({ anthropic: { cacheControl: { type: "ephemeral" } }, }); // Last part has cache control - - expect(result[2]).toEqual(messages[2]); // Last message unchanged }); }); diff --git a/src/common/utils/ai/cacheStrategy.ts b/src/common/utils/ai/cacheStrategy.ts index 70d82f5f3..c6fdc78e7 100644 --- a/src/common/utils/ai/cacheStrategy.ts +++ b/src/common/utils/ai/cacheStrategy.ts @@ -66,10 +66,10 @@ function addCacheControlToLastContentPart(msg: ModelMessage): ModelMessage { /** * Apply cache control to messages for Anthropic models. - * Caches all messages except the last user message for optimal cache hits. + * Adds a cache marker to the last message so the entire conversation is cached. * * NOTE: The SDK requires providerOptions on content parts, not on the message. - * We add cache_control to the last content part of the second-to-last message. + * We add cache_control to the last content part of the last message. 
*/ export function applyCacheControl(messages: ModelMessage[], modelString: string): ModelMessage[] { // Only apply cache control for Anthropic models @@ -77,14 +77,13 @@ export function applyCacheControl(messages: ModelMessage[], modelString: string) return messages; } - // Need at least 2 messages to add a cache breakpoint - if (messages.length < 2) { + // Need at least 1 message to add a cache breakpoint + if (messages.length < 1) { return messages; } - // Add cache breakpoint at the second-to-last message - // This caches everything up to (but not including) the current user message - const cacheIndex = messages.length - 2; + // Add cache breakpoint at the last message + const cacheIndex = messages.length - 1; return messages.map((msg, index) => { if (index === cacheIndex) { diff --git a/src/node/services/aiService.ts b/src/node/services/aiService.ts index 2995d46fb..6fd672f06 100644 --- a/src/node/services/aiService.ts +++ b/src/node/services/aiService.ts @@ -102,7 +102,7 @@ if (typeof globalFetchWithExtras.certificate === "function") { * * Injects cache_control on: * 1. Last tool (caches all tool definitions) - * 2. Second-to-last message's last content part (caches conversation history) + * 2. 
Last message's last content part (caches entire conversation) */ function wrapFetchWithAnthropicCacheControl(baseFetch: typeof fetch): typeof fetch { const cachingFetch = async ( @@ -123,11 +123,11 @@ function wrapFetchWithAnthropicCacheControl(baseFetch: typeof fetch): typeof fet lastTool.cache_control ??= { type: "ephemeral" }; } - // Inject cache_control on second-to-last message's last content part - // This caches conversation history up to (but not including) the current user message - if (Array.isArray(json.messages) && json.messages.length >= 2) { - const secondToLastMsg = json.messages[json.messages.length - 2] as Record<string, unknown>; - const content = secondToLastMsg.content; + // Inject cache_control on last message's last content part + // This caches the entire conversation + if (Array.isArray(json.messages) && json.messages.length >= 1) { + const lastMsg = json.messages[json.messages.length - 1] as Record<string, unknown>; + const content = lastMsg.content; if (Array.isArray(content) && content.length > 0) { // Array content: add cache_control to last part