API Token Limiting (#266)
This PR adds automatic handling of large API responses by chunking them
and exposing a `loadBySimilarity` function to the LLM, backed by the Cohere
reranker. The reranker is proxied through the Fixie server (PR
fixie-ai/fixie#1573).
---------
Co-authored-by: Peter Salas <peter@fixie.ai>
Co-authored-by: Nick Heiner <nick@fixie.ai>
farzadab committed Sep 14, 2023
1 parent ab94381 commit 49ecdad
Showing 13 changed files with 640 additions and 232 deletions.
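Taken together, the pieces below are intended to compose roughly as in the following sketch. The wrapper props, the `ExecuteFunction` usage, and the `api_key` field come from this diff; the import specifiers, the `lookupOrders` tool, and the Provider usage and value shape for `cohereContext` are illustrative assumptions, not part of this change.

```tsx
/** @jsxImportSource ai-jsx */
import { ExecuteFunction } from 'ai-jsx/batteries/use-tools';
import { cohereContext } from 'ai-jsx/lib/cohere';
import { LargeFunctionResponseWrapper } from 'ai-jsx/batteries/sidekick/platform/large-response-handler';

// A hypothetical tool whose response may be arbitrarily large.
async function lookupOrders({ customerId }: { customerId: string }) {
  const res = await fetch(`https://api.example.com/orders?customer=${customerId}`);
  return JSON.stringify(await res.json());
}

const app = (
  // The chunk reranker needs a Cohere API key; without one, oversized responses
  // fall back to plain truncation (see LargeFunctionResponseHandler below).
  <cohereContext.Provider value={{ api_key: process.env.COHERE_API_KEY }}>
    <LargeFunctionResponseWrapper numChunks={4} maxLength={4000} failedMaxLength={2000}>
      <ExecuteFunction func={lookupOrders} name="lookupOrders" args={{ customerId: '123' }} />
    </LargeFunctionResponseWrapper>
  </cohereContext.Provider>
);
```

When the rendered response stays under `maxLength` tokens it is passed through unchanged; otherwise it is re-serialized as YAML and, as a last resort, chunked and redacted so the model can query it via the generated `loadBySimilarity` tool.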
13 changes: 11 additions & 2 deletions packages/ai-jsx/package.json
@@ -4,7 +4,7 @@
"repository": "fixie-ai/ai-jsx",
"bugs": "https://github.com/fixie-ai/ai-jsx/issues",
"homepage": "https://ai-jsx.com",
"version": "0.17.0",
"version": "0.17.1",
"volta": {
"extends": "../../package.json"
},
@@ -198,6 +198,15 @@
"default": "./dist/cjs/lib/openai.cjs"
}
},
"./lib/cohere": {
"import": {
"types": "./dist/esm/lib/cohere.d.ts",
"default": "./dist/esm/lib/cohere.js"
},
"require": {
"default": "./dist/cjs/lib/cohere.cjs"
}
},
"./lib/anthropic": {
"import": {
"types": "./dist/esm/lib/anthropic.d.ts",
@@ -386,7 +395,7 @@
"ink": "^4.2.0",
"js-tiktoken": "^1.0.7",
"js-yaml": "^4.1.0",
"langchain": "^0.0.81",
"langchain": "^0.0.143",
"lodash": "^4.17.21",
"ml-distance": "^4.0.1",
"openai": "^4.1.0",
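With the new `./lib/cohere` subpath export above, the Cohere helpers become importable from the package root. A one-line sketch of the expected consumer-side import (the named exports shown are the ones referenced later in this diff; the full export set may differ):

```ts
import { cohereContext, RerankerFormatted, MarkdownChunkFormatter } from 'ai-jsx/lib/cohere';
```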
28 changes: 15 additions & 13 deletions packages/ai-jsx/src/batteries/docs.tsx
@@ -739,21 +739,23 @@ const defaultLangchainChunkLimit = 4;
async function searchVectorStore<ChunkMetadata extends Jsonifiable = Jsonifiable>(
vectorStore: VectorStore,
query: string,
params?: { limit?: number; score_threshold?: number }
params?: { limit?: number; score_threshold?: number; filter?: any }
): Promise<ScoredChunk<ChunkMetadata>[]> {
const k = params?.limit ?? defaultLangchainChunkLimit;
const scoredLcDocs = await vectorStore.similaritySearchWithScore(query, k, _.omit(params, 'limit'));
return scoredLcDocs.map((lcDocAndScore) => {
const lcDoc = lcDocAndScore[0];
return {
score: lcDocAndScore[1],
chunk: {
// TODO: Wrap chunker to track document name in ChunkMetadata in a way we can pull back out here.
content: lcDoc.pageContent,
metadata: lcDoc.metadata as ChunkMetadata,
},
} as ScoredChunk<ChunkMetadata>;
});
const scoredLcDocs = await vectorStore.similaritySearchWithScore(query, k, params?.filter);
return scoredLcDocs
.map((lcDocAndScore) => {
const lcDoc = lcDocAndScore[0];
return {
score: lcDocAndScore[1],
chunk: {
// TODO: Wrap chunker to track document name in ChunkMetadata in a way we can pull back out here.
content: lcDoc.pageContent,
metadata: lcDoc.metadata as ChunkMetadata,
},
} as ScoredChunk<ChunkMetadata>;
})
.filter((chunk) => chunk.score >= (params?.score_threshold ?? Number.MIN_VALUE));
}

/** A default component for formatting document chunks. */
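For illustration, a hypothetical call showing how the updated parameters behave: `limit` caps how many chunks are requested from the LangChain vector store, `filter` is now forwarded directly to `similaritySearchWithScore`, and `score_threshold` prunes low-scoring chunks after the search. (`searchVectorStore` is internal to docs.tsx, and the filter shape depends on the underlying VectorStore implementation.)

```ts
// `vectorStore` stands in for any LangChain VectorStore instance (placeholder).
const chunks = await searchVectorStore<{ source: string }>(vectorStore, 'How do refunds work?', {
  limit: 3,                      // request at most 3 chunks from the vector store
  score_threshold: 0.75,         // then drop anything scoring below 0.75
  filter: { source: 'billing' }, // passed through to similaritySearchWithScore
});
```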
33 changes: 30 additions & 3 deletions packages/ai-jsx/src/batteries/sidekick/platform/conversation.tsx
@@ -12,7 +12,9 @@ import {
renderToConversation,
SystemMessage,
} from '../../../core/conversation.js';
import { LargeFunctionResponseWrapper, redactedFunctionTools } from './large-response-handler.js';
import { ExecuteFunction, UseToolsProps } from '../../use-tools.js';
import _ from 'lodash';

/**
* This function defines the shrinking policy. It's activated when the conversation history overflows the context
@@ -81,15 +83,40 @@ export function getNextConversationStep(
) {
const shrinkableConversation = getShrinkableConversation(messages, fullConversation);
const lastMessage = messages[messages.length - 1];

// Add tools for interacting with redacted function responses (if one exists).
// We will only take into account the current round of messages (after the last UserMessage). In the next round
// the LLM will need to call the function again. This is to prevent the LLM from accessing stale data.
const lastTurnMessages = _.takeRightWhile(fullConversation, ({ type }) => type !== 'user');
const updatedTools = { ...tools, ...redactedFunctionTools(lastTurnMessages) };

switch (lastMessage.type) {
case 'functionCall': {
const { name, args } = lastMessage.element.props;
return <ExecuteFunction func={tools[name].func} name={name} args={args} />;
const executedFunction = (
<ExecuteFunction
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
func={updatedTools[name]?.func}
name={name}
args={args}
/>
);
// If we are using a tool based on redacted functions, we don't want to redact it further
if (!(name in tools)) {
return executedFunction;
}
// Function responses can potentially be very large. In that case, we need
// some way of handling that so the context window doesn't blow up.
return (
<LargeFunctionResponseWrapper numChunks={4} maxLength={4000} failedMaxLength={2000}>
{executedFunction}
</LargeFunctionResponseWrapper>
);
}
case 'functionResponse':
return (
<RepairMdxInConversation>
<ChatCompletion functionDefinitions={tools}>
<ChatCompletion functionDefinitions={updatedTools}>
{shrinkableConversation}
{finalSystemMessageBeforeResponse}
</ChatCompletion>
@@ -98,7 +125,7 @@
case 'user':
return (
<RepairMdxInConversation>
<ChatCompletion functionDefinitions={tools}>{shrinkableConversation}</ChatCompletion>
<ChatCompletion functionDefinitions={updatedTools}>{shrinkableConversation}</ChatCompletion>
</RepairMdxInConversation>
);
default:
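To make the "current round" selection in `getNextConversationStep` concrete, here is a small sketch with hypothetical message objects showing what `_.takeRightWhile` keeps:

```ts
import _ from 'lodash';

// Only messages after the last 'user' message are scanned for redacted responses,
// so a redacted response from an earlier turn cannot expose a stale loadBySimilarity tool.
const fullConversation = [
  { type: 'user' },             // previous turn
  { type: 'functionCall' },
  { type: 'functionResponse' }, // possibly redacted, but now stale
  { type: 'user' },             // latest user message
  { type: 'functionCall' },
  { type: 'functionResponse' }, // only this one can contribute a loadBySimilarity tool
];
const lastTurnMessages = _.takeRightWhile(fullConversation, ({ type }) => type !== 'user');
// lastTurnMessages is the final functionCall/functionResponse pair.
```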
248 changes: 248 additions & 0 deletions packages/ai-jsx/src/batteries/sidekick/platform/large-response-handler.tsx
@@ -0,0 +1,248 @@
import * as AI from '../../../index.js';
import { ConversationMessage, FunctionResponse, renderToConversation } from '../../../core/conversation.js';

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { getEncoding, Tiktoken } from 'js-tiktoken';
import yaml from 'js-yaml';
import _ from 'lodash';
import { UseToolsProps } from '../../use-tools.js';
import { cohereContext, MarkdownChunkFormatter, RerankerFormatted } from '../../../lib/cohere.js';
import { Jsonifiable } from 'type-fest';

export interface RedactedFuncionResponseMetadata {
isRedacted: true;
chunks: string[];
}

const getOpenAIEncoder = _.once(() => getEncoding('cl100k_base'));

function tokenCount(text: string, encoder: Tiktoken) {
return encoder.encode(text).length;
}

const TRUNCATION_SUFFIX = '\n\n...[value-truncated:too-large]...';

export async function TruncateByChars(
{
children,
maxLength,
}: {
children: AI.Node;
maxLength: number;
},
{ render }: AI.ComponentContext
) {
const stringified = await render(children);
if (stringified.length <= maxLength) {
return stringified;
}
return `${stringified.slice(0, maxLength - TRUNCATION_SUFFIX.length)}${TRUNCATION_SUFFIX}`;
}

export async function TruncateByTokens(
{
children,
maxLength,
encoder = getOpenAIEncoder(),
}: {
children: AI.Node;
maxLength: number;
encoder?: Tiktoken;
},
{ render }: AI.ComponentContext
) {
const stringified = await render(children);
if (tokenCount(stringified, encoder) <= maxLength) {
return stringified;
}
const budget = maxLength - tokenCount(TRUNCATION_SUFFIX, encoder);

return encoder.decode(encoder.encode(stringified).slice(0, budget)) + TRUNCATION_SUFFIX;
}

export interface LargeFunctionResponseProps {
maxLength: number;
failedMaxLength: number;
numChunks: number;
encoder?: Tiktoken;
}

async function LargeFunctionResponseHandler(
{
children,
maxLength = 4000,
failedMaxLength = 1000,
numChunks = 4,
encoder = getOpenAIEncoder(),
...props
}: AI.PropsOfComponent<typeof FunctionResponse> & LargeFunctionResponseProps,
{ render, logger, getContext }: AI.ComponentContext
) {
if (props.failed) {
return (
// TODO: fix issue between maxLength chars and tokens
<FunctionResponse {...props}>
<TruncateByChars maxLength={failedMaxLength}>{children}</TruncateByChars>
</FunctionResponse>
);
}

let stringified = await render(children);

// Option 1: do nothing if it's already small enough
if (tokenCount(stringified, encoder) <= maxLength) {
return <FunctionResponse {...props}>{stringified}</FunctionResponse>;
}

stringified = yamlOptimizeIfPossible(stringified);

// Option 2: try dumping as YAML. If it's small enough, then we are done.
if (tokenCount(stringified, encoder) <= maxLength) {
return <FunctionResponse {...props}>{stringified}</FunctionResponse>;
}

// Option 3 (last resort): split into chunks and allow the LLM to query by similarity
const cohereConfig = getContext(cohereContext);

if (!cohereConfig.api_key) {
// We require a Cohere API key for doing similarity search.
// If it's not set, we fall back to truncating the response.
logger.warn(
{ CohereContext: cohereConfig },
'FunctionResponse is too big, but the Cohere API key is not set. Please set it in the context. ' +
'Falling back to truncating the response.'
);
return (
<FunctionResponse {...props} metadata={{ isRedacted: true }}>
<TruncateByTokens maxLength={maxLength} encoder={encoder}>
{stringified}
</TruncateByTokens>
</FunctionResponse>
);
}

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: maxLength / numChunks,
chunkOverlap: maxLength / numChunks / 10,
lengthFunction: (x) => tokenCount(x, encoder),
});
const chunks = await splitter.splitText(stringified);

return (
<FunctionResponse {...props} metadata={{ isRedacted: true, chunks }}>
... The response is too big and hence redacted. The response can be queried using semantic similarity search by
calling the `loadBySimilarity` function.
</FunctionResponse>
);
}

/**
* This function allows wrapping {@link FunctionResponse} elements that can possibly be too large.
* It will replace FunctionResponse elements with {@link LargeFunctionResponseHandler}s that know how to handle large responses.
*
* {@link LargeFunctionResponseHandler} will not modify responses that are not large. If they are, it will first try
* to optimize the response by dumping it as YAML, which is more token-efficient than JSON.
* If that doesn't work, it will split the response into chunks and allow the LLM to query it using semantic similarity
* search by exposing a dynamic function called `loadBySimilarity(query)`.
*
* Note that failed responses are not optimized and will simply be truncated to `failedMaxLength` characters.
*
* @see {@link redactedFunctionTools} to see how to add the `loadBySimilarity` function.
*
* @example
* ```tsx
* <LargeFunctionResponseWrapper maxLength={4000} failedMaxLength={1000} numChunks={4}>
* <ExecuteFunction func={tool[name].func} name={name} args={args} />
* </LargeFunctionResponseWrapper>
* ```
*/
export async function LargeFunctionResponseWrapper(
{ children, ...props }: { children: AI.Node } & LargeFunctionResponseProps,
{ render }: AI.ComponentContext
) {
// We need to render the children to get the FunctionResponse elements
const messages = await renderToConversation(children, render);

// We expect messages to contain a single FunctionResponse, but we handle multiple just in case
return messages.map((msg) =>
msg.type == 'functionResponse' ? <LargeFunctionResponseHandler {...props} {...msg.element.props} /> : msg.element
);
}

function getLastRedactedFnResponseData(messages: ConversationMessage[]): RedactedFuncionResponseMetadata | undefined {
const metadataOrUndefined = messages.map((msg) =>
msg.type == 'functionResponse' &&
typeof msg.element.props.metadata !== 'undefined' &&
'isRedacted' in msg.element.props.metadata &&
Boolean(msg.element.props.metadata.isRedacted) &&
'chunks' in msg.element.props.metadata &&
Array.isArray(msg.element.props.metadata.chunks) &&
msg.element.props.metadata.chunks.every((chunk) => typeof chunk === 'string')
? (msg.element.props.metadata as unknown as RedactedFuncionResponseMetadata)
: undefined
);
return _.findLast(metadataOrUndefined, Boolean);
}

export function redactedFunctionTools(messages: ConversationMessage[]): UseToolsProps['tools'] {
const responseContent = getLastRedactedFnResponseData(messages);
if (!responseContent) {
return {};
}
// TODO: it is possible for this formulation to confuse the model when multiple redacted responses are present.
// We should consider adding an argument or improve prompts to disambiguate if the need arises.
return {
loadBySimilarity: {
description: 'Query the response of the "latest redacted function call" by using semantic similarity search.',
parameters: {
query: {
type: 'string',
description: 'A query string.',
required: true,
},
},
func: ({ query }) => (
<RerankerFormatted
query={query}
documents={responseContent.chunks}
top_n={2}
Formatter={MarkdownChunkFormatter}
/>
),
},
};
}

/**
* YAML is more token-efficient than JSON, hence we try to convert to YAML if possible.
* Note that minifying JSON might reduce tokens, but it is also more confusing for LLMs.
* To get the best of both worlds, we use YAML with a large line width and a max flow level.
* This means that after a certain nesting level, YAML switches to inline format.
*
* Here is a comparison for a large test object:
* - JSON multi-line # tokens: 36175
* - JSON minified # tokens: 15006
* - YAML # tokens: 14546
* - YAML # tokens flowLevel=4: 13833
*
* Also see the following for more comparisons:
* [Internal] https://www.notion.so/fixieai/API-Response-Token-Limiting-2ba2a63b047044599370c2f26fcf2bfa
* [Public] https://nikas.praninskas.com/ai/2023/04/05/efficient-gpt-data-formats/
* [Public] https://betterprogramming.pub/yaml-vs-json-which-is-more-efficient-for-language-models-5bc11dd0f6df
*
* @param possiblyObjectText The text to optimize (in JSON/YAML format)
* @returns Optimized text if possible, otherwise the original text
*/
function yamlOptimizeIfPossible(possiblyObjectText: string) {
let content: Jsonifiable;
try {
content = JSON.parse(possiblyObjectText);
} catch (e) {
try {
content = yaml.load(possiblyObjectText) as Jsonifiable;
} catch (e) {
return possiblyObjectText;
}
}
return yaml.dump(content, { lineWidth: 200, flowLevel: 4 });
}
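As a small illustration of the trade-off described above (a made-up object; exact formatting depends on the js-yaml version), `flowLevel` keeps shallow structure in block style and switches deeply nested values to the compact inline form:

```ts
import yaml from 'js-yaml';

const response = {
  orders: [{ id: 42, items: [{ sku: 'A-1', price: { amount: 10, currency: 'USD' } }] }],
};

console.log(yaml.dump(response, { lineWidth: 200, flowLevel: 4 }));
// Approximate output:
// orders:
//   - id: 42
//     items:
//       - {sku: A-1, price: {amount: 10, currency: USD}}
```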
3 changes: 3 additions & 0 deletions packages/ai-jsx/src/core/errors.ts
@@ -40,6 +40,9 @@ export enum ErrorCode {

OpenAIAPIError = 1032,

CohereAPIError = 1033,
MissingRerankerModel = 1034,

ModelOutputDidNotMatchConstraint = 2000,

UnsupportedMimeType = 2001,

3 comments on commit 49ecdad

@vercel vercel bot commented on 49ecdad Sep 14, 2023


Successfully deployed to the following URLs:

ai-jsx-docs – ./packages/docs

ai-jsx-docs-fixie-ai.vercel.app
ai-jsx-docs-git-main-fixie-ai.vercel.app
ai-jsx-docs.vercel.app
docs.ai-jsx.com

@vercel vercel bot commented on 49ecdad Sep 14, 2023


Successfully deployed to the following URLs:

ai-jsx-tutorial-nextjs – ./packages/tutorial-nextjs

ai-jsx-tutorial-nextjs-git-main-fixie-ai.vercel.app
ai-jsx-tutorial-nextjs.vercel.app
ai-jsx-tutorial-nextjs-fixie-ai.vercel.app

@vercel vercel bot commented on 49ecdad Sep 14, 2023


Successfully deployed to the following URLs:

ai-jsx-nextjs-demo – ./packages/nextjs-demo

ai-jsx-nextjs-demo.vercel.app
ai-jsx-nextjs-demo-fixie-ai.vercel.app
ai-jsx-nextjs-demo-git-main-fixie-ai.vercel.app
