API Token Limiting (#266)
This PR adds automatic handling of large API responses by chunking them
and exposing a `loadBySimilarity` function to the LLM, backed by the Cohere
reranker. The reranker is proxied through the Fixie server (PR
fixie-ai/fixie#1573).
---------
Co-authored-by: Peter Salas <peter@fixie.ai>
Co-authored-by: Nick Heiner <nick@fixie.ai>
farzadab committed Sep 14, 2023
1 parent ab94381 commit 49ecdad
Showing 13 changed files with 640 additions and 232 deletions.
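Taken together, the pieces below are intended to compose roughly as in the following sketch. The wrapper props, the `ExecuteFunction` usage, and the `api_key` field come from this diff; the import specifiers, the `lookupOrders` tool, and the Provider usage and value shape for `cohereContext` are illustrative assumptions, not part of this change.

```tsx
/** @jsxImportSource ai-jsx */
import { ExecuteFunction } from 'ai-jsx/batteries/use-tools';
import { cohereContext } from 'ai-jsx/lib/cohere';
import { LargeFunctionResponseWrapper } from 'ai-jsx/batteries/sidekick/platform/large-response-handler';

// A hypothetical tool whose response may be arbitrarily large.
async function lookupOrders({ customerId }: { customerId: string }) {
  const res = await fetch(`https://api.example.com/orders?customer=${customerId}`);
  return JSON.stringify(await res.json());
}

const app = (
  // The chunk reranker needs a Cohere API key; without one, oversized responses
  // fall back to plain truncation (see LargeFunctionResponseHandler below).
  <cohereContext.Provider value={{ api_key: process.env.COHERE_API_KEY }}>
    <LargeFunctionResponseWrapper numChunks={4} maxLength={4000} failedMaxLength={2000}>
      <ExecuteFunction func={lookupOrders} name="lookupOrders" args={{ customerId: '123' }} />
    </LargeFunctionResponseWrapper>
  </cohereContext.Provider>
);
```

When the rendered response stays under `maxLength` tokens it is passed through unchanged; otherwise it is re-serialized as YAML and, as a last resort, chunked and redacted so the model can query it via the generated `loadBySimilarity` tool.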
13 changes: 11 additions & 2 deletions packages/ai-jsx/package.json
@@ -4,7 +4,7 @@
"repository": "fixie-ai/ai-jsx",
"bugs": "https://github.com/fixie-ai/ai-jsx/issues",
"homepage": "https://ai-jsx.com",
"version": "0.17.0",
"version": "0.17.1",
"volta": {
"extends": "../../package.json"
},
@@ -198,6 +198,15 @@
"default": "./dist/cjs/lib/openai.cjs"
}
},
"./lib/cohere": {
"import": {
"types": "./dist/esm/lib/cohere.d.ts",
"default": "./dist/esm/lib/cohere.js"
},
"require": {
"default": "./dist/cjs/lib/cohere.cjs"
}
},
"./lib/anthropic": {
"import": {
"types": "./dist/esm/lib/anthropic.d.ts",
@@ -386,7 +395,7 @@
"ink": "^4.2.0",
"js-tiktoken": "^1.0.7",
"js-yaml": "^4.1.0",
"langchain": "^0.0.81",
"langchain": "^0.0.143",
"lodash": "^4.17.21",
"ml-distance": "^4.0.1",
"openai": "^4.1.0",
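With the new `./lib/cohere` subpath export above, the Cohere helpers become importable from the package root. A one-line sketch of the expected consumer-side import (the named exports shown are the ones referenced later in this diff; the full export set may differ):

```ts
import { cohereContext, RerankerFormatted, MarkdownChunkFormatter } from 'ai-jsx/lib/cohere';
```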
28 changes: 15 additions & 13 deletions packages/ai-jsx/src/batteries/docs.tsx
@@ -739,21 +739,23 @@ const defaultLangchainChunkLimit = 4;
async function searchVectorStore<ChunkMetadata extends Jsonifiable = Jsonifiable>(
vectorStore: VectorStore,
query: string,
params?: { limit?: number; score_threshold?: number }
params?: { limit?: number; score_threshold?: number; filter?: any }
): Promise<ScoredChunk<ChunkMetadata>[]> {
const k = params?.limit ?? defaultLangchainChunkLimit;
const scoredLcDocs = await vectorStore.similaritySearchWithScore(query, k, _.omit(params, 'limit'));
return scoredLcDocs.map((lcDocAndScore) => {
const lcDoc = lcDocAndScore[0];
return {
score: lcDocAndScore[1],
chunk: {
// TODO: Wrap chunker to track document name in ChunkMetadata in a way we can pull back out here.
content: lcDoc.pageContent,
metadata: lcDoc.metadata as ChunkMetadata,
},
} as ScoredChunk<ChunkMetadata>;
});
const scoredLcDocs = await vectorStore.similaritySearchWithScore(query, k, params?.filter);
return scoredLcDocs
.map((lcDocAndScore) => {
const lcDoc = lcDocAndScore[0];
return {
score: lcDocAndScore[1],
chunk: {
// TODO: Wrap chunker to track document name in ChunkMetadata in a way we can pull back out here.
content: lcDoc.pageContent,
metadata: lcDoc.metadata as ChunkMetadata,
},
} as ScoredChunk<ChunkMetadata>;
})
.filter((chunk) => chunk.score >= (params?.score_threshold ?? Number.MIN_VALUE));
}

/** A default component for formatting document chunks. */
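For illustration, a hypothetical call showing how the updated parameters behave: `limit` caps how many chunks are requested from the LangChain vector store, `filter` is now forwarded directly to `similaritySearchWithScore`, and `score_threshold` prunes low-scoring chunks after the search. (`searchVectorStore` is internal to docs.tsx, and the filter shape depends on the underlying VectorStore implementation.)

```ts
// `vectorStore` stands in for any LangChain VectorStore instance (placeholder).
const chunks = await searchVectorStore<{ source: string }>(vectorStore, 'How do refunds work?', {
  limit: 3,                      // request at most 3 chunks from the vector store
  score_threshold: 0.75,         // then drop anything scoring below 0.75
  filter: { source: 'billing' }, // passed through to similaritySearchWithScore
});
```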
33 changes: 30 additions & 3 deletions packages/ai-jsx/src/batteries/sidekick/platform/conversation.tsx
@@ -12,7 +12,9 @@ import {
renderToConversation,
SystemMessage,
} from '../../../core/conversation.js';
import { LargeFunctionResponseWrapper, redactedFunctionTools } from './large-response-handler.js';
import { ExecuteFunction, UseToolsProps } from '../../use-tools.js';
import _ from 'lodash';

/**
* This function defines the shrinking policy. It's activated when the conversation history overflows the context
@@ -81,15 +83,40 @@ export function getNextConversationStep(
) {
const shrinkableConversation = getShrinkableConversation(messages, fullConversation);
const lastMessage = messages[messages.length - 1];

// Add tools for interacting with redacted function responses (if one exists).
// We will only take into account the current round of messages (after the last UserMessage). In the next round
// the LLM will need to call the function again. This is to prevent the LLM from accessing stale data.
const lastTurnMessages = _.takeRightWhile(fullConversation, ({ type }) => type !== 'user');
const updatedTools = { ...tools, ...redactedFunctionTools(lastTurnMessages) };

switch (lastMessage.type) {
case 'functionCall': {
const { name, args } = lastMessage.element.props;
return <ExecuteFunction func={tools[name].func} name={name} args={args} />;
const executedFunction = (
<ExecuteFunction
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
func={updatedTools[name]?.func}
name={name}
args={args}
/>
);
// If we are using a tool based on redacted functions, we don't want to redact it further
if (!(name in tools)) {
return executedFunction;
}
// Function responses can potentially be very large. In that case, we need
// some way of handling that so the context window doesn't blow up.
return (
<LargeFunctionResponseWrapper numChunks={4} maxLength={4000} failedMaxLength={2000}>
{executedFunction}
</LargeFunctionResponseWrapper>
);
}
case 'functionResponse':
return (
<RepairMdxInConversation>
<ChatCompletion functionDefinitions={tools}>
<ChatCompletion functionDefinitions={updatedTools}>
{shrinkableConversation}
{finalSystemMessageBeforeResponse}
</ChatCompletion>
@@ -98,7 +125,7 @@
case 'user':
return (
<RepairMdxInConversation>
<ChatCompletion functionDefinitions={tools}>{shrinkableConversation}</ChatCompletion>
<ChatCompletion functionDefinitions={updatedTools}>{shrinkableConversation}</ChatCompletion>
</RepairMdxInConversation>
);
default:
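To make the "current round" selection in `getNextConversationStep` concrete, here is a small sketch with hypothetical message objects showing what `_.takeRightWhile` keeps:

```ts
import _ from 'lodash';

// Only messages after the last 'user' message are scanned for redacted responses,
// so a redacted response from an earlier turn cannot expose a stale loadBySimilarity tool.
const fullConversation = [
  { type: 'user' },             // previous turn
  { type: 'functionCall' },
  { type: 'functionResponse' }, // possibly redacted, but now stale
  { type: 'user' },             // latest user message
  { type: 'functionCall' },
  { type: 'functionResponse' }, // only this one can contribute a loadBySimilarity tool
];
const lastTurnMessages = _.takeRightWhile(fullConversation, ({ type }) => type !== 'user');
// lastTurnMessages is the final functionCall/functionResponse pair.
```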
248 changes: 248 additions & 0 deletions packages/ai-jsx/src/batteries/sidekick/platform/large-response-handler.tsx
@@ -0,0 +1,248 @@
import * as AI from '../../../index.js';
import { ConversationMessage, FunctionResponse, renderToConversation } from '../../../core/conversation.js';

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import { getEncoding, Tiktoken } from 'js-tiktoken';
import yaml from 'js-yaml';
import _ from 'lodash';
import { UseToolsProps } from '../../use-tools.js';
import { cohereContext, MarkdownChunkFormatter, RerankerFormatted } from '../../../lib/cohere.js';
import { Jsonifiable } from 'type-fest';

export interface RedactedFuncionResponseMetadata {
isRedacted: true;
chunks: string[];
}

const getOpenAIEncoder = _.once(() => getEncoding('cl100k_base'));

function tokenCount(text: string, encoder: Tiktoken) {
return encoder.encode(text).length;
}

const TRUNCATION_SUFFIX = '\n\n...[value-truncated:too-large]...';

export async function TruncateByChars(
{
children,
maxLength,
}: {
children: AI.Node;
maxLength: number;
},
{ render }: AI.ComponentContext
) {
const stringified = await render(children);
if (stringified.length <= maxLength) {
return stringified;
}
return `${stringified.slice(0, maxLength - TRUNCATION_SUFFIX.length)}${TRUNCATION_SUFFIX}`;
}

export async function TruncateByTokens(
{
children,
maxLength,
encoder = getOpenAIEncoder(),
}: {
children: AI.Node;
maxLength: number;
encoder?: Tiktoken;
},
{ render }: AI.ComponentContext
) {
const stringified = await render(children);
if (tokenCount(stringified, encoder) <= maxLength) {
return stringified;
}
const budget = maxLength - tokenCount(TRUNCATION_SUFFIX, encoder);

return encoder.decode(encoder.encode(stringified).slice(0, budget)) + TRUNCATION_SUFFIX;
}

export interface LargeFunctionResponseProps {
maxLength: number;
failedMaxLength: number;
numChunks: number;
encoder?: Tiktoken;
}

async function LargeFunctionResponseHandler(
{
children,
maxLength = 4000,
failedMaxLength = 1000,
numChunks = 4,
encoder = getOpenAIEncoder(),
...props
}: AI.PropsOfComponent<typeof FunctionResponse> & LargeFunctionResponseProps,
{ render, logger, getContext }: AI.ComponentContext
) {
if (props.failed) {
return (
// TODO: fix issue between maxLength chars and tokens
<FunctionResponse {...props}>
<TruncateByChars maxLength={failedMaxLength}>{children}</TruncateByChars>
</FunctionResponse>
);
}

let stringified = await render(children);

// Option 1: do nothing if it's already small enough
if (tokenCount(stringified, encoder) <= maxLength) {
return <FunctionResponse {...props}>{stringified}</FunctionResponse>;
}

stringified = yamlOptimizeIfPossible(stringified);

// Option 2: try dumping as YAML. If it's small enough, then we are done.
if (tokenCount(stringified, encoder) <= maxLength) {
return <FunctionResponse {...props}>{stringified}</FunctionResponse>;
}

// Option 3 (last resort): split into chunks and allow the LLM to query by similarity
const cohereConfig = getContext(cohereContext);

if (!cohereConfig.api_key) {
// We require a Cohere API key for doing similarity search.
// If it's not set, we fall back to truncating the response.
logger.warn(
{ CohereContext: cohereConfig },
'FunctionResponse is too big, but the Cohere API key is not set. Please set it in the context. ' +
'Falling back to truncating the response.'
);
return (
<FunctionResponse {...props} metadata={{ isRedacted: true }}>
<TruncateByTokens maxLength={maxLength} encoder={encoder}>
{stringified}
</TruncateByTokens>
</FunctionResponse>
);
}

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: maxLength / numChunks,
chunkOverlap: maxLength / numChunks / 10,
lengthFunction: (x) => tokenCount(x, encoder),
});
const chunks = await splitter.splitText(stringified);

return (
<FunctionResponse {...props} metadata={{ isRedacted: true, chunks }}>
... The response is too big and hence redacted. The response can be queried using semantic similarity search by
calling the `loadBySimilarity` function.
</FunctionResponse>
);
}

/**
* This function allows wrapping {@link FunctionResponse} elements that can possibly be too large.
* It will replace FunctionResponse elements with {@link LargeFunctionResponseHandler}s that know how to handle large responses.
*
* {@link LargeFunctionResponseHandler} will not modify responses that are not large. If they are, it will first try
* to optimize the response by dumping it as YAML, which is more token-efficient than JSON.
* If that doesn't work, it will split the response into chunks and allow the LLM to query it using semantic similarity
* search by exposing a dynamic function called `loadBySimilarity(query)`.
*
* Note that failed responses are not optimized and will simply be truncated to `failedMaxLength` characters.
*
* @see {@link redactedFunctionTools} to see how to add the `loadBySimilarity` function.
*
* @example
* ```tsx
* <LargeFunctionResponseWrapper maxLength={4000} failedMaxLength={1000} numChunks={4}>
* <ExecuteFunction func={tool[name].func} name={name} args={args} />
* </LargeFunctionResponseWrapper>
* ```
*/
export async function LargeFunctionResponseWrapper(
{ children, ...props }: { children: AI.Node } & LargeFunctionResponseProps,
{ render }: AI.ComponentContext
) {
// We need to render the children to get the FunctionResponse elements
const messages = await renderToConversation(children, render);

// We expect messages to contain a single FunctionResponse, but we handle multiple just in case
return messages.map((msg) =>
msg.type == 'functionResponse' ? <LargeFunctionResponseHandler {...props} {...msg.element.props} /> : msg.element
);
}

function getLastRedactedFnResponseData(messages: ConversationMessage[]): RedactedFuncionResponseMetadata | undefined {
const metadataOrUndefined = messages.map((msg) =>
msg.type == 'functionResponse' &&
typeof msg.element.props.metadata !== 'undefined' &&
'isRedacted' in msg.element.props.metadata &&
Boolean(msg.element.props.metadata.isRedacted) &&
'chunks' in msg.element.props.metadata &&
Array.isArray(msg.element.props.metadata.chunks) &&
msg.element.props.metadata.chunks.every((chunk) => typeof chunk === 'string')
? (msg.element.props.metadata as unknown as RedactedFuncionResponseMetadata)
: undefined
);
return _.findLast(metadataOrUndefined, Boolean);
}

export function redactedFunctionTools(messages: ConversationMessage[]): UseToolsProps['tools'] {
const responseContent = getLastRedactedFnResponseData(messages);
if (!responseContent) {
return {};
}
// TODO: it is possible for this formulation to confuse the model when multiple redacted responses are present.
// We should consider adding an argument or improve prompts to disambiguate if the need arises.
return {
loadBySimilarity: {
description: 'Query the response of the "latest redacted function call" by using semantic similarity search.',
parameters: {
query: {
type: 'string',
description: 'A query string.',
required: true,
},
},
func: ({ query }) => (
<RerankerFormatted
query={query}
documents={responseContent.chunks}
top_n={2}
Formatter={MarkdownChunkFormatter}
/>
),
},
};
}

/**
* YAML is more token-efficient than JSON, hence we try to convert to YAML if possible.
* Note that minifying JSON might reduce tokens, but it is also more confusing for LLMs.
* To get the best of both worlds, we use YAML with a large line width and a max flow level.
* This means that after a certain nesting level, YAML switches to inline format.
*
* Here is a comparison for a large test object:
* - JSON multi-line # tokens: 36175
* - JSON minified # tokens: 15006
* - YAML # tokens: 14546
* - YAML # tokens flowLevel=4: 13833
*
* Also see the following for more comparisons:
* [Internal] https://www.notion.so/fixieai/API-Response-Token-Limiting-2ba2a63b047044599370c2f26fcf2bfa
* [Public] https://nikas.praninskas.com/ai/2023/04/05/efficient-gpt-data-formats/
* [Public] https://betterprogramming.pub/yaml-vs-json-which-is-more-efficient-for-language-models-5bc11dd0f6df
*
* @param possiblyObjectText The text to optimize (in JSON/YAML format)
* @returns Optimized text if possible, otherwise the original text
*/
function yamlOptimizeIfPossible(possiblyObjectText: string) {
let content: Jsonifiable;
try {
content = JSON.parse(possiblyObjectText);
} catch (e) {
try {
content = yaml.load(possiblyObjectText) as Jsonifiable;
} catch (e) {
return possiblyObjectText;
}
}
return yaml.dump(content, { lineWidth: 200, flowLevel: 4 });
}
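As a small illustration of the trade-off described above (a made-up object; exact formatting depends on the js-yaml version), `flowLevel` keeps shallow structure in block style and switches deeply nested values to the compact inline form:

```ts
import yaml from 'js-yaml';

const response = {
  orders: [{ id: 42, items: [{ sku: 'A-1', price: { amount: 10, currency: 'USD' } }] }],
};

console.log(yaml.dump(response, { lineWidth: 200, flowLevel: 4 }));
// Approximate output:
// orders:
//   - id: 42
//     items:
//       - {sku: A-1, price: {amount: 10, currency: USD}}
```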
3 changes: 3 additions & 0 deletions packages/ai-jsx/src/core/errors.ts
@@ -40,6 +40,9 @@ export enum ErrorCode {

OpenAIAPIError = 1032,

CohereAPIError = 1033,
MissingRerankerModel = 1034,

ModelOutputDidNotMatchConstraint = 2000,

UnsupportedMimeType = 2001,

3 comments on commit 49ecdad

@vercel vercel bot commented on 49ecdad Sep 14, 2023


Successfully deployed to the following URLs:

ai-jsx-docs – ./packages/docs

ai-jsx-docs-fixie-ai.vercel.app
ai-jsx-docs-git-main-fixie-ai.vercel.app
ai-jsx-docs.vercel.app
docs.ai-jsx.com

@vercel vercel bot commented on 49ecdad Sep 14, 2023


Successfully deployed to the following URLs:

ai-jsx-tutorial-nextjs – ./packages/tutorial-nextjs

ai-jsx-tutorial-nextjs-git-main-fixie-ai.vercel.app
ai-jsx-tutorial-nextjs.vercel.app
ai-jsx-tutorial-nextjs-fixie-ai.vercel.app

@vercel vercel bot commented on 49ecdad Sep 14, 2023


Successfully deployed to the following URLs:

ai-jsx-nextjs-demo – ./packages/nextjs-demo

ai-jsx-nextjs-demo.vercel.app
ai-jsx-nextjs-demo-fixie-ai.vercel.app
ai-jsx-nextjs-demo-git-main-fixie-ai.vercel.app
