Skip to content

Commit

Permalink
Add DocsQAWithSources component (#207)
Browse files Browse the repository at this point in the history
This extends DocsQA by encouraging the LLM to provide citations for its
answer and allowing the developer to format the result as they like.
  • Loading branch information
mdepinet committed Jul 21, 2023
1 parent c925534 commit 80e25c7
Show file tree
Hide file tree
Showing 5 changed files with 188 additions and 26 deletions.
2 changes: 1 addition & 1 deletion packages/ai-jsx/package.json
Expand Up @@ -4,7 +4,7 @@
"repository": "fixie-ai/ai-jsx",
"bugs": "https://github.com/fixie-ai/ai-jsx/issues",
"homepage": "https://ai-jsx.com",
"version": "0.5.12",
"version": "0.5.13",
"volta": {
"extends": "../../package.json"
},
Expand Down
75 changes: 74 additions & 1 deletion packages/ai-jsx/src/batteries/docs.tsx
Expand Up @@ -11,10 +11,13 @@ import { VectorStore } from 'langchain/vectorstores';
import _ from 'lodash';
import { similarity } from 'ml-distance';
import { Jsonifiable } from 'type-fest';
import z from 'zod';
import { ChatCompletion, SystemMessage, UserMessage } from '../core/completion.js';
import { AIJSXError, ErrorCode } from '../core/errors.js';
import * as AI from '../index.js';
import { Node } from '../index.js';
import { getEnvVar } from '../lib/util.js';
import { AIJSXError, ErrorCode } from '../core/errors.js';
import { JsonChatCompletion } from './constrained-output.js';

/**
* A raw document loaded from an arbitrary source that has not yet been parsed.
Expand Down Expand Up @@ -765,6 +768,23 @@ export interface DocsQAProps<ChunkMetadata extends Jsonifiable = Jsonifiable> {
chunkFormatter?: (props: { doc: ScoredChunk<ChunkMetadata> }) => Node;
}

/**
 * Props for {@link DocsQAWithCitations}: the same options as {@link DocsQA}, plus an
 * optional formatter for presenting the structured answer-with-sources result.
 */
export interface DocsQAWithCitationsProps<ChunkMetadata extends Jsonifiable = Jsonifiable>
  extends DocsQAProps<ChunkMetadata> {
  /**
   * The component used to format results from a DocsQAWithCitations query.
   *
   * Receives the parsed {@link QAWithCitationsResult} (an answer plus an optional list
   * of sources) and returns the Node to render. When omitted, a default plain-text
   * formatter is used that appends a "Sources:" list whenever sources are present.
   *
   * @example
   * ```tsx
   * function FormatQAResult(result: QAWithCitationsResult) {
   *   if (result.sources?.length) {
   *     return `${result.answer}\n\nSources:\n${result.sources.join('\n')}`;
   *   }
   *   return result.answer;
   * }
   * ```
   */
  resultFormatter?: (result: QAWithCitationsResult) => Node;
}

/**
* A component that can be used to answer questions about documents. This is a very common usecase for LLMs.
* @example
Expand All @@ -789,3 +809,56 @@ export async function DocsQA<ChunkMetadata extends Jsonifiable = Jsonifiable>(pr
</ChatCompletion>
);
}

/**
 * Schema the LLM's JSON answer must conform to.
 *
 * `sources` is optional: every consumer already treats it as possibly absent
 * (`QAWithCitationsResult` is a `Partial`, and the formatters check
 * `result.sources?.length`), and an honest "I don't know" answer cites nothing.
 * The previous `.required({ answer: true })` was a no-op — fields declared with
 * `z.object` are required by default — while still rejecting any response that
 * lacked `sources`.
 */
const ResultSchema = z.object({
  answer: z.string().describe("The answer to the user's question"),
  sources: z
    .array(z.string())
    .optional()
    .describe('The title or URL of each document used to answer the question'),
});

/** Parsed result of a DocsQAWithCitations query. All fields may be absent while streaming. */
export type QAWithCitationsResult = Partial<z.infer<typeof ResultSchema>>;

/**
 * Default result formatter for DocsQAWithCitations: renders the answer as plain
 * text, followed by a newline-separated "Sources:" list when any sources were cited.
 */
function DefaultQAResultFormatter(result: QAWithCitationsResult) {
  const { answer, sources } = result;
  // No citations (missing or empty list): just show the answer.
  if (sources === undefined || sources.length === 0) {
    return answer;
  }
  return `${answer}\n\nSources:\n${sources.join('\n')}`;
}

/**
 * Similar to {@link DocsQA}, but encourages the LLM to return citations for its answer.
 *
 * The completion is constrained via {@link JsonChatCompletion} to emit JSON matching
 * `ResultSchema`. Each streamed frame is parsed and passed through the result
 * formatter; frames that are not yet valid JSON are yielded as raw text so streaming
 * output stays live.
 */
export async function* DocsQAWithCitations<ChunkMetadata extends Jsonifiable = Jsonifiable>(
  props: DocsQAWithCitationsProps<ChunkMetadata>,
  { render, logger }: AI.ComponentContext
) {
  // Retrieve the most relevant chunks for this question; chunkLimit caps how many.
  const chunks = await props.corpus.search(props.question, { limit: props.chunkLimit });
  // Fall back to the module-level defaults when the caller didn't supply formatters.
  const chunkFormatter: (props: { doc: ScoredChunk<ChunkMetadata> }) => Node = props.chunkFormatter ?? DefaultFormatter;
  const resultFormatter: (result: QAWithCitationsResult) => Node = props.resultFormatter ?? DefaultQAResultFormatter;

  const stringifiedResult = (
    <JsonChatCompletion schema={ResultSchema}>
      <SystemMessage>
        You are a trained question answerer. Answer questions truthfully, using only the document excerpts below. Do not
        use any other knowledge you have about the world. If you don't know how to answer the question, just say "I
        don't know." Here are the relevant document excerpts you have been given:
        {chunks.map((chunk) => chunkFormatter({ doc: chunk }))}
        And here is the question you must answer:
      </SystemMessage>
      <UserMessage>{props.question}</UserMessage>
    </JsonChatCompletion>
  );

  // Each frame is a (possibly incomplete) JSON string of the model's output so far.
  const frames = render(stringifiedResult);
  for await (const frame of frames) {
    try {
      yield resultFormatter(ResultSchema.parse(JSON.parse(frame)));
    } catch (e) {
      // Expected while the JSON is still partial: log at debug and show the raw frame.
      logger.debug(`Failed to parse DocsQAWithCitations frame: ${e}`);
      yield frame;
    }
  }
  // NOTE(review): awaiting the render result after iterating presumably resolves to the
  // final rendered string — confirm against AI.JSX render semantics. Unlike the loop
  // above, a parse failure here throws instead of falling back to the raw frame.
  return resultFormatter(ResultSchema.parse(JSON.parse(await frames)));
}
26 changes: 9 additions & 17 deletions packages/ai-jsx/src/lib/openai.tsx
Expand Up @@ -328,18 +328,12 @@ export async function* OpenAIChatModel(
);
}

if (props.forcedFunction) {
if (
!Object.entries(props.functionDefinitions)
.map(([functionName, _]) => functionName)
.find((f) => f == props.forcedFunction)
) {
throw new AIJSXError(
`The function ${props.forcedFunction} was forced, but no function with that name was defined.`,
ErrorCode.ChatCompletionBadInput,
'user'
);
}
if (props.forcedFunction && !Object.keys(props.functionDefinitions).includes(props.forcedFunction)) {
throw new AIJSXError(
`The function ${props.forcedFunction} was forced, but no function with that name was defined.`,
ErrorCode.ChatCompletionBadInput,
'user'
);
}

const messageElements = await render(props.children, {
Expand Down Expand Up @@ -412,11 +406,9 @@ export async function* OpenAIChatModel(
description: functionDefinition.description,
parameters: getParametersSchema(functionDefinition.parameters),
}));
const openaiFunctionCall: CreateChatCompletionRequestFunctionCall | undefined = !props.forcedFunction
? undefined
: {
name: props.forcedFunction,
};
const openaiFunctionCall: CreateChatCompletionRequestFunctionCall | undefined = props.forcedFunction
? { name: props.forcedFunction }
: undefined;

const openai = getContext(openAiClientContext);
const chatCompletionRequest = {
Expand Down
57 changes: 56 additions & 1 deletion packages/docs/docs/guides/docsqa.md
Expand Up @@ -86,7 +86,7 @@ flowchart LR
que[Query] -->|string| embed2[Embed] -->|vector| vdb2[(Vector DB)] -->|similar chunks| LLM
```

If you use the built-in DocsQA tag from AI.JSX, then you just need to decide how to present the chunk to your LLM:
If you use the built-in [`DocsQA`](../api/modules/batteries_docs.md#docsqa) component from AI.JSX, then you just need to decide how to present the chunk to your LLM:

```typescript
function ShowDoc({ doc }: { doc: Document<MyDocMetadata> }) {
Expand All @@ -109,6 +109,61 @@ function AskAndAnswer({ query }: { query: string }) {
}
```

The `DocsQA` component provides an answer, like:

```tsx
<DocsQA question="What is the atomic number of nitrogen?" corpus={corpus} docComponent={ShowDoc} />
/* Renders:
Nitrogen's atomic number is 7
*/
```

If you want an answer that cites sources, use [`DocsQAWithCitations`](../api/modules/batteries_docs.md#docsqawithcitations):

```tsx
<DocsQAWithCitations question="What is the atomic number of nitrogen?" corpus={corpus} docComponent={ShowDoc} />
/* Renders:
Nitrogen's atomic number is 7
Sources: https://en.wikipedia.org/wiki/Nitrogen
*/
```

If you want to customize how the citations are formatted, pass a `resultFormatter`:

```tsx
function ResultFormatter(result: QAWithCitationsResult) {
return (
<>
{result.answer}
{result.sources?.length && (
<>
Learn more:{'\n'}
{result.sources.map((source) => (
<>
* {source}
{'\n'}
</>
))}
</>
)}
</>
);
}

<DocsQAWithCitations
question="What is the atomic number of nitrogen?"
corpus={corpus}
docComponent={ShowDoc}
// highlight-next-line
resultFormatter={ResultFormatter}
/>;
/* Renders:
Nitrogen's atomic number is 7
Learn more:
* https://en.wikipedia.org/wiki/Nitrogen
*/
```

## Picking a Corpus Implementation

To get you started, AI.JSX includes an in-memory corpus that can be used effectively for demos. When you've outgrown that, you could use a Langchain VectorStore like [Pinecone](https://www.pinecone.io/) or [Chroma](https://www.trychroma.com/). Alternatively, [Fixie](https://www.fixie.ai) provides a fully-managed Corpus solution you could drop in instead.
54 changes: 48 additions & 6 deletions packages/tutorial/src/docsqa.tsx
@@ -1,4 +1,12 @@
import { DocsQA, LocalCorpus, ScoredChunk, makeChunker, staticLoader } from 'ai-jsx/batteries/docs';
import {
DocsQA,
DocsQAWithCitations,
LocalCorpus,
QAWithCitationsResult,
ScoredChunk,
makeChunker,
staticLoader,
} from 'ai-jsx/batteries/docs';
import { showInspector } from 'ai-jsx/core/inspector';
import fetch from 'node-fetch';
import TurndownService from 'turndown';
Expand All @@ -17,22 +25,56 @@ const docs = [
const corpus = new LocalCorpus(staticLoader(docs), makeChunker(600, 100));
await corpus.load();

function GetChunk({ doc }: { doc: ScoredChunk }) {
function OptionalCustomChunkFormatter({ doc }: { doc: ScoredChunk }) {
/**
* This presents document chunks as a simple string with the chunk's contents instead of
* formatting it with metadata like a title.
*
* Note that not including a title makes it difficult to use DocsQAWithCitations since the LLM
* won't know how to refer to this doc.
*/
return doc.chunk.content;
}

function OptionalCustomResultFormatter(result: QAWithCitationsResult) {
  /**
   * Formats the result of a DocsQAWithCitations call, presenting the answer and
   * its sources as desired.
   *
   * NOTE(review): `URL` here presumably refers to a module-level constant holding
   * the Wikipedia article's address — confirm it is not the global URL constructor.
   */
  const sources = result.sources ?? [];
  const linkedSources = sources.map((source: string) =>
    source == 'Wikipedia Article about Hurricane Katrina' ? `<a href="${URL}">${source}</a>` : source
  );

  if (!linkedSources.length) {
    return result.answer;
  }
  return `${result.answer} (from ${linkedSources.join(', ')})`;
}

function App() {
return (
<>
<DocsQA question="What was Hurricane Katrina?" corpus={corpus} chunkLimit={5} chunkFormatter={GetChunk} />
DocsQA without source citations:{'\n'}
<DocsQA
question="Which dates did the Hurricane Katrina occur?"
corpus={corpus}
chunkLimit={5}
chunkFormatter={OptionalCustomChunkFormatter}
/>
{'\n\n'}
<DocsQA question="Which dates did the storm occur?" corpus={corpus} chunkLimit={5} chunkFormatter={GetChunk} />
DocsQA with source citations:{'\n'}
<DocsQAWithCitations question="What was Hurricane Katrina?" corpus={corpus} chunkLimit={5} />
{'\n\n'}
<DocsQA
DocsQA with source citations and custom result formatter:{'\n'}
<DocsQAWithCitations
question="Where were the strongest winds reported?"
corpus={corpus}
chunkLimit={5}
chunkFormatter={GetChunk}
resultFormatter={OptionalCustomResultFormatter}
/>
</>
);
Expand Down

3 comments on commit 80e25c7

@vercel
Copy link

@vercel vercel bot commented on 80e25c7 Jul 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

ai-jsx-docs – ./packages/docs

ai-jsx-docs-fixie-ai.vercel.app
ai-jsx-docs-git-main-fixie-ai.vercel.app
docs.ai-jsx.com
ai-jsx-docs.vercel.app

@vercel
Copy link

@vercel vercel bot commented on 80e25c7 Jul 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

ai-jsx-nextjs-demo – ./packages/nextjs-demo

ai-jsx-nextjs-demo-git-main-fixie-ai.vercel.app
ai-jsx-nextjs-demo-fixie-ai.vercel.app
ai-jsx-nextjs-demo.vercel.app

@vercel
Copy link

@vercel vercel bot commented on 80e25c7 Jul 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Successfully deployed to the following URLs:

ai-jsx-tutorial-nextjs – ./packages/tutorial-nextjs

ai-jsx-tutorial-nextjs.vercel.app
ai-jsx-tutorial-nextjs-git-main-fixie-ai.vercel.app
ai-jsx-tutorial-nextjs-fixie-ai.vercel.app

Please sign in to comment.