From d00c8e22c334f9595c812583c94e47f3139c44b7 Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Thu, 2 May 2024 14:04:51 -0700 Subject: [PATCH 1/4] Firestore vector store docs --- docs/_guides.yaml | 2 - docs/index.md | 1 + docs/plugins/firebase.md | 46 ++++++++++++++- docs/rag.md | 2 +- docs/templates/firestore-vector.md | 95 ------------------------------ 5 files changed, 47 insertions(+), 99 deletions(-) delete mode 100644 docs/templates/firestore-vector.md diff --git a/docs/_guides.yaml b/docs/_guides.yaml index fdc7db3077..d523a6c04f 100644 --- a/docs/_guides.yaml +++ b/docs/_guides.yaml @@ -65,8 +65,6 @@ toc: path: /docs/genkit/plugins/pinecone - title: pgvector (code template) path: /docs/genkit/templates/pgvector - - title: Firestore vector store (code template) - path: /docs/genkit/templates/firestore-vector - title: Firebase path: /docs/genkit/plugins/firebase - title: Google Cloud diff --git a/docs/index.md b/docs/index.md index 10276cda4a..c4382d9246 100644 --- a/docs/index.md +++ b/docs/index.md @@ -370,6 +370,7 @@ maintained by the Genkit team: firebase Cloud deployment: Cloud Functions, Firebase Authentication, App Check
+ Vector database: Cloud Firestore vector store
diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index a1772a4c3e..f80b22ee13 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -2,6 +2,7 @@ The Firebase plugin provides several integrations with Firebase services: +- Indexers and retrievers using Cloud Firestore vector store - Trace storage using Cloud Firestore - Flow deployment using Cloud Functions - Authorization policies for Firebase Authentication users @@ -58,7 +59,50 @@ Application Default Credentials. To specify your credentials: This plugin provides several integrations with Firebase services, which you can use together or individually. -### Cloud Firestore +### Cloud Firestore vector store + +You can use Cloud Firestore as a vector store for RAG indexing and retrieval. + +The `firebase` plugin provides a convenience function for defining Firestore +retrievers, `defineFirestoreRetriever()`: + +```js +import { defineFirestoreRetriever } from '@genkit-ai/firebase'; +import { initializeApp } from 'firebase-admin/app'; +import { getFirestore } from 'firebase-admin/firestore'; + +const app = initializeApp(); +const firestore = getFirestore(app); + +const yourRetrieverRef = defineFirestoreRetriever({ + name: 'yourRetriever', + firestore, + collection: 'yourCollection', + contentField: 'yourDataChunks', + vectorField: 'embedding', + embedder: textEmbeddingGecko, +}); +``` + +To use it, pass it to the `retrieve()` function: + +```js +const docs = await retrieve({ + retriever: yourRetrieverRef, + query: "look for something", + config: {limit: 5}, +}); +``` + +For indexing, use an embedding generator along with the Admin SDK: + +```js +``` + +See the [Retrieval-augmented generation](../rag.md) page for a general +discussion on indexers and retrievers. 
+ +### Cloud Firestore trace storage You can use Cloud Firestore to store traces: diff --git a/docs/rag.md b/docs/rag.md index 97790a2389..b7425f602d 100644 --- a/docs/rag.md +++ b/docs/rag.md @@ -249,6 +249,7 @@ export const ragFlow = defineFlow( Genkit provides indexer and retriever support through its plugin system. The following plugins are officially supported: +- [Cloud Firestore vector store](plugins/firebase.md) - [Chroma DB](plugins/chroma.md) vector database - [Pinecone](plugins/pinecone.md) cloud vector database @@ -257,7 +258,6 @@ code templates, which you can customize for your database configuration and schema: - PostgreSQL with [`pgvector`](templates/pgvector.md) -- [Firestore vector store](templates/firestore-vector.md) Embedding model support is provided through the following plugins: diff --git a/docs/templates/firestore-vector.md b/docs/templates/firestore-vector.md deleted file mode 100644 index bf2166c81a..0000000000 --- a/docs/templates/firestore-vector.md +++ /dev/null @@ -1,95 +0,0 @@ -# Firestore vector store template - -You can use Firestore vector store in Firebase Genkit to power your RAG flows by -storing and retrieving embedding vectors. - -Here is a sample template which retrieves documents from Firestore. - -Use the following example as a starting point and modify it to work with your database layout. -This sample assumes that you already have a Firestore collection called `vectors` in which each document -has an `embedding` field that stores the embedding vector. - -Important: Vector support is available only in `@google-cloud/firestore` versions starting from `7.6.0`. You must update your dependecies to match this version. - -Firestore depends on indices to provide fast and efficient querying on collections. This sample requires the `embedding` field to be indexed to work. To do so, invoke the -flow and Firestore will throw an error with a command to create an index. Execute that command -and your index should be ready to use. 
- -```js -import { embed } from '@genkit-ai/ai/embedder'; -import { Document, defineRetriever } from '@genkit-ai/ai/retriever'; -import { textEmbeddingGecko } from '@genkit-ai/vertexai'; -import { - FieldValue, - VectorQuery, - VectorQuerySnapshot, -} from '@google-cloud/firestore'; -import { Firestore } from 'firebase-admin/firestore'; -import * as z from 'zod'; -import { augmentedPrompt } from './prompt'; - -const QueryOptions = z.object({ - k: z.number().optional(), -}); - -const firestoreArtifactsRetriever = defineRetriever( - { - name: 'firestore/artifacts', - configSchema: QueryOptions, - }, - async (input, options) => { - const embedding = await embed({ - embedder: textEmbeddingGecko, - content: input, - }); - - const db = new Firestore(); - const coll = db.collection('vectors' /* your collection name */); - - const vectorQuery: VectorQuery = coll.findNearest( - 'embedding' /* the name of the field that contains the vector */, - FieldValue.vector(embedding), - { - limit: options.k ?? 3, - distanceMeasure: 'COSINE', - } - ); - - const vectorQuerySnapshot: VectorQuerySnapshot = await vectorQuery.get(); - return { - documents: vectorQuerySnapshot.docs.map((doc) => - // doc.data() represents the Firestore document. You may process it as needed to generate - // a Genkit document object, depending on your storage format. - Document.fromText(doc.data().content.text) - ), - }; - } -); -``` - -And here's how to use the retriever in a flow: - -```js -// Simple flow to use the firestoreArtifactsRetriever -export const askQuestionsOnNewsArticles = defineFlow( - { - name: 'askQuestionsOnNewsArticles', - inputSchema: z.string(), - outputSchema: z.string(), - }, - async (inputQuestion) => { - const docs = await retrieve({ - retriever: firestoreArtifactsRetriever, - query: inputQuestion, - options: { - k: 5, - }, - }); - console.log(docs); - - // Continue with using retrieved docs - // in RAG prompts. - //... 
- } -); -``` From 484de44e592c509abfb27469769df161852dc32a Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Fri, 3 May 2024 09:30:18 -0700 Subject: [PATCH 2/4] indexing --- docs/plugins/firebase.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index f80b22ee13..ab8bdbc069 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -81,6 +81,7 @@ const yourRetrieverRef = defineFirestoreRetriever({ contentField: 'yourDataChunks', vectorField: 'embedding', embedder: textEmbeddingGecko, + distanceMeasure: 'COSINE', // 'EUCLIDEAN', 'DOT_PRODUCT', or 'COSINE' (default) }); ``` @@ -97,8 +98,39 @@ const docs = await retrieve({ For indexing, use an embedding generator along with the Admin SDK: ```js +import { initializeApp } from "firebase-admin"; +import { getFirestore } from "firebase-admin/firestore"; +import { textEmbeddingGecko } from '@genkit-ai/vertexai'; +import { embed } from '@genkit-ai/ai/embedder'; + +const app = initializeApp(); +const firestore = getFirestore(app); + +const indexConfig = { + collection: "yourCollection", + contentField: "yourDataChunks", + vectorField: "embedding", + embedder: textEmbeddingGecko, +} + +async function indexToFirestore(content) { + const embedding = await embed({ + embedder: indexConfig.embedder, + content, + }); + await firestore.collection(indexConfig.collection).add({ + [indexConfig.vectorField]: embedding, + [indexConfig.contentField]: content, + }); +} ``` +Firestore depends on indices to provide fast and efficient querying on +collections. The prior example requires the `embedding` field to be indexed to +work. To do so, invoke the function and Firestore will throw an error with a +command to create an index. Execute that command and your index should be ready +to use. + See the [Retrieval-augmented generation](../rag.md) page for a general discussion on indexers and retrievers. 
From 6fe566bc040ef73d110b63918768a493fbbebe9a Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Fri, 3 May 2024 09:33:08 -0700 Subject: [PATCH 3/4] prettier --- docs/plugins/firebase.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index ab8bdbc069..87761e6667 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -90,16 +90,16 @@ To use it, pass it to the `retrieve()` function: ```js const docs = await retrieve({ retriever: yourRetrieverRef, - query: "look for something", - config: {limit: 5}, + query: 'look for something', + config: { limit: 5 }, }); ``` For indexing, use an embedding generator along with the Admin SDK: ```js -import { initializeApp } from "firebase-admin"; -import { getFirestore } from "firebase-admin/firestore"; +import { initializeApp } from 'firebase-admin'; +import { getFirestore } from 'firebase-admin/firestore'; import { textEmbeddingGecko } from '@genkit-ai/vertexai'; import { embed } from '@genkit-ai/ai/embedder'; @@ -107,11 +107,11 @@ const app = initializeApp(); const firestore = getFirestore(app); const indexConfig = { - collection: "yourCollection", - contentField: "yourDataChunks", - vectorField: "embedding", + collection: 'yourCollection', + contentField: 'yourDataChunks', + vectorField: 'embedding', embedder: textEmbeddingGecko, -} +}; async function indexToFirestore(content) { const embedding = await embed({ From 5664be656684c68c9344d97a89d92f647a899447 Mon Sep 17 00:00:00 2001 From: Kevin Cheung Date: Fri, 3 May 2024 10:51:50 -0700 Subject: [PATCH 4/4] feedback --- docs/plugins/firebase.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/plugins/firebase.md b/docs/plugins/firebase.md index 87761e6667..f5901a1b33 100644 --- a/docs/plugins/firebase.md +++ b/docs/plugins/firebase.md @@ -99,7 +99,7 @@ For indexing, use an embedding generator along with the Admin SDK: ```js import { initializeApp } 
from 'firebase-admin'; -import { getFirestore } from 'firebase-admin/firestore'; +import { getFirestore, FieldValue } from 'firebase-admin/firestore'; import { textEmbeddingGecko } from '@genkit-ai/vertexai'; import { embed } from '@genkit-ai/ai/embedder'; @@ -119,13 +119,13 @@ async function indexToFirestore(content) { content, }); await firestore.collection(indexConfig.collection).add({ - [indexConfig.vectorField]: embedding, + [indexConfig.vectorField]: FieldValue.vector(embedding), [indexConfig.contentField]: content, }); } ``` -Firestore depends on indices to provide fast and efficient querying on +Firestore depends on indexes to provide fast and efficient querying on collections. The prior example requires the `embedding` field to be indexed to work. To do so, invoke the function and Firestore will throw an error with a command to create an index. Execute that command and your index should be ready