diff --git a/examples/gdrive_text_embedding/.env.example b/examples/gdrive_text_embedding/.env.example
new file mode 100644
index 00000000..39a7d032
--- /dev/null
+++ b/examples/gdrive_text_embedding/.env.example
@@ -0,0 +1,8 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
+
+# Google Drive service account credential path
+# GOOGLE_SERVICE_ACCOUNT_CREDENTIAL=/path/to/service_account_credential.json
+
+# Google Drive root folder IDs, comma-separated
+# GOOGLE_DRIVE_ROOT_FOLDER_IDS=id1,id2
\ No newline at end of file
diff --git a/examples/gdrive_text_embedding/.gitignore b/examples/gdrive_text_embedding/.gitignore
new file mode 100644
index 00000000..2eea525d
--- /dev/null
+++ b/examples/gdrive_text_embedding/.gitignore
@@ -0,0 +1 @@
+.env
\ No newline at end of file
diff --git a/examples/gdrive_text_embedding/README.md b/examples/gdrive_text_embedding/README.md
new file mode 100644
index 00000000..ff3c16bb
--- /dev/null
+++ b/examples/gdrive_text_embedding/README.md
@@ -0,0 +1,43 @@
+Simple example for cocoindex: build an embedding index based on Google Drive files.
+
+## Prerequisite
+[Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
+
+## Run
+
+Install dependencies:
+
+```bash
+pip install -e .
+```
+
+Configure: copy `.env.example` to `.env`, then uncomment and set `GOOGLE_SERVICE_ACCOUNT_CREDENTIAL` and `GOOGLE_DRIVE_ROOT_FOLDER_IDS`.
+
+Setup:
+
+```bash
+python main.py cocoindex setup
+```
+
+Update index:
+
+```bash
+python main.py cocoindex update
+```
+
+Run:
+
+```bash
+python main.py
+```
+
+## CocoInsight
+CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3-minute video tutorial about CocoInsight: [Watch on YouTube](https://youtu.be/ZnmyoHslBSc?si=pPLXWALztkA710r9).
+
+Run CocoInsight to understand your RAG data pipeline:
+
+```bash
+python main.py cocoindex server -c https://cocoindex.io
+```
+
+Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
\ No newline at end of file
diff --git a/examples/gdrive_text_embedding/main.py b/examples/gdrive_text_embedding/main.py
new file mode 100644
index 00000000..6350e2a2
--- /dev/null
+++ b/examples/gdrive_text_embedding/main.py
@@ -0,0 +1,91 @@
+from dotenv import load_dotenv
+
+import cocoindex
+import os
+
+def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
+    """
+    Embed the text using a SentenceTransformer model.
+
+    Shared by indexing (applied to every chunk in the flow) and querying (passed
+    as `query_transform_flow` to the query handler), so it is extracted as a function.
+    """
+    return text.transform(
+        cocoindex.functions.SentenceTransformerEmbed(
+            model="sentence-transformers/all-MiniLM-L6-v2"))
+
+@cocoindex.flow_def(name="GoogleDriveTextEmbedding")
+def gdrive_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
+    """
+    Define an example flow that embeds Google Drive documents into a vector index.
+
+    Configuration is read from the environment:
+    - GOOGLE_SERVICE_ACCOUNT_CREDENTIAL: service account credential path.
+    - GOOGLE_DRIVE_ROOT_FOLDER_IDS: comma-separated root folder IDs.
+    A KeyError is raised if either variable is not set.
+    """
+    credential_path = os.environ["GOOGLE_SERVICE_ACCOUNT_CREDENTIAL"]
+    # Tolerate spaces around the commas and empty entries (e.g. "id1, id2,").
+    root_folder_ids = [
+        folder_id.strip()
+        for folder_id in os.environ["GOOGLE_DRIVE_ROOT_FOLDER_IDS"].split(",")
+        if folder_id.strip()]
+
+    data_scope["documents"] = flow_builder.add_source(
+        cocoindex.sources.GoogleDrive(
+            service_account_credential_path=credential_path,
+            root_folder_ids=root_folder_ids))
+
+    doc_embeddings = data_scope.add_collector()
+
+    with data_scope["documents"].row() as doc:
+        doc["chunks"] = doc["content"].transform(
+            cocoindex.functions.SplitRecursively(),
+            language="markdown", chunk_size=2000, chunk_overlap=500)
+
+        with doc["chunks"].row() as chunk:
+            chunk["embedding"] = text_to_embedding(chunk["text"])
+            doc_embeddings.collect(filename=doc["filename"], location=chunk["location"],
+                                   text=chunk["text"], embedding=chunk["embedding"])
+
+    doc_embeddings.export(
+        "doc_embeddings",
+        cocoindex.storages.Postgres(),
+        primary_key_fields=["filename", "location"],
+        vector_index=[("embedding", cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])
+
+# Query handler over the exported "doc_embeddings" target; it reuses
+# text_to_embedding so queries are embedded the same way as the chunks.
+query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
+    name="SemanticsSearch",
+    flow=gdrive_text_embedding_flow,
+    target_name="doc_embeddings",
+    query_transform_flow=text_to_embedding,
+    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
+
+@cocoindex.main_fn()
+def _run():
+    """
+    Run queries in a loop to demonstrate the query capabilities.
+
+    Stops on an empty query, Ctrl-C, or end of input.
+    """
+    while True:
+        try:
+            query = input("Enter search query (or Enter to quit): ")
+            if query == '':
+                break
+            results, _ = query_handler.search(query, 10)
+            print("\nSearch results:")
+            for result in results:
+                print(f"[{result.score:.3f}] {result.data['filename']}")
+                print(f" {result.data['text']}")
+                print("---")
+            print()
+        except (KeyboardInterrupt, EOFError):
+            # EOFError means stdin was closed (Ctrl-D or exhausted piped input).
+            break
+
+if __name__ == "__main__":
+    load_dotenv(override=True)
+    _run()
diff --git a/examples/gdrive_text_embedding/pyproject.toml b/examples/gdrive_text_embedding/pyproject.toml
new file mode 100644
index 00000000..638b79e7
--- /dev/null
+++ b/examples/gdrive_text_embedding/pyproject.toml
@@ -0,0 +1,6 @@
+[project]
+name = "gdrive-text-embedding"
+version = "0.1.0"
+description = "Simple example for cocoindex: build embedding index based on Google Drive files."
+requires-python = ">=3.11"
+dependencies = ["cocoindex>=0.1.12", "python-dotenv>=1.0.1"]