From 33d4ff55ccba095c6b0896fb79406d6f4246fc7d Mon Sep 17 00:00:00 2001 From: lemorage Date: Wed, 21 May 2025 15:37:58 +0200 Subject: [PATCH] Update jupyter notebook code in text embedding example --- examples/text_embedding/Text_Embedding.ipynb | 339 ++++++++++--------- 1 file changed, 178 insertions(+), 161 deletions(-) diff --git a/examples/text_embedding/Text_Embedding.ipynb b/examples/text_embedding/Text_Embedding.ipynb index ff6c77e4..afd01556 100644 --- a/examples/text_embedding/Text_Embedding.ipynb +++ b/examples/text_embedding/Text_Embedding.ipynb @@ -1,111 +1,103 @@ { - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, "cells": [ { "cell_type": "markdown", + "metadata": { + "id": "Up70lME5E0Tc" + }, "source": [ "# ![icon.svg](https://cocoindex.io/icon.svg) Welcome to [Cocoindex](https://cocoindex.io/)\n", "\n" - ], - "metadata": { - "id": "Up70lME5E0Tc" - } + ] }, { "cell_type": "markdown", + "metadata": { + "id": "bJ3LGSyF9D1M" + }, "source": [ "\n", "# ![icon.svg](https://cocoindex.io/icon.svg) This example will show you how you can get started with Cocoindex by building embedding for RAG" - ], - "metadata": { - "id": "bJ3LGSyF9D1M" - } + ] }, { "cell_type": "markdown", - "source": [ - "# Install Cocoindex and other required packages using pip" - ], "metadata": { "id": "ymNZ0fk09noG" - } + }, + "source": [ + "# Install Cocoindex and other required packages using pip" + ] }, { "cell_type": "markdown", - "source": [], "metadata": { "id": "s4MT3saT9COe" - } + }, + "source": [] }, { "cell_type": "code", - "source": [ - "%pip install cocoindex python-dotenv" - ], + "execution_count": null, "metadata": { "collapsed": true, "id": "rQcJanCi-W3I" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "%pip install cocoindex python-dotenv psycopg[binary,pool]" + ] }, { "cell_type": "markdown", - "source": [ - "# Grab some markdown files for demo" - ], "metadata": { "id": "Xh2sMemiA7_N" - } + }, + "source": [ + "# Grab some markdown files for demo" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "0Gi-MHrNA8sQ" + }, + "outputs": [], "source": [ "!mkdir -p markdown_files && \\\n", "wget -P markdown_files https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/examples/text_embedding/markdown_files/1706.03762v7.md && \\\n", "wget -P markdown_files https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/examples/text_embedding/markdown_files/1810.04805v2.md && \\\n", "wget -P markdown_files https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/examples/text_embedding/markdown_files/rfc8259.md\n" - ], - "metadata": { - "collapsed": true, - "id": "0Gi-MHrNA8sQ" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "code", - "source": [], + "execution_count": null, "metadata": { "id": "hPctYqRAzgEq" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [] }, { "cell_type": "markdown", - "source": [ - "# Create a Postgres Server" - ], "metadata": { "id": "ZEetEtmPAuZ-" - } + }, + "source": [ + "# Create a Postgres Server" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "id": "lkATpKLUAuuY" + }, + "outputs": [], "source": [ "# Update package lists\n", "!sudo apt-get update\n", @@ -129,72 +121,74 @@ "# Enable the pgvector extension\n", "!sudo -u postgres psql -d cocoindex -c \"CREATE EXTENSION IF NOT EXISTS vector;\"\n", "\n" - ], - "metadata": { - "id": "lkATpKLUAuuY", - "collapsed": true - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "# Update .env with POSTGRES URL" - ], "metadata": { "id": "utZpExYkAzi6" - } + }, + "source": [ + "# Update .env with POSTGRES URL" + ] }, { "cell_type": "code", - "source": [ - "%%writefile .env\n", - "COCOINDEX_DATABASE_URL=\"postgresql://cocoindex:cocoindex@localhost:5432/cocoindex\"" - ], + "execution_count": null, "metadata": { "id": "X3P8pEUOA5D2" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "%%writefile .env\n", + "COCOINDEX_DATABASE_URL=\"postgresql://cocoindex:cocoindex@localhost:5432/cocoindex\"" + ] }, { "cell_type": "markdown", - "source": [ - "# Create a new file and import modules" - ], "metadata": { "id": "9zN612eW_1nX" - } + }, + "source": [ + "# Create a new file and import modules" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7HUYtsoN-10D" + }, + "outputs": [], "source": [ "%%writefile main.py\n", "from dotenv import load_dotenv\n", + "import os\n", + "from psycopg_pool import ConnectionPool\n", "import cocoindex\n" - ], - "metadata": { - "id": "7HUYtsoN-10D" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "# Define your embedding function" - ], "metadata": { "id": "2DOY5Q27ADS2" - } + }, + "source": [ + "# Define your embedding function" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L_puYY6FABbr" + }, + "outputs": [], "source": [ "%%writefile -a main.py\n", "\n", - "def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:\n", + "@cocoindex.transform_flow()\n", + "def text_to_embedding(text: cocoindex.DataSlice[str]) -> cocoindex.DataSlice[list[float]]:\n", " \"\"\"\n", " Embed the text using a SentenceTransformer model.\n", " This is shared logic between indexing and querying.\n", @@ -202,24 +196,24 @@ " return text.transform(\n", " cocoindex.functions.SentenceTransformerEmbed(\n", " model=\"sentence-transformers/all-MiniLM-L6-v2\"))\n" - ], - "metadata": { - "id": "L_puYY6FABbr" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", - "source": [ - "# Define your flow" - ], "metadata": { "id": "H6j2aiRaAEKz" - } + }, + "source": [ + "# Define your flow" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oatJUXjAAEhE" + }, + "outputs": [], "source": [ "%%writefile -a main.py\n", "\n", @@ -251,68 +245,81 @@ " cocoindex.VectorIndexDef(\n", " field_name=\"embedding\",\n", " metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)])\n" - ], - "metadata": { - "id": "oatJUXjAAEhE" - }, - "execution_count": null, - "outputs": [] + ] }, { "cell_type": "markdown", + "metadata": { + "id": "KLb41N5UAFJx" + }, "source": [ "\n", - "# Define query handler\n", + "# Provide query logic\n", "\n" - ], - "metadata": { - "id": "KLb41N5UAFJx" - } + ] }, { "cell_type": "code", - "source": [ - "%%writefile -a main.py\n", - "\n", - "query_handler = cocoindex.query.SimpleSemanticsQueryHandler(\n", - " name=\"SemanticsSearch\",\n", - " flow=text_embedding_flow,\n", - " target_name=\"doc_embeddings\",\n", - " query_transform_flow=text_to_embedding,\n", - " default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)\n" - ], + "execution_count": null, "metadata": { "id": "tRdfIP6OAFe1" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "%%writefile -a main.py\n", + "\n", + "def search(pool: ConnectionPool, query: str, top_k: int = 5):\n", + " # Get the table name, for the export target in the text_embedding_flow above.\n", + " table_name = cocoindex.utils.get_target_storage_default_name(text_embedding_flow, \"doc_embeddings\")\n", + " # Evaluate the transform flow defined above with the input query, to get the embedding.\n", + " query_vector = text_to_embedding.eval(query)\n", + " # Run the query and get the results.\n", + " with pool.connection() as conn:\n", + " with conn.cursor() as cur:\n", + " cur.execute(f\"\"\"\n", + " SELECT filename, text, embedding <=> %s::vector AS distance\n", + " FROM {table_name} ORDER BY distance LIMIT %s\n", + " \"\"\", (query_vector, top_k))\n", + " return [\n", + " {\"filename\": row[0], \"text\": row[1], \"score\": 1.0 - row[2]}\n", + " for row in cur.fetchall()\n", + " ]\n" + ] }, { "cell_type": "markdown", - "source": [ - "#Define search function and main" - ], "metadata": { "id": "IUBdoOmOAgwc" - } + }, + "source": [ + "# Define search function and main" + ] }, { "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W78hBbDiAhFh" + }, + "outputs": [], "source": [ "%%writefile -a main.py\n", "\n", - "@cocoindex.main_fn()\n", - "def _run():\n", + "def _main():\n", + " # Initialize the database connection pool.\n", + " pool = ConnectionPool(os.getenv(\"COCOINDEX_DATABASE_URL\"))\n", + " # Run queries in a loop to demonstrate the query capabilities.\n", " while True:\n", " try:\n", " query = input(\"Enter search query (or Enter to quit): \")\n", " if query == '':\n", " break\n", - " results, _ = query_handler.search(query, 10)\n", + " # Run the query function with the database connection pool and the query.\n", + " results = search(pool, query)\n", " print(\"\\nSearch results:\")\n", " for result in results:\n", - " print(f\"[{result.score:.3f}] {result.data['filename']}\")\n", - " print(f\" {result.data['text']}\")\n", + " print(f\"[{result['score']:.3f}] {result['filename']}\")\n", + " print(f\" {result['text']}\")\n", " print(\"---\")\n", " print()\n", " except KeyboardInterrupt:\n", @@ -320,73 +327,83 @@ "\n", "if __name__ == \"__main__\":\n", " load_dotenv(override=True)\n", - " _run()\n" - ], - "metadata": { - "id": "W78hBbDiAhFh" - }, - "execution_count": null, - "outputs": [] + " cocoindex.init()\n", + " _main()\n" + ] }, { "cell_type": "markdown", - "source": [ - "# Setup" - ], "metadata": { "id": "I2oI_pjxCkRa" - } + }, + "source": [ + "# Setup" + ] }, { "cell_type": "code", - "source": [ - "!yes yes | cocoindex setup main.py" - ], + "execution_count": null, "metadata": { "id": "oBStjaI0Cli_" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "!yes yes | cocoindex setup main.py" + ] }, { "cell_type": "markdown", - "source": [ - "# Update" - ], "metadata": { "id": "aPBDVrG_CmwH" - } + }, + "source": [ + "# Update" + ] }, { "cell_type": "code", - "source": [ - "!cocoindex update main.py" - ], + "execution_count": null, "metadata": { "id": "M9g6xIZHCn5T" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "!cocoindex update main.py" + ] }, { "cell_type": "markdown", - "source": [ - "# Run query" - ], "metadata": { "id": "nIM78MBRCppz" - } + }, + "source": [ + "# Run query" + ] }, { "cell_type": "code", - "source": [ - "!python main.py" - ], + "execution_count": null, "metadata": { "id": "6E-HR_KSCqzP" }, - "execution_count": null, - "outputs": [] + "outputs": [], + "source": [ + "!python main.py" + ] } - ] -} \ No newline at end of file + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}