From 680b27c3aa03329cea545a92d8c87722bf9aed88 Mon Sep 17 00:00:00 2001 From: Quynh Nguyen Date: Sun, 28 Sep 2025 19:30:52 -0500 Subject: [PATCH 1/5] Add new python notebook --- .../multilingual_embedding.ipynb | 225 ++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb new file mode 100644 index 00000000..57bc36aa --- /dev/null +++ b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb @@ -0,0 +1,225 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Data successfully downloaded and saved to multilingual_coco_sample.json\n" + ] + } + ], + "source": [ + "import requests\n", + "import json\n", + "import os\n", + "\n", + "### Download multilingual coco dataset\n", + "### Here we are retrieving first 100 rows for this example\n", + "### Alternatively, you can use dataset library from Hugging Face\n", + "url = \"https://datasets-server.huggingface.co/rows?dataset=romrawinjp%2Fmultilingual-coco&config=default&split=restval&offset=0&length=100\"\n", + "# Make the GET request\n", + "response = requests.get(url)\n", + "\n", + "# Check if the request was successful\n", + "if response.status_code == 200:\n", + " # Parse the JSON response\n", + " data = response.json()\n", + "\n", + " # Define the output file path\n", + " output_file = \"multilingual_coco_sample.json\"\n", + "\n", + " # Save the JSON data to a file\n", + " with open(output_file, \"w\", encoding=\"utf-8\") as f:\n", + " json.dump(data, f, indent=4, ensure_ascii=False)\n", + "\n", + " print(f\"Data successfully downloaded and saved to {output_file}\")\n", + "else:\n", + " print(f\"Failed to download data: 
{response.status_code}\")\n", + " print(response.text)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from getpass import getpass\n", + "\n", + "# Get credentials securely for localhost Elasticsearch\n", + "print(\"Enter your Elasticsearch credentials:\")\n", + "cloud_id = input(\"Enter your cloud_id: \")\n", + "api_key = getpass(\"Enter your api_key: \")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully connected to Elasticsearch\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/elasticsearch/_sync/client/__init__.py:311: SecurityWarning: Connecting to 'https://localhost:9200' using TLS with verify_certs=False is insecure\n", + " _transport = transport_class(\n", + "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "from elasticsearch import Elasticsearch\n", + "try:\n", + " es = Elasticsearch(\n", + " hosts=[{\"host\": \"localhost\", \"port\": 9200, \"scheme\": \"https\"}],\n", + " basic_auth=(\"elastic\", \"qaf_admin\"),\n", + " verify_certs=False, # Set to True if you have valid SSL certificates\n", + " # Alternatively, you can use Elastic cloud_id and api_key\n", + " #api_key=getpass(\"API Key: \")\n", + " #cloud_id=getpass(\"Cloud ID: \"),\n", + " )\n", + "\n", + " # Test the connection\n", + " if not es.ping():\n", + " raise Exception(\"Failed to connect to Elasticsearch\")\n", + "\n", + " print(\"Successfully connected to Elasticsearch\")\n", + "\n", + "except Exception as e:\n", + " print(f\"Error connecting to Elasticsearch: {e}\")\n", + " print(\"Please check your credentials\")\n", + " raise\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Successfully bulk indexed 4840 documents\n", + "Indexing complete!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", + " warnings.warn(\n", + "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# Define the index mapping\n", + "index_name = \"coco\"\n", + "mapping = {\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"language\": {\"type\": \"keyword\"},\n", + " \"description\": {\"type\": \"text\"},\n", + " \"en\": {\"type\": \"text\"},\n", + " \"image_url\": {\"type\": \"keyword\"},\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Create the index if it doesn't exist\n", + "if not es.indices.exists(index=index_name):\n", + " es.indices.create(index=index_name, body=mapping)\n", + "\n", + "# Load the JSON data\n", + "with open('./multilingual_coco_sample.json', 'r') as f:\n", + " data = json.load(f)\n", + "\n", + "rows = data[\"rows\"]\n", + "# List of languages to process\n", + "languages = [\"en\", \"es\", \"de\", \"it\", \"vi\", \"th\"]\n", + "\n", + "bulk_data = []\n", + "for obj in rows:\n", + " row = obj[\"row\"]\n", + " image_url = row.get(\"image\")\n", + " image_url = image_url[\"src\"]\n", + "\n", + " # Process each language\n", + " for lang in languages:\n", + " # Skip if language not present in this row\n", + " if lang not in row:\n", + " continue\n", + "\n", + " # Get all descriptions for this language\n", + " descriptions = row[lang]\n", + " first_eng_caption = row[\"en\"][0]\n", + "\n", + " # Prepare bulk indexing data\n", + " for description in descriptions:\n", + " if description == \"\":\n", + " continue\n", + " # Add index operation\n", + " bulk_data.append(\n", + " {\"index\": {\"_index\": index_name}}\n", + " )\n", + " # Add document\n", + " bulk_data.append({\n", + " \"language\": lang,\n", + " \"description\": description,\n", + " \"en\": first_eng_caption,\n", + " \"image_url\": image_url,\n", + " })\n", + "\n", + "# Perform bulk indexing\n", + "if bulk_data:\n", + " try:\n", + " response = es.bulk(operations=bulk_data)\n", + " if response[\"errors\"]:\n", + " print(\"Some documents failed to 
index\")\n", + " else:\n", + " print(f\"Successfully bulk indexed {len(bulk_data)} documents\")\n", + " except Exception as e:\n", + " print(f\"Error during bulk indexing: {str(e)}\")\n", + "\n", + "print(\"Indexing complete!\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 3702553fcb2e91d7abd9e2cd9e6a7bf2c72f9235 Mon Sep 17 00:00:00 2001 From: Carly Richmond Date: Wed, 8 Oct 2025 13:12:31 +0200 Subject: [PATCH 2/5] Adding title to make the build pass --- .../multilingual_embedding.ipynb | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb index 57bc36aa..d8793bf5 100644 --- a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb +++ b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb @@ -1,5 +1,14 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lost In Translation? Multilingual Embedding Models Are All You Need*\n", + "\n", + "This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both single and multilingual datasets. It accompanies the piece *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs)." 
+ ] + }, { "cell_type": "code", "execution_count": null, @@ -40,7 +49,7 @@ " print(f\"Data successfully downloaded and saved to {output_file}\")\n", "else:\n", " print(f\"Failed to download data: {response.status_code}\")\n", - " print(response.text)\n" + " print(response.text)" ] }, { @@ -54,7 +63,7 @@ "# Get credentials securely for localhost Elasticsearch\n", "print(\"Enter your Elasticsearch credentials:\")\n", "cloud_id = input(\"Enter your cloud_id: \")\n", - "api_key = getpass(\"Enter your api_key: \")\n" + "api_key = getpass(\"Enter your api_key: \")" ] }, { @@ -82,14 +91,15 @@ ], "source": [ "from elasticsearch import Elasticsearch\n", + "\n", "try:\n", " es = Elasticsearch(\n", " hosts=[{\"host\": \"localhost\", \"port\": 9200, \"scheme\": \"https\"}],\n", " basic_auth=(\"elastic\", \"qaf_admin\"),\n", " verify_certs=False, # Set to True if you have valid SSL certificates\n", " # Alternatively, you can use Elastic cloud_id and api_key\n", - " #api_key=getpass(\"API Key: \")\n", - " #cloud_id=getpass(\"Cloud ID: \"),\n", + " # api_key=getpass(\"API Key: \")\n", + " # cloud_id=getpass(\"Cloud ID: \"),\n", " )\n", "\n", " # Test the connection\n", @@ -101,7 +111,7 @@ "except Exception as e:\n", " print(f\"Error connecting to Elasticsearch: {e}\")\n", " print(\"Please check your credentials\")\n", - " raise\n" + " raise" ] }, { @@ -147,7 +157,7 @@ " es.indices.create(index=index_name, body=mapping)\n", "\n", "# Load the JSON data\n", - "with open('./multilingual_coco_sample.json', 'r') as f:\n", + "with open(\"./multilingual_coco_sample.json\", \"r\") as f:\n", " data = json.load(f)\n", "\n", "rows = data[\"rows\"]\n", @@ -175,16 +185,16 @@ " if description == \"\":\n", " continue\n", " # Add index operation\n", + " bulk_data.append({\"index\": {\"_index\": index_name}})\n", + " # Add document\n", " bulk_data.append(\n", - " {\"index\": {\"_index\": index_name}}\n", + " {\n", + " \"language\": lang,\n", + " \"description\": description,\n", + " \"en\": 
first_eng_caption,\n", + " \"image_url\": image_url,\n", + " }\n", " )\n", - " # Add document\n", - " bulk_data.append({\n", - " \"language\": lang,\n", - " \"description\": description,\n", - " \"en\": first_eng_caption,\n", - " \"image_url\": image_url,\n", - " })\n", "\n", "# Perform bulk indexing\n", "if bulk_data:\n", From fd84a1ac9b10199cc9e7d7ca4c236c66e7a7402c Mon Sep 17 00:00:00 2001 From: Carly Richmond Date: Wed, 8 Oct 2025 13:18:03 +0200 Subject: [PATCH 3/5] Changing to use endpoint instead of cloud id, and removing unused os reference --- .../multilingual_embedding.ipynb | 51 +++++++------------ 1 file changed, 17 insertions(+), 34 deletions(-) diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb index d8793bf5..80d995f7 100644 --- a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb +++ b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb @@ -25,7 +25,6 @@ "source": [ "import requests\n", "import json\n", - "import os\n", "\n", "### Download multilingual coco dataset\n", "### Here we are retrieving first 100 rows for this example\n", @@ -54,21 +53,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Enter your Elasticsearch credentials:\n" + ] + } + ], "source": [ "from getpass import getpass\n", "\n", "# Get credentials securely for localhost Elasticsearch\n", "print(\"Enter your Elasticsearch credentials:\")\n", - "cloud_id = input(\"Enter your cloud_id: \")\n", - "api_key = getpass(\"Enter your api_key: \")" + "elastic_endpoint = input(\"Enter your Elastic endpoint: \")\n", + "api_key = getpass(\"Enter your API key: \")" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -77,16 +84,6 
@@ "text": [ "Successfully connected to Elasticsearch\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/elasticsearch/_sync/client/__init__.py:311: SecurityWarning: Connecting to 'https://localhost:9200' using TLS with verify_certs=False is insecure\n", - " _transport = transport_class(\n", - "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", - " warnings.warn(\n" - ] } ], "source": [ @@ -94,12 +91,8 @@ "\n", "try:\n", " es = Elasticsearch(\n", - " hosts=[{\"host\": \"localhost\", \"port\": 9200, \"scheme\": \"https\"}],\n", - " basic_auth=(\"elastic\", \"qaf_admin\"),\n", - " verify_certs=False, # Set to True if you have valid SSL certificates\n", - " # Alternatively, you can use Elastic cloud_id and api_key\n", - " # api_key=getpass(\"API Key: \")\n", - " # cloud_id=getpass(\"Cloud ID: \"),\n", + " hosts=[elastic_endpoint],\n", + " api_key=api_key\n", " )\n", "\n", " # Test the connection\n", @@ -116,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -126,16 +119,6 @@ "Successfully bulk indexed 4840 documents\n", "Indexing complete!\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. 
See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", - " warnings.warn(\n", - "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/urllib3/connectionpool.py:1099: InsecureRequestWarning: Unverified HTTPS request is being made to host 'localhost'. Adding certificate verification is strongly advised. See: https://urllib3.readthedocs.io/en/latest/advanced-usage.html#tls-warnings\n", - " warnings.warn(\n" - ] } ], "source": [ @@ -227,7 +210,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.6" + "version": "3.12.10" } }, "nbformat": 4, From 3b8e5d21ba71c8bf612cecd178baa85edfdf32e8 Mon Sep 17 00:00:00 2001 From: Carly Richmond Date: Wed, 8 Oct 2025 13:21:56 +0200 Subject: [PATCH 4/5] Changing code formatting to fix issue --- .../multilingual-embedding/multilingual_embedding.ipynb | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb index 80d995f7..308fabb5 100644 --- a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb +++ b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb @@ -11,7 +11,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -75,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -90,10 +90,7 @@ "from elasticsearch import Elasticsearch\n", "\n", "try:\n", - " es = Elasticsearch(\n", - " hosts=[elastic_endpoint],\n", - " api_key=api_key\n", - " )\n", + " es = Elasticsearch(hosts=[elastic_endpoint], api_key=api_key)\n", "\n", " # Test the connection\n", " if not es.ping():\n", From 4ef9d83c3964920a484f515fce79a4203a664be5 Mon Sep 17 00:00:00 2001 From: Quynh Nguyen Date: Wed, 8 Oct 2025 10:46:52 
-0500 Subject: [PATCH 5/5] Update with new ES queries for completeness --- .../multilingual_embedding.ipynb | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb index 308fabb5..1414f79c 100644 --- a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb +++ b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb @@ -189,6 +189,233 @@ "\n", "print(\"Indexing complete!\")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are going to create a pipeline to vectorize the descriptions text_field through our inference text embedding model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_body = {\n", + " \"description\": \"Pipeline to run the descriptions text_field through our inference text embedding model\",\n", + " \"processors\": [\n", + " {\n", + " \"set\": {\n", + " \"field\": \"temp_desc\",\n", + " \"value\": \"passage: {{description}}\"\n", + " }\n", + " },\n", + " {\n", + " \"inference\": {\n", + " \"field_map\": {\n", + " \"temp_desc\": \"text_field\"\n", + " },\n", + " \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n", + " \"target_field\": \"vector_description\"\n", + " }\n", + " },\n", + " {\n", + " \"remove\": {\n", + " \"field\": \"temp_desc\"\n", + " }\n", + " }\n", + " ]\n", + "}\n", + "\n", + "try:\n", + " es.ingest.put_pipeline(id=\"vectorize_descriptions\", body=pipeline_body)\n", + " print(\"Pipeline 'vectorize_descriptions' created successfully.\")\n", + "except Exception as e:\n", + " print(f\"Error creating pipeline: {str(e)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We also need to create a new Elasticsearch index with the specified vector mapping."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "index_body = {\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"description\": {\n", + " \"type\": \"text\"\n", + " },\n", + " \"en\": {\n", + " \"type\": \"text\"\n", + " },\n", + " \"image_url\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"language\": {\n", + " \"type\": \"keyword\"\n", + " },\n", + " \"vector_description.predicted_value\": {\n", + " \"type\": \"dense_vector\",\n", + " \"dims\": 384,\n", + " \"index\": True,\n", + " \"similarity\": \"cosine\",\n", + " \"index_options\": {\n", + " \"type\": \"bbq_hnsw\"\n", + " }\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "try:\n", + " es.indices.create(index=\"coco_multi\", body=index_body)\n", + " print(\"Index 'coco_multi' created successfully.\")\n", + "except Exception as e:\n", + " print(f\"Error creating index: {str(e)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we just need to run the pipeline to bring and vectorize the data into the Elasticsearch index." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch\n", + "\n", + "# Reuse the authenticated es client created earlier; a bare Elasticsearch() would drop the endpoint and API key\n", + "\n", + "reindex_body = {\n", + " \"source\": {\n", + " \"index\": \"coco\"\n", + " },\n", + " \"dest\": {\n", + " \"index\": \"coco_multi\",\n", + " \"pipeline\": \"vectorize_descriptions\"\n", + " }\n", + "}\n", + "\n", + "response = es.reindex(\n", + " body=reindex_body,\n", + " # Not waiting for completion here cause this process might take a while\n", + " wait_for_completion=False\n", + ")\n", + "\n", + "print(\"Reindex task started. Task info:\")\n", + "print(response)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Voilà, now let's try some queries and have some fun!"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_body = {\n", + " \"size\": 10,\n", + " \"_source\": [\n", + " \"description\", \"language\", \"en\"\n", + " ],\n", + " \"knn\": {\n", + " \"field\": \"vector_description.predicted_value\",\n", + " \"k\": 10,\n", + " \"num_candidates\": 100,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n", + " \"model_text\": \"query: kitty\"\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "response = es.search(index=\"coco_multi\", body=query_body)\n", + "print(response)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_body = {\n", + " \"size\": 100,\n", + " \"_source\": [\n", + " \"description\", \"language\", \"en\"\n", + " ],\n", + " \"knn\": {\n", + " \"field\": \"vector_description.predicted_value\",\n", + " \"k\": 50,\n", + " \"num_candidates\": 1000,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n", + " \"model_text\": \"query: kitty lying on something\"\n", + " }\n", + " }\n", + " }\n", + "}\n", + "\n", + "response = es.search(index=\"coco_multi\", body=query_body)\n", + "print(response)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_body = {\n", + " \"size\": 100,\n", + " \"_source\": [\n", + " \"description\", \"language\", \"en\"\n", + " ],\n", + " \"knn\": {\n", + " \"field\": \"vector_description.predicted_value\",\n", + " \"k\": 50,\n", + " \"num_candidates\": 1000,\n", + " \"query_vector_builder\": {\n", + " \"text_embedding\": {\n", + " \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n", + " \"model_text\": \"query: 고양이\"\n", + " 
}\n", + " }\n", + " }\n", + "}\n", + "\n", + "response = es.search(index=\"coco_multi\", body=query_body)\n", + "print(response)\n" + ] } ], "metadata": {