diff --git a/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb
new file mode 100644
index 00000000..1414f79c
--- /dev/null
+++ b/supporting-blog-content/multilingual-embedding/multilingual_embedding.ipynb
@@ -0,0 +1,442 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Lost In Translation? Multilingual Embedding Models Are All You Need*\n",
+    "\n",
+    "This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both monolingual and multilingual datasets. It accompanies the piece *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Data successfully downloaded and saved to multilingual_coco_sample.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "import requests\n",
+    "import json\n",
+    "\n",
+    "### Download the multilingual COCO dataset\n",
+    "### Here we retrieve the first 100 rows for this example\n",
+    "### Alternatively, you can use the `datasets` library from Hugging Face\n",
+    "url = \"https://datasets-server.huggingface.co/rows?dataset=romrawinjp%2Fmultilingual-coco&config=default&split=restval&offset=0&length=100\"\n",
+    "# Make the GET request\n",
+    "response = requests.get(url)\n",
+    "\n",
+    "# Check if the request was successful\n",
+    "if response.status_code == 200:\n",
+    "    # Parse the JSON response\n",
+    "    data = response.json()\n",
+    "\n",
+    "    # Define the output file path\n",
+    "    output_file = \"multilingual_coco_sample.json\"\n",
+    "\n",
+    "    # Save the JSON data to a file\n",
+    "    with open(output_file, \"w\", encoding=\"utf-8\") as f:\n",
+    "        json.dump(data, f, indent=4, ensure_ascii=False)\n",
+    "\n",
+    "    print(f\"Data successfully downloaded and saved to {output_file}\")\n",
+    "else:\n",
+    "    print(f\"Failed to download data: {response.status_code}\")\n",
+    "    print(response.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Enter your Elasticsearch credentials:\n"
+     ]
+    }
+   ],
+   "source": [
+    "from getpass import getpass\n",
+    "\n",
+    "# Collect the Elasticsearch endpoint and API key securely\n",
+    "print(\"Enter your Elasticsearch credentials:\")\n",
+    "elastic_endpoint = input(\"Enter your Elastic endpoint: \")\n",
+    "api_key = getpass(\"Enter your API key: \")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Successfully connected to Elasticsearch\n"
+     ]
+    }
+   ],
+   "source": [
+    "from elasticsearch import Elasticsearch\n",
+    "\n",
+    "try:\n",
+    "    es = Elasticsearch(hosts=[elastic_endpoint], api_key=api_key)\n",
+    "\n",
+    "    # Test the connection\n",
+    "    if not es.ping():\n",
+    "        raise Exception(\"Failed to connect to Elasticsearch\")\n",
+    "\n",
+    "    print(\"Successfully connected to Elasticsearch\")\n",
+    "\n",
+    "except Exception as e:\n",
+    "    print(f\"Error connecting to Elasticsearch: {e}\")\n",
+    "    print(\"Please check your credentials\")\n",
+    "    raise"
+   ]
+  },
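+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Before indexing, it helps to peek at the shape of the downloaded sample. This is a minimal sanity-check sketch, assuming the download cell above succeeded and that each row carries one list of captions per language, as the dataset's `rows` endpoint returns."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Quick look at the downloaded sample: list the fields of the first row\n",
+    "# and print one caption per language we plan to index\n",
+    "with open(\"multilingual_coco_sample.json\", \"r\", encoding=\"utf-8\") as f:\n",
+    "    sample = json.load(f)\n",
+    "\n",
+    "first_row = sample[\"rows\"][0][\"row\"]\n",
+    "print(\"Available fields:\", list(first_row.keys()))\n",
+    "for lang in [\"en\", \"es\", \"de\", \"it\", \"vi\", \"th\"]:\n",
+    "    captions = first_row.get(lang) or []\n",
+    "    if captions:\n",
+    "        print(f\"{lang}: {captions[0]}\")"
+   ]
+  },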
+ "text": [ + "Successfully bulk indexed 4840 documents\n", + "Indexing complete!\n" + ] + } + ], + "source": [ + "# Define the index mapping\n", + "index_name = \"coco\"\n", + "mapping = {\n", + " \"mappings\": {\n", + " \"properties\": {\n", + " \"language\": {\"type\": \"keyword\"},\n", + " \"description\": {\"type\": \"text\"},\n", + " \"en\": {\"type\": \"text\"},\n", + " \"image_url\": {\"type\": \"keyword\"},\n", + " }\n", + " }\n", + "}\n", + "\n", + "# Create the index if it doesn't exist\n", + "if not es.indices.exists(index=index_name):\n", + " es.indices.create(index=index_name, body=mapping)\n", + "\n", + "# Load the JSON data\n", + "with open(\"./multilingual_coco_sample.json\", \"r\") as f:\n", + " data = json.load(f)\n", + "\n", + "rows = data[\"rows\"]\n", + "# List of languages to process\n", + "languages = [\"en\", \"es\", \"de\", \"it\", \"vi\", \"th\"]\n", + "\n", + "bulk_data = []\n", + "for obj in rows:\n", + " row = obj[\"row\"]\n", + " image_url = row.get(\"image\")\n", + " image_url = image_url[\"src\"]\n", + "\n", + " # Process each language\n", + " for lang in languages:\n", + " # Skip if language not present in this row\n", + " if lang not in row:\n", + " continue\n", + "\n", + " # Get all descriptions for this language\n", + " descriptions = row[lang]\n", + " first_eng_caption = row[\"en\"][0]\n", + "\n", + " # Prepare bulk indexing data\n", + " for description in descriptions:\n", + " if description == \"\":\n", + " continue\n", + " # Add index operation\n", + " bulk_data.append({\"index\": {\"_index\": index_name}})\n", + " # Add document\n", + " bulk_data.append(\n", + " {\n", + " \"language\": lang,\n", + " \"description\": description,\n", + " \"en\": first_eng_caption,\n", + " \"image_url\": image_url,\n", + " }\n", + " )\n", + "\n", + "# Perform bulk indexing\n", + "if bulk_data:\n", + " try:\n", + " response = es.bulk(operations=bulk_data)\n", + " if response[\"errors\"]:\n", + " print(\"Some documents failed to index\")\n", + " else:\n", + " print(f\"Successfully bulk indexed {len(bulk_data)} documents\")\n", + " except Exception as e:\n", + " print(f\"Error during bulk indexing: {str(e)}\")\n", + "\n", + "print(\"Indexing complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now are going to create a pipeline to vectorize the descriptions text_field through our inference text embedding model." 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we are going to create an ingest pipeline that runs each description through the inference text embedding model. The `set` processor prepends the `passage: ` prefix that E5 models expect on documents, the `inference` processor maps our temporary field to the model's `text_field` input, and the `remove` processor cleans up the temporary field afterwards."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pipeline_body = {\n",
+    "    \"description\": \"Pipeline to run the descriptions through our inference text embedding model\",\n",
+    "    \"processors\": [\n",
+    "        {\n",
+    "            \"set\": {\n",
+    "                \"field\": \"temp_desc\",\n",
+    "                \"value\": \"passage: {{description}}\"\n",
+    "            }\n",
+    "        },\n",
+    "        {\n",
+    "            \"inference\": {\n",
+    "                \"field_map\": {\"temp_desc\": \"text_field\"},\n",
+    "                \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "                \"target_field\": \"vector_description\"\n",
+    "            }\n",
+    "        },\n",
+    "        {\n",
+    "            \"remove\": {\"field\": \"temp_desc\"}\n",
+    "        }\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "try:\n",
+    "    es.ingest.put_pipeline(id=\"vectorize_descriptions\", body=pipeline_body)\n",
+    "    print(\"Pipeline 'vectorize_descriptions' created successfully.\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Error creating pipeline: {str(e)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We also need to create a new Elasticsearch index whose mapping stores the embeddings as a `dense_vector`. The `.multilingual-e5-small` model produces 384-dimensional vectors, and the `bbq_hnsw` index option enables Better Binary Quantization to keep the vector index compact."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "index_body = {\n",
+    "    \"mappings\": {\n",
+    "        \"properties\": {\n",
+    "            \"description\": {\"type\": \"text\"},\n",
+    "            \"en\": {\"type\": \"text\"},\n",
+    "            \"image_url\": {\"type\": \"keyword\"},\n",
+    "            \"language\": {\"type\": \"keyword\"},\n",
+    "            \"vector_description.predicted_value\": {\n",
+    "                \"type\": \"dense_vector\",\n",
+    "                \"dims\": 384,\n",
+    "                \"index\": True,\n",
+    "                \"similarity\": \"cosine\",\n",
+    "                \"index_options\": {\"type\": \"bbq_hnsw\"}\n",
+    "            }\n",
+    "        }\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "try:\n",
+    "    es.indices.create(index=\"coco_multi\", body=index_body)\n",
+    "    print(\"Index 'coco_multi' created successfully.\")\n",
+    "except Exception as e:\n",
+    "    print(f\"Error creating index: {str(e)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we just need to reindex the data from `coco` into `coco_multi`, running it through the pipeline so each description is vectorized on the way in."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "reindex_body = {\n",
+    "    \"source\": {\"index\": \"coco\"},\n",
+    "    \"dest\": {\"index\": \"coco_multi\", \"pipeline\": \"vectorize_descriptions\"},\n",
+    "}\n",
+    "\n",
+    "response = es.reindex(\n",
+    "    body=reindex_body,\n",
+    "    # Not waiting for completion here because this process might take a while\n",
+    "    wait_for_completion=False,\n",
+    ")\n",
+    "\n",
+    "print(\"Reindex task started. Task info:\")\n",
+    "print(response)\n"
+   ]
+  },
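+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Because the reindex runs asynchronously, we can poll the Tasks API until it finishes before querying. A minimal sketch, assuming the `task` id returned by the reindex call above:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "# Poll the async reindex task until it reports completion\n",
+    "task_id = response[\"task\"]\n",
+    "while True:\n",
+    "    task_info = es.tasks.get(task_id=task_id)\n",
+    "    if task_info.get(\"completed\"):\n",
+    "        print(\"Reindex finished:\", task_info[\"task\"][\"status\"])\n",
+    "        break\n",
+    "    created = task_info[\"task\"][\"status\"].get(\"created\", 0)\n",
+    "    print(f\"Still reindexing... {created} documents created so far\")\n",
+    "    time.sleep(5)"
+   ]
+  },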
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Voilà, now let's try some queries and have some fun! Note the `query: ` prefix on the search text; it is the counterpart of the `passage: ` prefix E5 expects on documents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_body = {\n",
+    "    \"size\": 10,\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
+    "    \"knn\": {\n",
+    "        \"field\": \"vector_description.predicted_value\",\n",
+    "        \"k\": 10,\n",
+    "        \"num_candidates\": 100,\n",
+    "        \"query_vector_builder\": {\n",
+    "            \"text_embedding\": {\n",
+    "                \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "                \"model_text\": \"query: kitty\"\n",
+    "            }\n",
+    "        }\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "response = es.search(index=\"coco_multi\", body=query_body)\n",
+    "print(response)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The embeddings capture meaning rather than exact words, so a more specific query narrows the results. We can even search in a language that isn't in the dataset at all: \"고양이\" is Korean for \"cat\"."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_body = {\n",
+    "    \"size\": 100,\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
+    "    \"knn\": {\n",
+    "        \"field\": \"vector_description.predicted_value\",\n",
+    "        \"k\": 50,\n",
+    "        \"num_candidates\": 1000,\n",
+    "        \"query_vector_builder\": {\n",
+    "            \"text_embedding\": {\n",
+    "                \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "                \"model_text\": \"query: kitty lying on something\"\n",
+    "            }\n",
+    "        }\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "response = es.search(index=\"coco_multi\", body=query_body)\n",
+    "print(response)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query_body = {\n",
+    "    \"size\": 100,\n",
+    "    \"_source\": [\"description\", \"language\", \"en\"],\n",
+    "    \"knn\": {\n",
+    "        \"field\": \"vector_description.predicted_value\",\n",
+    "        \"k\": 50,\n",
+    "        \"num_candidates\": 1000,\n",
+    "        \"query_vector_builder\": {\n",
+    "            \"text_embedding\": {\n",
+    "                \"model_id\": \".multilingual-e5-small_linux-x86_64_search\",\n",
+    "                \"model_text\": \"query: 고양이\"\n",
+    "            }\n",
+    "        }\n",
+    "    }\n",
+    "}\n",
+    "\n",
+    "response = es.search(index=\"coco_multi\", body=query_body)\n",
+    "print(response)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}