# Lost In Translation? Multilingual Embedding Models Are All You Need*

This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both single and multilingual datasets. It accompanies the piece *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs).

In [7]:
import requests
import json

### Download multilingual coco dataset
### Here we are retrieving first 100 rows for this example
### Alternatively, you can use dataset library from Hugging Face
url = "https://datasets-server.huggingface.co/rows?dataset=romrawinjp%2Fmultilingual-coco&config=default&split=restval&offset=0&length=100"
# Make the GET request
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Define the output file path
    output_file = "multilingual_coco_sample.json"

    # Save the JSON data to a file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Data successfully downloaded and saved to {output_file}")
else:
    print(f"Failed to download data: {response.status_code}")
    print(response.text)

Data successfully downloaded and saved to multilingual_coco_sample.json


In [3]:
from getpass import getpass

# Get credentials securely for localhost Elasticsearch
print("Enter your Elasticsearch credentials:")
elastic_endpoint = input("Enter your Elastic endpoint: ")
api_key = getpass("Enter your API key: ")

Enter your Elasticsearch credentials:


In [None]:
from elasticsearch import Elasticsearch

try:
    es = Elasticsearch(hosts=[elastic_endpoint], api_key=api_key)

    # Test the connection
    if not es.ping():
        raise Exception("Failed to connect to Elasticsearch")

    print("Successfully connected to Elasticsearch")

except Exception as e:
    print(f"Error connecting to Elasticsearch: {e}")
    print("Please check your credentials")
    raise

Successfully connected to Elasticsearch


In [6]:
# Define the index mapping
index_name = "coco"
mapping = {
    "mappings": {
        "properties": {
            "language": {"type": "keyword"},
            "description": {"type": "text"},
            "en": {"type": "text"},
            "image_url": {"type": "keyword"},
        }
    }
}

# Create the index if it doesn't exist
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)

# Load the JSON data
with open("./multilingual_coco_sample.json", "r") as f:
    data = json.load(f)

rows = data["rows"]
# List of languages to process
languages = ["en", "es", "de", "it", "vi", "th"]

bulk_data = []
for obj in rows:
    row = obj["row"]
    image_url = row.get("image")
    image_url = image_url["src"]

    # Process each language
    for lang in languages:
        # Skip if language not present in this row
        if lang not in row:
            continue

        # Get all descriptions for this language
        descriptions = row[lang]
        first_eng_caption = row["en"][0]

        # Prepare bulk indexing data
        for description in descriptions:
            if description == "":
                continue
            # Add index operation
            bulk_data.append({"index": {"_index": index_name}})
            # Add document
            bulk_data.append(
                {
                    "language": lang,
                    "description": description,
                    "en": first_eng_caption,
                    "image_url": image_url,
                }
            )

# Perform bulk indexing
if bulk_data:
    try:
        response = es.bulk(operations=bulk_data)
        if response["errors"]:
            print("Some documents failed to index")
        else:
            print(f"Successfully bulk indexed {len(bulk_data)} documents")
    except Exception as e:
        print(f"Error during bulk indexing: {str(e)}")

print("Indexing complete!")

Successfully bulk indexed 4840 documents
Indexing complete!


Now we are going to create a pipeline to vectorize the descriptions text_field through our inference text embedding model.

In [None]:
pipeline_body = {
    "description": "Pipeline to run the descriptions text_field through our inference text embedding model",
    "processors": [
        {
            "set": {
                "field": "temp_desc",
                "value": "passage: {{description}}"
            }
        },
        {
            "inference": {
                "field_map": {
                    "temp_desc": "text_field"
                },
                "model_id": ".multilingual-e5-small_linux-x86_64_search",
                "target_field": "vector_description"
            }
        },
        {
            "remove": {
                "field": "temp_desc"
            }
        }
    ]
}

try:
    es.ingest.put_pipeline(id="vectorize_descriptions", body=pipeline_body)
    print("Pipeline 'vectorize_descriptions' created successfully.")
except Exception as e:
    print(f"Error creating pipeline: {str(e)}")


We also need to create a new Elasticsearch index with the specified vector mapping.

In [None]:
index_body = {
    "mappings": {
        "properties": {
            "description": {
                "type": "text"
            },
            "en": {
                "type": "text"
            },
            "image_url": {
                "type": "keyword"
            },
            "language": {
                "type": "keyword"
            },
            "vector_description.predicted_value": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine",
                "index_options": {
                    "type": "bbq_hnsw"
                }
            }
        }
    }
}

try:
    es.indices.create(index="coco_multi", body=index_body)
    print("Index 'coco_multi' created successfully.")
except Exception as e:
    print(f"Error creating index: {str(e)}")


Now, we just need to run the pipeline to bring and vectorize the data into the Elasticsearch index.

In [None]:
from elasticsearch import Elasticsearch

es = Elasticsearch()

reindex_body = {
    "source": {
        "index": "coco"
    },
    "dest": {
        "index": "coco_multilingual",
        "pipeline": "vectorize_descriptions"
    }
}

response = es.reindex(
    body=reindex_body,
    # Not waiting for completion here cause this process might take a while
    wait_for_completion=False
)

print("Reindex task started. Task info:")
print(response)


Voilà, now let's try some queries and have some fun!

In [None]:
query_body = {
    "size": 10,
    "_source": [
        "description", "language", "en"
    ],
    "knn": {
        "field": "vector_description.predicted_value",
        "k": 10,
        "num_candidates": 100,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": ".multilingual-e5-small_linux-x86_64_search",
                "model_text": "query: kitty"
            }
        }
    }
}

response = es.search(index="coco_multi", body=query_body)
print(response)


In [None]:
query_body = {
    "size": 100,
    "_source": [
        "description", "language", "en"
    ],
    "knn": {
        "field": "vector_description.predicted_value",
        "k": 50,
        "num_candidates": 1000,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": ".multilingual-e5-small_linux-x86_64_search",
                "model_text": "query: kitty lying on something"
            }
        }
    }
}

response = es.search(index="coco_multi", body=query_body)
print(response)


In [None]:
query_body = {
    "size": 100,
    "_source": [
        "description", "language", "en"
    ],
    "knn": {
        "field": "vector_description.predicted_value",
        "k": 50,
        "num_candidates": 1000,
        "query_vector_builder": {
            "text_embedding": {
                "model_id": ".multilingual-e5-small_linux-x86_64_search",
                "model_text": "query: 고양이"
            }
        }
    }
}

response = es.search(index="coco_multi", body=query_body)
print(response)
