# Lost In Translation? Multilingual Embedding Models Are All You Need*

This notebook by Quynh Nguyen shows how cross-lingual vector search overcomes language barriers, enabling you to query and retrieve information in any language from both single and multilingual datasets. It accompanies the piece *Lost In Translation? Multilingual Embedding Models Are All You Need* from [Elasticsearch Labs](https://www.elastic.co/search-labs).

In [None]:
import requests
import json
import os

### Download multilingual coco dataset
### Here we are retrieving first 100 rows for this example
### Alternatively, you can use dataset library from Hugging Face
url = "https://datasets-server.huggingface.co/rows?dataset=romrawinjp%2Fmultilingual-coco&config=default&split=restval&offset=0&length=100"
# Make the GET request
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Define the output file path
    output_file = "multilingual_coco_sample.json"

    # Save the JSON data to a file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"Data successfully downloaded and saved to {output_file}")
else:
    print(f"Failed to download data: {response.status_code}")
    print(response.text)

Data successfully downloaded and saved to multilingual_coco_sample.json


In [None]:
from getpass import getpass

# Get credentials securely for localhost Elasticsearch
print("Enter your Elasticsearch credentials:")
cloud_id = input("Enter your cloud_id: ")
api_key = getpass("Enter your api_key: ")

In [None]:
from elasticsearch import Elasticsearch

try:
    es = Elasticsearch(
        hosts=[{"host": "localhost", "port": 9200, "scheme": "https"}],
        basic_auth=("elastic", "qaf_admin"),
        verify_certs=False,  # Set to True if you have valid SSL certificates
        # Alternatively, you can use Elastic cloud_id and api_key
        # api_key=getpass("API Key: ")
        # cloud_id=getpass("Cloud ID: "),
    )

    # Test the connection
    if not es.ping():
        raise Exception("Failed to connect to Elasticsearch")

    print("Successfully connected to Elasticsearch")

except Exception as e:
    print(f"Error connecting to Elasticsearch: {e}")
    print("Please check your credentials")
    raise

Successfully connected to Elasticsearch


  _transport = transport_class(


In [16]:
# Define the index mapping
index_name = "coco"
mapping = {
    "mappings": {
        "properties": {
            "language": {"type": "keyword"},
            "description": {"type": "text"},
            "en": {"type": "text"},
            "image_url": {"type": "keyword"},
        }
    }
}

# Create the index if it doesn't exist
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name, body=mapping)

# Load the JSON data
with open("./multilingual_coco_sample.json", "r") as f:
    data = json.load(f)

rows = data["rows"]
# List of languages to process
languages = ["en", "es", "de", "it", "vi", "th"]

bulk_data = []
for obj in rows:
    row = obj["row"]
    image_url = row.get("image")
    image_url = image_url["src"]

    # Process each language
    for lang in languages:
        # Skip if language not present in this row
        if lang not in row:
            continue

        # Get all descriptions for this language
        descriptions = row[lang]
        first_eng_caption = row["en"][0]

        # Prepare bulk indexing data
        for description in descriptions:
            if description == "":
                continue
            # Add index operation
            bulk_data.append({"index": {"_index": index_name}})
            # Add document
            bulk_data.append(
                {
                    "language": lang,
                    "description": description,
                    "en": first_eng_caption,
                    "image_url": image_url,
                }
            )

# Perform bulk indexing
if bulk_data:
    try:
        response = es.bulk(operations=bulk_data)
        if response["errors"]:
            print("Some documents failed to index")
        else:
            print(f"Successfully bulk indexed {len(bulk_data)} documents")
    except Exception as e:
        print(f"Error during bulk indexing: {str(e)}")

print("Indexing complete!")

Successfully bulk indexed 4840 documents
Indexing complete!


