<a href="https://colab.research.google.com/github/deltorobarba/sciences/blob/master/ai_translation_tuning_llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Test LLMs as Translators**

*Task: Automatically translate and evaluate with different Gemini models for multiple languages*

In [None]:
# Notebook configuration. Edit the values marked below before running anything else;
# every later cell reads these module-level constants.

PROJECT_ID = "YOUR-PROJECT-ID"               # <--- UPDATE THIS: Google Cloud project used by the Vertex AI and GCS clients
LOCATION = "us-central1"                     # region passed to vertexai.init() and genai.Client()
BUCKET_NAME = "translations-eval" # <--- UPDATE THIS: bucket holding the TMX file; dataset splits are uploaded here too
BUCKET_URI = f"gs://{BUCKET_NAME}"           # gs:// form of the bucket name
TMX_GCS_PATH = "samples.tmx"    # <--- UPLOAD THIS: object name of the TMX file inside BUCKET_NAME
LOCAL_TMX_FILE = "samples.tmx"               # local filename the TMX file is downloaded to

###### *Setup and Dependencies*

In [None]:
%pip install --upgrade google-cloud-aiplatform google-cloud-storage -q
%pip install matplotlib seaborn langdetect -q
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

!pip install google-cloud-translate==2.0.1 -q
!pip install --upgrade google-cloud-translate -q

import xml.etree.ElementTree as ET
import json
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from google.cloud import aiplatform, storage
from google.cloud import translate_v3 as translate
import vertexai
from vertexai.tuning import sft
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
import re
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# For language detection
from langdetect import detect, detect_langs

In [None]:
# Initialize the Vertex AI SDK with the project and region from the configuration cell.
# (The same call is repeated in the cell that creates the google-genai `client`.)
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Utility Functions
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Copy one object out of a GCS bucket into a local file.

    Note: the preprocessing cell further down defines another function with this
    same name; once that cell has run, it replaces this definition.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` prefix).
        source_blob_name: Object name inside the bucket.
        destination_file_name: Local path to write the object to.

    Returns:
        ``destination_file_name``, unchanged.
    """
    print(f"--- Downloading {source_blob_name} ---")
    # The client is bound to the project set in the configuration cell.
    gcs = storage.Client(project=PROJECT_ID)
    gcs.bucket(bucket_name).blob(source_blob_name).download_to_filename(destination_file_name)
    print(f"Successfully downloaded to {destination_file_name}")
    return destination_file_name

def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Copy a local file into a GCS bucket.

    Note: the preprocessing cell further down defines another function with this
    same name; once that cell has run, it replaces this definition.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` prefix).
        source_file_name: Local path of the file to upload.
        destination_blob_name: Object name to create inside the bucket.

    Returns:
        The ``gs://bucket/object`` URI of the uploaded object.
    """
    print(f"--- Uploading {source_file_name} ---")
    # The client is bound to the project set in the configuration cell.
    gcs = storage.Client(project=PROJECT_ID)
    target = gcs.bucket(bucket_name).blob(destination_blob_name)
    target.upload_from_filename(source_file_name)
    uploaded_uri = f"gs://{bucket_name}/{destination_blob_name}"
    print(f"Successfully uploaded to {uploaded_uri}")
    return uploaded_uri

In [None]:
from google import genai
from google.genai.types import (
    FunctionDeclaration,
    GenerateContentConfig,
    GoogleSearch,
    HarmBlockThreshold,
    HarmCategory,
    MediaResolution,
    Part,
    Retrieval,
    SafetySetting,
    Tool,
    ToolCodeExecution,
    VertexAISearch,
)
from IPython.display import HTML, Markdown, display

In [None]:
import os
import vertexai

# google-genai client routed through Vertex AI; `client` is the object the later
# translation calls use (client.models.generate_content).
client = genai.Client(vertexai=True, project=PROJECT_ID, location=LOCATION)
# Repeats the vertexai.init() call made in the earlier utility cell with the same arguments.
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
!pip install google-generativeai seaborn matplotlib -q

import pandas as pd
# NOTE(review): this rebinds the name `genai`, which an earlier cell imported with
# `from google import genai` and used for `genai.Client(...)`. The already-created
# `client` object is unaffected, but re-running the client cell after this cell will
# resolve `genai` to this package instead. Consider a distinct alias or dropping this
# import if it is unused.
import google.generativeai as genai
from scipy.stats import entropy
import numpy as np
import json
import os

In [None]:
# Report which backend the `client` created above is talking to.
# NOTE: this reads `client._api_client`, a private attribute of the google-genai
# client, so it may break with SDK upgrades.
if client.vertexai:
  if client._api_client.project:
    # Regular Vertex AI mode: a project and location are configured.
    print(
        f"Using Vertex AI with project: {client._api_client.project} in location:"
        f" {client._api_client.location}"
    )
  elif client._api_client.api_key:
    # Express mode: only an API key is configured; show just its first and last 5 characters.
    print(
        "Using Vertex AI in express mode with API key:"
        f" {client._api_client.api_key[:5]}...{client._api_client.api_key[-5:]}"
    )
else:
  print("Using Gemini Developer API.")

Using Vertex AI with project: lunar-352813 in location: us-central1


In [None]:
# Generic test if LLM model works
# Smoke test: send one fixed prompt through `client` and render the reply as Markdown.
# The trailing `# @param` annotation is a Colab form-field marker; keep it on this line.
MODEL_ID = "gemini-2.5-flash"  # @param {type: "string"}
response = client.models.generate_content(
    model=MODEL_ID, contents="What is the name of the largest search engine?"
)

display(Markdown(response.text))

###### *Preprocessing*

In [None]:
# @title Preprocessing (Load TMX file, clean up, create ground-truth tables)

import json
import os
import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict
from sklearn.model_selection import train_test_split
from google.cloud import storage

### TMX Data Loading
class TMXLoader:
    """Parse a TMX file and extract source/target translation pairs per language.

    The file is parsed eagerly in ``__init__``. ``source_lang`` stays ``None`` until
    ``detect_source_language`` (or ``extract_data``, which calls it) has run.
    """

    def __init__(self, tmx_path: str):
        """Parse ``tmx_path`` with ElementTree and remember its root element."""
        self.tmx_path = tmx_path
        self.tree = ET.parse(tmx_path)
        self.root = self.tree.getroot()
        # ElementTree exposes the `xml:lang` attribute under its namespaced (Clark) name.
        self.lang_attr = '{http://www.w3.org/XML/1998/namespace}lang'
        self.source_lang = None

    def detect_source_language(self):
        """Pick the source language from the ``xml:lang`` values found on ``<tuv>`` elements.

        English variants (codes starting with ``en``, compared case-insensitively) are
        preferred; otherwise the alphabetically first language code is used. Candidates
        are sorted before choosing, so the result is the same on every run even when
        several English variants are present (iterating a ``set`` gives no such guarantee).
        If the file has no language-tagged ``<tuv>``, ``source_lang`` stays ``None``.
        """
        all_langs = {
            tuv.get(self.lang_attr)
            for tuv in self.root.findall('.//tuv')
            if tuv.get(self.lang_attr)
        }
        en_variants = sorted(lang for lang in all_langs if lang.lower().startswith('en'))
        if en_variants:
            self.source_lang = en_variants[0]
        elif all_langs:
            self.source_lang = sorted(all_langs)[0]
        print(f"Detected Source Language: {self.source_lang}")

    def extract_data(self):
        """Extract all source-target translation pairs from the TMX file.

        Returns:
            A ``defaultdict(list)`` mapping each non-source language code to a list of
            dicts with keys ``source``, ``target``, ``source_words`` and ``target_words``
            (word counts are whitespace-split token counts of the stripped texts).
            Translation units without a usable source segment are skipped.
        """
        if not self.source_lang:
            self.detect_source_language()

        data = defaultdict(list)
        print("\nExtracting translation data from TMX file...")

        for tu in self.root.findall('.//tu'):
            # Find the source text within the translation unit
            source_tuv = tu.find(f".//tuv[@{self.lang_attr}='{self.source_lang}']")
            if source_tuv is None:
                continue
            source_seg = source_tuv.find('seg')
            # NOTE(review): `.text` only holds the text before the first child element,
            # so a <seg> that contains inline markup is read up to that markup only.
            if source_seg is None or source_seg.text is None:
                continue
            source_text = source_seg.text.strip()

            # Find all corresponding target texts and create pairs
            for tuv in tu.findall('.//tuv'):
                lang_code = tuv.get(self.lang_attr)
                if not lang_code or lang_code == self.source_lang:
                    continue

                seg = tuv.find('seg')
                if seg is not None and seg.text:
                    target_text = seg.text.strip()
                    data[lang_code].append({
                        'source': source_text,
                        'target': target_text,
                        'source_words': len(source_text.split()),
                        'target_words': len(target_text.split())
                    })

        print(f"Successfully extracted data for {len(data)} languages.")
        return data

### Google Cloud Storage
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Download one object from a GCS bucket to a local file.

    This definition replaces the same-named helper from the earlier utility cell.
    Like that helper, it returns the local path so callers written against either
    version keep working.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` prefix).
        source_blob_name: Object name inside the bucket.
        destination_file_name: Local path to write the object to.

    Returns:
        ``destination_file_name``, unchanged.
    """
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(source_blob_name)
    blob.download_to_filename(destination_file_name)
    print(f"Downloaded GCS file '{source_blob_name}' to '{destination_file_name}'")
    return destination_file_name

def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a GCS bucket, then delete the local copy.

    Args:
        bucket_name: Name of the bucket (without the ``gs://`` prefix).
        source_file_name: Local path of the file to upload; removed after the upload.
        destination_blob_name: Object name to create inside the bucket.

    Returns:
        The ``gs://bucket/object`` URI of the uploaded object.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)
    uploaded_uri = f"gs://{bucket_name}/{destination_blob_name}"
    print(f"   Successfully uploaded {source_file_name} to {uploaded_uri}")
    # The local file is only a staging copy; drop it once the upload has succeeded.
    os.remove(source_file_name)
    return uploaded_uri

### Data Processing
def create_tuning_data_from_samples(samples, lang_code):
    """Turn translation samples into two-turn 'contents' records for tuning.

    Each record holds a user turn with the text ``"English: <source> <lang_code>: "``
    and a model turn with the sample's target text.

    Args:
        samples: Iterable of dicts that have at least ``source`` and ``target`` keys.
        lang_code: Target language code embedded in the user prompt.

    Returns:
        A list with one ``{"contents": [...]}`` dict per sample, in input order.
    """
    def _as_record(sample):
        user_turn = {"role": "user", "parts": [{"text": f"English: {sample['source']} {lang_code}: "}]}
        model_turn = {"role": "model", "parts": [{"text": sample['target']}]}
        return {"contents": [user_turn, model_turn]}

    return [_as_record(sample) for sample in samples]

def preprocess_and_upload_language(lang_code, samples):
    """Filter, split and upload the samples of one language.

    Steps: drop duplicate (source, target) pairs and pairs where either side has
    fewer than three words, convert the rest to tuning records, split them 80/20
    with a fixed seed, write each split to a JSONL file and upload it under
    ``datasets/`` in ``BUCKET_NAME``.

    Args:
        lang_code: Target language code; ``:`` is replaced by ``_`` in file names.
        samples: List of dicts with ``source``, ``target``, ``source_words`` and
            ``target_words`` keys.

    Returns:
        A dict with ``lang_code``, ``train_uri``, ``eval_uri``, ``train_count`` and
        ``eval_count``, or ``None`` when fewer than 10 samples survive filtering.
    """
    print(f"\n--- Processing {lang_code}... ---")

    # Keep the first qualifying occurrence of every (source, target) pair.
    first_by_pair = {}
    for sample in samples:
        if sample['source_words'] < 3 or sample['target_words'] < 3:
            continue
        first_by_pair.setdefault((sample['source'], sample['target']), sample)
    unique_samples = list(first_by_pair.values())

    print(f"   Filtered from {len(samples)} to {len(unique_samples)} unique quality samples.")

    if len(unique_samples) < 10:
        print(f"   ⚠️ Skipping {lang_code} - insufficient samples after filtering.")
        return None

    # Convert to tuning format, then split with a fixed seed so reruns give the same split.
    tuning_records = create_tuning_data_from_samples(unique_samples, lang_code)
    train_data, eval_data = train_test_split(tuning_records, test_size=0.2, random_state=42)
    print(f"   Split data: {len(train_data)} training, {len(eval_data)} evaluation.")

    # Write and upload the training split first, then the evaluation split.
    file_stem = lang_code.replace(':', '_')
    uploaded_uris = {}
    for split_name, records in (("train", train_data), ("eval", eval_data)):
        local_name = f"{file_stem}_tuning_{split_name}_split.jsonl"
        with open(local_name, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(item) + '\n' for item in records)
        uploaded_uris[split_name] = upload_to_gcs(BUCKET_NAME, local_name, f"datasets/{local_name}")

    return {
        'lang_code': lang_code,
        'train_uri': uploaded_uris["train"],
        'eval_uri': uploaded_uris["eval"],
        'train_count': len(train_data),
        'eval_count': len(eval_data)
    }

# MAIN EXECUTION
if __name__ == "__main__":
    # End-to-end preprocessing pipeline. It reads BUCKET_NAME, TMX_GCS_PATH and
    # LOCAL_TMX_FILE from the configuration cell and uses `display`, which an earlier
    # cell imports from IPython.display. The result of interest for later cells is
    # `ground_truth_dfs`: {lang_code: DataFrame with `source_text` and `reference`}.

    # PART 1: LOAD TMX FILE
    print("==============================================")
    print("LOAD AND EXTRACT DATA FROM TMX FILE")
    print("==============================================")
    download_from_gcs(BUCKET_NAME, TMX_GCS_PATH, LOCAL_TMX_FILE)
    loader = TMXLoader(LOCAL_TMX_FILE)
    all_language_data = loader.extract_data()

    # PART 2: PREPROCESSING AND UPLOADING ALL LANGUAGES
    print("==============================================")
    print("PREPROCESS AND UPLOAD ALL LANGUAGES")
    print("==============================================")
    preprocessing_results = []
    if all_language_data:
        for lang_code, samples in all_language_data.items():
            result = preprocess_and_upload_language(lang_code, samples)
            if result:
                preprocessing_results.append(result)

        if preprocessing_results:
            print("\n Preprocessing Summary:")
            prep_df = pd.DataFrame(preprocessing_results)
            prep_df = prep_df[['lang_code', 'train_count', 'eval_count']]
            prep_df.columns = ['Language', 'Training Samples', 'Evaluation Samples']
            prep_df['Total'] = prep_df['Training Samples'] + prep_df['Evaluation Samples']
            print(prep_df[['Language', 'Total']].to_string(index=False))
        else:
            print("\n No languages had sufficient data for preprocessing.")
    else:
        print("\n No data found in TMX file to process.")

    # PART 3: CREATE GROUND-TRUTH TABLES FOR ALL LANGUAGES
    print("==============================================")
    print("PART 3: CREATE GROUND-TRUTH TABLES FROM EVALUATION DATA")
    print("==============================================")
    ground_truth_dfs = {} # Initialize outside the conditional
    if not preprocessing_results:
        print("\n No preprocessing results found. Skipping ground-truth table creation.")
    else:
        print(f"Found {len(preprocessing_results)} languages. Starting table creation...")
        storage_client = storage.Client()

        # Must mirror the prompt layout written by create_tuning_data_from_samples:
        # "English: <source> <lang_code>: "
        prompt_prefix = "English: "

        for lang_info in preprocessing_results:
            lang_code = lang_info['lang_code']
            gcs_uri = lang_info['eval_uri']
            prompt_suffix = f" {lang_code}: "
            print(f"\n--- Building table for {lang_code} ---")

            try:
                # Download and parse data from GCS
                bucket_name, blob_name = gcs_uri.replace("gs://", "").split("/", 1)
                bucket = storage_client.bucket(bucket_name)
                blob = bucket.blob(blob_name)
                content = blob.download_as_text()
                # Skip blank lines so an empty file or a trailing newline cannot
                # break JSON parsing.
                eval_data = [json.loads(line) for line in content.split('\n') if line.strip()]

                # Process records to extract source and reference texts
                prompts = []
                references = []
                for item in eval_data:
                    full_prompt_str = item['contents'][0]['parts'][0]['text']
                    # Strip only the leading/trailing wrapper. A global str.replace
                    # would also delete "English: " or " <lang_code>: " when those
                    # sequences occur inside the sentence itself.
                    english_text = full_prompt_str
                    if english_text.startswith(prompt_prefix):
                        english_text = english_text[len(prompt_prefix):]
                    if english_text.endswith(prompt_suffix):
                        english_text = english_text[:-len(prompt_suffix)]
                    prompts.append(english_text.strip())

                    reference_translation = item['contents'][1]['parts'][0]['text']
                    references.append(reference_translation)

                # Create the DataFrame for the current language
                df = pd.DataFrame({
                    "source_text": prompts,
                    "reference": references,
                })

                # Store the new DataFrame in our main dictionary
                ground_truth_dfs[lang_code] = df
                print(f" Ground truth DataFrame for {lang_code} created successfully.")

            except Exception as e:
                # Best effort: report the failure and carry on with the next language.
                print(f" Failed to process data for {lang_code}. Error: {e}")
                continue # Skip to the next language

    # PART 4: DISPLAY RESULTS
    print("Preprocessing finished")
    if ground_truth_dfs:
        print(f"Created {len(ground_truth_dfs)} tables, stored in the 'ground_truth_dfs' dictionary")

        # Display the top few rows of each created table
        for lang_code, df in ground_truth_dfs.items():
            print(f"\n--- Table for: {lang_code} ---")
            display(df.head())
    else:
        print("No ground-truth tables were created.")


###### *Translations and Evaluation*

In [None]:
# @title Run Translation and Evaluation for Multiple Models

import pandas as pd
import re
from tqdm.auto import tqdm
from IPython.display import display, Markdown
from vertexai.evaluation import EvalTask
from vertexai.evaluation.metrics import pointwise_metric

# Models to compare: every model ID listed here is run against every language table,
# so the number of translation calls grows with len(MODELS_TO_COMPARE) x rows x languages.
MODELS_TO_COMPARE = [
    "gemini-2.5-pro",
    "gemini-2.5-flash",
    "gemini-2.5-flash-lite"
]

# Metrics passed to EvalTask for each (model, language) pair.
# NOTE(review): the metrics do not share one scale or direction (MetricX is presumably
# an error score where lower is better) - confirm before ranking models on them.
EVALUATION_METRICS = [
    "bleu",
    pointwise_metric.Comet(version="COMET_22_SRC_REF"),
    pointwise_metric.MetricX(version="METRICX_24_SRC"),
]

# Helper dictionary that turns a language code into a natural-language name for the
# LLM prompt. The TMX file yields full locale codes (e.g. 'de-DE', 'es-LA'), so the
# map carries locale entries as well as bare language codes; with bare codes only,
# no lookup ever matched and the prompt fell back to the upper-cased code.
language_name_map = {
    'de': 'German', 'fr': 'French', 'es': 'Spanish',
    'it': 'Italian', 'ja': 'Japanese', 'ru': 'Russian',
    'pt': 'Portuguese',
    # Locale-specific entries (region kept in the name where it changes the target variety)
    'de-DE': 'German',
    'fr-FR': 'French',
    'es-LA': 'Latin American Spanish',
    'pt-BR': 'Brazilian Portuguese',
}

# Register tqdm with pandas; this is what makes `Series.progress_apply`
# (used in the translation loop below) available.
tqdm.pandas()

# Translation function
#   english_text: The source text in English.
#   target_language: The target language name (e.g., 'German').
#   model_id: The specific model ID to use for the translation.
def get_translation(english_text: str, target_language: str, model_id: str):
    """Translate one English text with the given model through the shared `client`.

    Args:
        english_text: The source text in English.
        target_language: Target language as it should appear in the prompt.
        model_id: Model ID passed to ``client.models.generate_content``.

    Returns:
        The stripped response text, or the sentinel string ``"TRANSLATION_ERROR"``
        if anything raised while calling the model or reading its response.
    """
    instruction = (
        f"Translate the following English text to {target_language}. "
        f"Provide only the translated text and nothing else: '{english_text}'"
    )
    try:
        reply = client.models.generate_content(model=model_id, contents=instruction)
        # Kept inside the try: a reply without text raises here and is reported below.
        translated = reply.text.strip()
    except Exception as e:
        print(f" Error translating '{english_text[:30]}...' with {model_id}: {e}")
        translated = "TRANSLATION_ERROR"
    return translated

# Translate and evaluate
# Result layout: {model_id: {lang_code: EvalResult}}
all_model_eval_results = {}

# Check if ground truth data exists before starting
if 'ground_truth_dfs' not in locals() or not ground_truth_dfs:
    print(" 'ground_truth_dfs' dictionary not found or is empty. Run TMX preprocessing first")
else:
    # Outer loop for each model
    for model_id in MODELS_TO_COMPARE:
        print("\n" + "="*80)
        print(f" PROCESSING MODEL: {model_id}")
        print("="*80)

        current_model_evals = {}

        # Inner loop for each language
        for lang_code, df in ground_truth_dfs.items():
            # Resolve a readable language name. Codes from the TMX file are usually
            # full locales ('de-DE'), so fall back to the base language ('de') before
            # giving up and using the upper-cased code.
            base_code = re.split(r'[-_]', lang_code)[0].lower()
            if lang_code in language_name_map:
                target_language_name = language_name_map[lang_code]
                prompt_language = target_language_name
            elif base_code in language_name_map:
                target_language_name = language_name_map[base_code]
                # Keep the locale in the prompt so the regional variant is not lost.
                prompt_language = f"{target_language_name} ({lang_code})"
            else:
                target_language_name = lang_code.upper()
                prompt_language = target_language_name
            prediction_col_name = f'prediction_{model_id}' # Create unique column name

            print(f"\n---  Translating to {target_language_name} ({lang_code}) using {model_id} ---")

            # Translation (generate predictions and store in the new dynamic column).
            # The column is added to the shared table on purpose: the final cell section
            # displays all models' predictions side by side.
            df[prediction_col_name] = df['source_text'].progress_apply(
                get_translation,
                target_language=prompt_language,
                model_id=model_id # Pass current model_id to function
            )
            print(f"   Translations for {lang_code} complete.")

            # Evaluation
            print(f"--- 📊 Evaluating translations for {lang_code} from {model_id} ---")
            # Rows where get_translation returned its error sentinel are not real
            # translations; scoring them would silently drag the metrics down.
            failed_mask = df[prediction_col_name] == "TRANSLATION_ERROR"
            if failed_mask.any():
                print(f"   {int(failed_mask.sum())} translation(s) failed for {lang_code}; excluding them from evaluation.")

            # NOTE(review): the source text is passed as `content`; the Vertex AI
            # translation-metric docs appear to name this column `source` - confirm
            # COMET/MetricX actually receive the source text.
            eval_df_prepared = (
                df.loc[~failed_mask, ['source_text', 'reference', prediction_col_name]]
                .rename(columns={
                    'source_text': 'content',
                    prediction_col_name: 'response', # Use dynamic prediction column
                })
                .reset_index(drop=True)
            )

            if eval_df_prepared.empty:
                print(f"   No successful translations for {lang_code}; skipping evaluation.")
                continue

            # Experiment names are restricted to lowercase letters, digits and dashes.
            safe_lang_code = re.sub(r'[^a-z0-9-]', '', lang_code.lower())
            experiment_name = f"translation-eval-{safe_lang_code}-{model_id.replace('.', '-')}"

            try:
                eval_task = EvalTask(
                    dataset=eval_df_prepared,
                    metrics=EVALUATION_METRICS,
                    experiment=experiment_name
                )
                eval_result = eval_task.evaluate()
                current_model_evals[lang_code] = eval_result
                print(f"   Evaluation for {lang_code} complete.")

            except Exception as e:
                # Best effort: one failed evaluation must not stop the remaining runs.
                print(f"   An error occurred during evaluation for {lang_code}: {e}")
                continue

        # Store evaluation results for current model
        all_model_eval_results[model_id] = current_model_evals

# Summary table
print("\n" + "="*80)
print(" Evaluation Summary of all Translations")
print("="*80)

if not all_model_eval_results:
    print("\nNo evaluations were successfully completed.")
else:
    # Flatten {model_id: {lang_code: result}} into one row per (model, language).
    # A new dict is built per row so the metrics dict owned by the evaluation
    # result is not modified.
    summary_data = [
        {**result.summary_metrics, 'model_id': model_id, 'language': lang_code}
        for model_id, lang_results in all_model_eval_results.items()
        for lang_code, result in lang_results.items()
    ]

    # Create and display final comparison df
    summary_df = pd.DataFrame(summary_data)
    if not summary_df.empty:
        # Reorder columns for better readability. Only columns that are actually
        # present are selected, so a missing metric cannot raise a KeyError and
        # hide the metrics that were computed.
        preferred_cols = [
            'model_id', 'language', 'row_count',
            'bleu/mean', 'comet/mean', 'metricx/mean',
            'bleu/std', 'comet/std', 'metricx/std',
        ]
        cols = [col for col in preferred_cols if col in summary_df.columns]
        summary_df = summary_df[cols]

        # Sort by language, then best BLEU first (when BLEU is available).
        sort_keys = [key for key in ['language', 'bleu/mean'] if key in cols]
        sort_ascending = [key == 'language' for key in sort_keys]

        print("\nComparative Results Across All Models and Languages:")
        display(summary_df.sort_values(by=sort_keys, ascending=sort_ascending))
    else:
        print("\nSummary DataFrame is empty. No results to display.")


# Show the first rows of every per-language table; by now each table carries one
# prediction column per model next to the source text and the reference.
print(
    "\n" + "="*80,
    " Show table with all translations and groundtruths",
    "="*80,
    sep="\n",
)
if 'ground_truth_dfs' not in locals() or not ground_truth_dfs:
    print("\nNo DataFrames to display.")
else:
    for lang_code, df in ground_truth_dfs.items():
        print(f"\n--- Updated Table for: {lang_code} ---")
        display(df.head())


 PROCESSING MODEL: gemini-2.5-pro

---  Translating to ES-LA (es-LA) using gemini-2.5-pro ---


  0%|          | 0/23 [00:00<?, ?it/s]

   Translations for es-LA complete.
--- 📊 Evaluating translations for es-LA from gemini-2.5-pro ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 69 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 69/69 [00:10<00:00,  6.41it/s]
INFO:vertexai.evaluation._evaluation:All 69 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:10.779656649000117 seconds


   Evaluation for es-LA complete.

---  Translating to PT-BR (pt-BR) using gemini-2.5-pro ---


  0%|          | 0/24 [00:00<?, ?it/s]

   Translations for pt-BR complete.
--- 📊 Evaluating translations for pt-BR from gemini-2.5-pro ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 72 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 72/72 [00:11<00:00,  6.39it/s]
INFO:vertexai.evaluation._evaluation:All 72 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:11.289213021000023 seconds


   Evaluation for pt-BR complete.

---  Translating to FR-FR (fr-FR) using gemini-2.5-pro ---


  0%|          | 0/22 [00:00<?, ?it/s]

   Translations for fr-FR complete.
--- 📊 Evaluating translations for fr-FR from gemini-2.5-pro ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 66 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 66/66 [00:13<00:00,  5.07it/s]
INFO:vertexai.evaluation._evaluation:All 66 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:13.045028352000372 seconds


   Evaluation for fr-FR complete.

---  Translating to DE-DE (de-DE) using gemini-2.5-pro ---


  0%|          | 0/24 [00:00<?, ?it/s]

   Translations for de-DE complete.
--- 📊 Evaluating translations for de-DE from gemini-2.5-pro ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 72 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 72/72 [00:11<00:00,  6.49it/s]
INFO:vertexai.evaluation._evaluation:All 72 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:11.10720225399973 seconds


   Evaluation for de-DE complete.

 PROCESSING MODEL: gemini-2.5-flash

---  Translating to ES-LA (es-LA) using gemini-2.5-flash ---


  0%|          | 0/23 [00:00<?, ?it/s]

   Translations for es-LA complete.
--- 📊 Evaluating translations for es-LA from gemini-2.5-flash ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 69 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 69/69 [00:13<00:00,  5.12it/s]
INFO:vertexai.evaluation._evaluation:All 69 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:13.487954268000067 seconds


   Evaluation for es-LA complete.

---  Translating to PT-BR (pt-BR) using gemini-2.5-flash ---


  0%|          | 0/24 [00:00<?, ?it/s]

   Translations for pt-BR complete.
--- 📊 Evaluating translations for pt-BR from gemini-2.5-flash ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 72 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 72/72 [00:14<00:00,  5.12it/s]
INFO:vertexai.evaluation._evaluation:All 72 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:14.068098491 seconds


   Evaluation for pt-BR complete.

---  Translating to FR-FR (fr-FR) using gemini-2.5-flash ---


  0%|          | 0/22 [00:00<?, ?it/s]

   Translations for fr-FR complete.
--- 📊 Evaluating translations for fr-FR from gemini-2.5-flash ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 66 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 66/66 [00:10<00:00,  6.31it/s]
INFO:vertexai.evaluation._evaluation:All 66 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:10.476824360000137 seconds


   Evaluation for fr-FR complete.

---  Translating to DE-DE (de-DE) using gemini-2.5-flash ---


  0%|          | 0/24 [00:00<?, ?it/s]

   Translations for de-DE complete.
--- 📊 Evaluating translations for de-DE from gemini-2.5-flash ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 72 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 72/72 [00:11<00:00,  6.10it/s]
INFO:vertexai.evaluation._evaluation:All 72 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:11.812744988000304 seconds


   Evaluation for de-DE complete.

 PROCESSING MODEL: gemini-2.5-flash-lite

---  Translating to ES-LA (es-LA) using gemini-2.5-flash-lite ---


  0%|          | 0/23 [00:00<?, ?it/s]

   Translations for es-LA complete.
--- 📊 Evaluating translations for es-LA from gemini-2.5-flash-lite ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 69 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 69/69 [00:11<00:00,  5.91it/s]
INFO:vertexai.evaluation._evaluation:All 69 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:11.682412756000303 seconds


   Evaluation for es-LA complete.

---  Translating to PT-BR (pt-BR) using gemini-2.5-flash-lite ---


  0%|          | 0/24 [00:00<?, ?it/s]

   Translations for pt-BR complete.
--- 📊 Evaluating translations for pt-BR from gemini-2.5-flash-lite ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 72 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 72/72 [00:11<00:00,  6.52it/s]
INFO:vertexai.evaluation._evaluation:All 72 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:11.049716374000127 seconds


   Evaluation for pt-BR complete.

---  Translating to FR-FR (fr-FR) using gemini-2.5-flash-lite ---


  0%|          | 0/22 [00:00<?, ?it/s]

   Translations for fr-FR complete.
--- 📊 Evaluating translations for fr-FR from gemini-2.5-flash-lite ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 66 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 66/66 [00:13<00:00,  5.00it/s]
INFO:vertexai.evaluation._evaluation:All 66 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:13.226636766999945 seconds


   Evaluation for fr-FR complete.

---  Translating to DE-DE (de-DE) using gemini-2.5-flash-lite ---


  0%|          | 0/24 [00:00<?, ?it/s]

   Translations for de-DE complete.
--- 📊 Evaluating translations for de-DE from gemini-2.5-flash-lite ---


INFO:vertexai.evaluation._evaluation:Computing metrics with a total of 72 Vertex Gen AI Evaluation Service API requests.
100%|██████████| 72/72 [00:11<00:00,  6.52it/s]
INFO:vertexai.evaluation._evaluation:All 72 metric requests are successfully computed.
INFO:vertexai.evaluation._evaluation:Evaluation Took:11.063164114999836 seconds


   Evaluation for de-DE complete.

 Evaluation Summary of all Translations

Comparative Results Across All Models and Languages:


Unnamed: 0,model_id,language,row_count,bleu/mean,comet/mean,metricx/mean,bleu/std,comet/std,metricx/std
11,gemini-2.5-flash-lite,de-DE,24,0.409719,0.779779,4.465307,0.277426,0.126604,6.00875
7,gemini-2.5-flash,de-DE,24,0.378595,0.781043,4.300259,0.233443,0.140287,5.774682
3,gemini-2.5-pro,de-DE,24,0.364834,0.773558,4.038591,0.229436,0.129498,4.880899
0,gemini-2.5-pro,es-LA,23,0.424663,0.807851,11.214301,0.272162,0.117504,8.690299
4,gemini-2.5-flash,es-LA,23,0.386401,0.78079,10.575122,0.272113,0.128758,7.906113
8,gemini-2.5-flash-lite,es-LA,23,0.369435,0.751988,12.386021,0.247048,0.132669,8.3217
10,gemini-2.5-flash-lite,fr-FR,22,0.432038,0.787586,11.437713,0.286472,0.139202,7.181651
2,gemini-2.5-pro,fr-FR,22,0.400389,0.800953,8.623495,0.284463,0.115791,6.825564
6,gemini-2.5-flash,fr-FR,22,0.363257,0.781015,9.958714,0.236781,0.113815,7.549825
1,gemini-2.5-pro,pt-BR,24,0.435284,0.831895,13.155427,0.268252,0.139897,8.83974



 Show table with all translations and groundtruths

--- Updated Table for: es-LA ---


Unnamed: 0,source_text,reference,prediction_gemini-2.5-pro,prediction_gemini-2.5-flash,prediction_gemini-2.5-flash-lite
0,"About to say oh thank you, I was tell.","Pero, bueno, gracias.","Estaba por decir ""oh, gracias"", te lo juro.","Estaba a punto de decir 'oh, gracias', eso iba...","A punto de decir oh gracias, estaba contando."
1,They adapt to the fact that they it’s better f...,Se adaptan al hecho de que es mejor para la su...,Se adaptan al hecho de que es mejor para la su...,Se adaptan al hecho de que es mejor para la su...,Se adaptan al hecho de que es mejor para la su...
2,"What is up, Daddy Gang?","¿Cómo están, seguidores de Daddy?","¿Qué onda, Daddy Gang?","¿Qué onda, Daddy Gang?","¿Qué hay, Papá Gang?"
3,"OK, so I was, I had just taken the LSAT. I wan...","Bueno, acababa de rendir el examen LSAT. Querí...","Bueno, entonces, acababa de presentar el LSAT....","Bueno, entonces, acababa de tomar el LSAT. Que...","OK, entonces yo estaba, acababa de hacer el ex..."
4,"Of course, you know, it’s not me saying this, ...","Por supuesto, no soy yo quien dice esto. Hay u...","Claro, ya sabes, no lo digo yo, sino que una f...","Claro, sabes, no lo digo yo, sino que una cita...","Por supuesto, ya sabes, no lo digo yo, sino un..."



--- Updated Table for: pt-BR ---


Unnamed: 0,source_text,reference,prediction_gemini-2.5-pro,prediction_gemini-2.5-flash,prediction_gemini-2.5-flash-lite
0,Is it important?,Isso é importante?,É importante?,É importante?,É importante?
1,But I didn't know we had such an impact.,Mas eu não sabia que nosso impacto era tão gra...,Mas eu não sabia que nós tínhamos tanto impacto.,Mas eu não sabia que tínhamos tanto impacto.,Mas eu não sabia que tínhamos tal impacto.
2,"You know, the fact is that if sleep wasn’t imp...","Sabe, o fato é que, se o sono não fosse import...","Sabe, o fato é que, se o sono não fosse import...","Você sabe, o fato é que, se o sono não fosse i...","Sabe, o fato é que se o sono não fosse importa..."
3,"It is your founding father, Alex Cooper with.",Sou Alex Cooper e você está ouvindo,"É o seu pai fundador, Alex Cooper com.","É o seu pai fundador, Alex Cooper, com.","É seu pai fundador, Alex Cooper com."
4,"OK, so I was, I had just taken the LSAT. I wan...","Tá, eu tinha acabado de fazer o LSAT. Eu queri...","Certo, então, eu tinha acabado de prestar o LS...","OK, então eu tinha acabado de fazer o LSAT. Eu...","OK, então eu tinha acabado de fazer o LSAT. Qu..."



--- Updated Table for: fr-FR ---


Unnamed: 0,source_text,reference,prediction_gemini-2.5-pro,prediction_gemini-2.5-flash,prediction_gemini-2.5-flash-lite
0,But that whole natural selection thing when it...,Tout ce processus de sélection naturelle pour ...,Mais toute cette histoire de sélection naturel...,Mais toute cette histoire de sélection naturel...,Mais cette histoire de sélection naturelle che...
1,"I'm not kidding you. I was, and I think the en...","Sans mentir, je l'étais, mais je crois que le ...","Je ne plaisante pas. J'étais, et je pense que ...","Je ne plaisante pas. J'étais, et je pense que ...","Je ne te fais pas de plaisanterie. J'étais, et..."
2,"Something like that, yeah.","Quelque chose comme ça, oui.","Quelque chose comme ça, ouais.","Quelque chose comme ça, ouais.","Quelque chose comme ça, oui."
3,Does my e-mail de dunking on YouTube or on Twi...,C'est mon adresse sur YouTube et sur Twitter.,Est-ce que mon e-mail se dissocie de YouTube o...,Est-ce que mon e-mail de clash est pour YouTub...,"Mon e-mail, c'est pour dunker sur YouTube ou s..."
4,This show was essentially the original X Facto...,C'était essentiellement le précurseur de X Fac...,Cette émission était essentiellement le premie...,"Cette émission, c'était un peu le ""X Factor"" a...",Ce spectacle était essentiellement l'X Factor ...



--- Updated Table for: de-DE ---


Unnamed: 0,source_text,reference,prediction_gemini-2.5-pro,prediction_gemini-2.5-flash,prediction_gemini-2.5-flash-lite
0,"And he was like, I’ll just take thousands of y...","Und er meinte so: „Ach, das dauerte Tausende v...","Und er meinte so, er nehme sich einfach Tausen...","Und er so: ""Ich nehm' mir einfach Tausende von...",Und er sagte so etwas wie: „Ich nehme einfach ...
1,Very nice to meet you by the way.,"Ich freue mich übrigens sehr, Sie kennenzulernen.","Übrigens, sehr schön Sie kennenzulernen.","Übrigens, sehr schön, Sie kennenzulernen.","Sehr nett, Sie kennenzulernen, übrigens."
2,"Of course, you know, it’s not me saying this, ...","Natürlich ist das nicht nur etwas, was ich beh...","Wissen Sie, das sage natürlich nicht ich, sond...","Natürlich, wissen Sie, das sage nicht ich, abe...","Natürlich, Sie wissen, das sage nicht ich, son..."
3,It’s a really fascinating thing that seeds do ...,"Es ist wirklich faszinierend, dass sich Samen ...","Es ist wirklich faszinierend, dass sich Samen ...","Es ist eine wirklich faszinierende Sache, dass...","Es ist eine wirklich faszinierende Sache, dass..."
4,"It is your founding father, Alex Cooper with.","Hier ist euer Gründervater, Alex Cooper mit","Es ist euer Gründungsvater, Alex Cooper.","Das ist euer Gründervater, Alex Cooper, dabei.","Ihr Gründervater, Alex Cooper, mit dem."
