<a href="https://colab.research.google.com/github/deltorobarba/sciences/blob/master/ai_evaluations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Vertex AI Evaluations**

In [None]:
# Manual configuration: set the project ID, the GCS bucket name and the TMX
# file name below before running the rest of the notebook.

PROJECT_ID = "YOUR-PROJECT-ID"  # <--- UPDATE THIS
LOCATION = "us-central1"

BUCKET_NAME = "translations-eval"  # <--- UPDATE THIS
BUCKET_URI = "gs://" + BUCKET_NAME  # derived from BUCKET_NAME; do not edit

TMX_GCS_PATH = "samples.tmx"  # <--- UPLOAD THIS
LOCAL_TMX_FILE = "samples.tmx"

In [None]:
# Install the Vertex AI SDK, the Cloud Storage client and the plotting /
# language-detection packages imported further down.
%pip install --upgrade google-cloud-aiplatform google-cloud-storage -q
%pip install matplotlib seaborn langdetect -q
# NOTE(review): google-cloud-aiplatform was already upgraded above; this line
# installs it again with the [evaluation] extra and --user (presumably for the
# vertexai.evaluation imports used later) - confirm both installs are needed.
%pip install --upgrade --user --quiet google-cloud-aiplatform[evaluation]

# NOTE(review): the pinned 2.0.1 install is immediately followed by an
# --upgrade of the same package, so the pin does not persist - confirm whether
# the first of these two lines is still needed.
!pip install google-cloud-translate==2.0.1 -q
!pip install --upgrade google-cloud-translate -q

import xml.etree.ElementTree as ET
import json
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from google.cloud import aiplatform, storage
from google.cloud import translate_v3 as translate
import vertexai
from vertexai.tuning import sft
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
import re
from typing import Dict, List, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# For language detection
from langdetect import detect, detect_langs

In [None]:
# Initialize Vertex AI: pass the notebook's PROJECT_ID and LOCATION to the
# SDK once, before any of the Vertex AI calls made in later cells.
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Utility Functions
def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Fetch one object from a GCS bucket into a local file.

    Args:
        bucket_name: Name of the bucket that holds the object.
        source_blob_name: Name of the object (blob) inside the bucket.
        destination_file_name: Local path the object is written to.

    Returns:
        ``destination_file_name``, unchanged, once the download has finished.
    """
    print(f"--- Downloading {source_blob_name} ---")
    # A fresh client bound to the notebook's PROJECT_ID is created per call.
    remote_object = (
        storage.Client(project=PROJECT_ID)
        .bucket(bucket_name)
        .blob(source_blob_name)
    )
    remote_object.download_to_filename(destination_file_name)
    print(f"Successfully downloaded to {destination_file_name}")
    return destination_file_name

def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Copy a local file into a GCS bucket and report where it landed.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Local path of the file to upload.
        destination_blob_name: Name the object (blob) gets inside the bucket.

    Returns:
        The ``gs://<bucket_name>/<destination_blob_name>`` URI of the upload.
    """
    print(f"--- Uploading {source_file_name} ---")
    # A fresh client bound to the notebook's PROJECT_ID is created per call.
    target_bucket = storage.Client(project=PROJECT_ID).bucket(bucket_name)
    target_bucket.blob(destination_blob_name).upload_from_filename(source_file_name)
    uploaded_uri = f"gs://{bucket_name}/{destination_blob_name}"
    print(f"Successfully uploaded to {uploaded_uri}")
    return uploaded_uri

In [None]:
# @title Vertex AI Evaluations
# ============================================================================
import pandas as pd
import re
from vertexai.evaluation import EvalTask
from vertexai.evaluation.metrics import pointwise_metric

# Prefer the optional `notebook_utils` helper for rendering results; when it
# cannot be imported, bind the same name to a stand-in that simply prints.
try:
    import notebook_utils
except ImportError:
    print("Warning: 'notebook_utils' module not found. Result display might be basic.")

    class notebook_utils:
        """Fallback exposing the single helper this notebook calls."""

        @staticmethod
        def display_eval_result(result):
            """Print the evaluation result object as-is."""
            print(result)

# Metrics requested for every language: BLEU plus the model-based COMET and
# MetricX translation metrics.
metrics = [
    "bleu",
    pointwise_metric.Comet(version="COMET_22_SRC_REF"),
    pointwise_metric.MetricX(version="METRICX_24_SRC"),
]

# Maps language code -> evaluation result. Only languages whose evaluation
# finished without raising are recorded here.
all_eval_results = {}

# Check if df dictionary exists
if 'ground_truth_dfs' not in locals() or not ground_truth_dfs:
    print("\n 'ground_truth_dfs' dictionary not found or is empty. Please re-run the previous steps first.")
else:
    for lang_code, df in ground_truth_dfs.items():
        print(f"\n\n--- Starting Evaluation for: {lang_code.upper()} ---")

        # rename() returns a new DataFrame, so the caller's frame is untouched;
        # errors='ignore' tolerates frames that lack some of these columns.
        eval_df_prepared = df.rename(columns={
            'source_text': 'content',    # Original (English) text
            'prediction': 'response',    # Model's translated output
            'reference_text': 'reference' # Human-translated reference text
        }, errors='ignore')

        # The Vertex AI translation metrics (COMET, MetricX) are documented to
        # read the original text from a 'source' column. Mirror 'content' into
        # 'source' when it is absent so the source-aware metric versions
        # requested above receive the source sentence; 'content' is kept so
        # anything relying on the previous column name keeps working.
        # NOTE(review): confirm the column name against the SDK version in use.
        if 'source' not in eval_df_prepared.columns and 'content' in eval_df_prepared.columns:
            eval_df_prepared['source'] = eval_df_prepared['content']

        if 'reference' not in eval_df_prepared.columns:
            print(f"    Skipping {lang_code}: The required 'reference' column was not found.")
            print("      Please ensure your DataFrame has a 'reference_text' column.")
            continue

        # Keep only lowercase letters, digits and hyphens in the experiment name.
        safe_lang_code = re.sub(r'[^a-z0-9-]', '', lang_code.lower())
        experiment_name = f"translation-eval-{safe_lang_code}"
        print(f"   Vertex AI Experiment Name: {experiment_name}")

        # Deliberately broad: a failure for one language is reported and the
        # loop moves on to the next language instead of aborting the cell.
        try:
            print("   Initializing evaluation task...")
            eval_task = EvalTask(
                dataset=eval_df_prepared,
                metrics=metrics,
                experiment=experiment_name
            )

            print("   Running evaluation... (This can take several minutes per language)")
            eval_result = eval_task.evaluate()
            all_eval_results[lang_code] = eval_result
            print(f"  Evaluation for {lang_code} complete.")

            print(f"\n--- Results for {lang_code} ---")
            notebook_utils.display_eval_result(eval_result)

        except Exception as e:
            print(f"  An error occurred during evaluation for {lang_code}: {e}")
            print("   Skipping to the next language.")

# --- Final Summary ---
# Prints a legend for the metric columns, then displays one row of aggregate
# metrics per language stored in all_eval_results.
print("\n" + "="*80)
print("AUTOMATIC EVALUATIONS SUMMARY")
print("="*80)

if not all_eval_results:
    print("\nNo evaluations were successfully completed.")
else:
    print("\n")
    print("* row_count is number of sentences evaluated (more sentences make the results more reliable)")
    print("* bleu/mean measures word and phrase overlap with a human reference (precision). On a scale of 0 to 1, a score above 0.4 is usable and above 0.6 is high quality.")
    print("* bleu/std measures consistency of BLEU score. A low value is good (stable quality); a high value is bad (erratic quality).")
    print("* comet/mean measures semantic similarity (meaning) using an AI model. A score above 0.8 is good; below 0.6 suggests errors.")
    print("* comet/std measures consistency of semantic quality. A low value is good (reliable meaning); a high value is bad (unreliable meaning).")
    # MetricX predicts an error score, so lower values mean better
    # translations; the previous legend described it as higher-is-better.
    print("* metricx/mean is a predicted translation-error score from an advanced AI model. On a scale of 0 to 25, LOWER is better: 0 means no detected errors and 25 is the worst.")
    print("* metricx/std measures consistency of semantic quality. A low value is good (consistently accurate); a high value is bad (unpredictable).")
    print("\n ")
    # Build each row from a copy so that tagging it with its language does not
    # mutate the summary_metrics dict owned by the stored evaluation result.
    summary_data = [
        {**result.summary_metrics, 'language': lang_code}
        for lang_code, result in all_eval_results.items()
    ]

    summary_df = pd.DataFrame(summary_data)
    # Show the language column first, followed by the metric columns.
    cols = ['language'] + [col for col in summary_df.columns if col != 'language']
    summary_df = summary_df[cols]

    display(summary_df)