In [None]:
import pandas as pd
import os
import json
from tqdm.auto import tqdm
import time
import requests

# --- Configuration ---
# Input CSV with the entity pairs, and the CSV the evaluation is written to.
CSV_FILE_PATH = "/content/drive/MyDrive/THESIS/translation-check.csv"
OUTPUT_CSV_PATH = (
    "/content/drive/MyDrive/THESIS/entity_translation_evaluation_mistral.csv"
)
# Fraction of the loaded rows that is sampled for evaluation (0.20 == 20%).
SAMPLE_PERCENTAGE = 0.20
# Column holding the Chinese entity.
SOURCE_COLUMN = "e.canonical_text"
# Column holding the English translation.
TARGET_COLUMN = "e.entity_en"

# --- Mistral API Configuration ---
# Model identifier and chat-completions endpoint used for every request.
MISTRAL_MODEL_NAME = "mistral-large-latest"
MISTRAL_API_BASE_URL = "https://api.mistral.ai/v1/chat/completions"

# --- LLM API Call Function ---
# Keys of the four numeric scores requested from the model, in report order.
_SCORE_KEYS = (
    "accuracy_score",
    "fluency_score",
    "conciseness_score",
    "standard_terminology_score",
)


def _failed_evaluation(message: str) -> dict:
    """Return the all-zero evaluation that marks a sample as not scored."""
    failed = {key: 0 for key in _SCORE_KEYS}
    failed["comments"] = message
    return failed


def _coerce_score(value) -> int:
    """Return *value* as an integer score in 1-5, or 0 when it is not one."""
    # bool is an int subclass; True must not be counted as a score of 1.
    if isinstance(value, bool):
        return 0
    try:
        score = int(value)
    except (TypeError, ValueError, OverflowError):
        return 0
    return score if 1 <= score <= 5 else 0


def call_mistral_api(source_text: str, target_text: str, api_key: str) -> dict:
    """
    Ask the Mistral chat-completions API to rate one Chinese-to-English translation.

    Args:
        source_text: The Chinese entity/term.
        target_text: The English translation to evaluate.
        api_key: Mistral API key, sent as a Bearer token.

    Returns:
        A dictionary with the keys `accuracy_score`, `fluency_score`,
        `conciseness_score`, `standard_terminology_score` (integers in 1-5)
        and `comments` (string). A score the model omitted, or returned as
        something that is not an integer in 1-5, is reported as 0. When the
        key is missing, the request fails, or the reply cannot be parsed as a
        JSON object, every score is 0 and `comments` describes the error.

    This function does not raise for API problems; failures are printed and
    reported through the returned dictionary.
    """
    if not api_key:
        return _failed_evaluation("Error: MISTRAL_API_KEY was not provided.")

    prompt_messages = [{
        "role": "user",
        "content": f"""
        You are an expert linguist specializing in evaluating machine translations of medical/scientific entities and terms from Chinese to English.
        Your task is to evaluate the English translation of a Chinese entity/term based on the following four criteria:

        1.  **Accuracy (1-5):** Does the English term precisely and correctly convey the core meaning of the Chinese entity?
        2.  **Fluency (1-5):** Is the English term grammatically correct, natural-sounding, and easy to read?
        3.  **Conciseness (1-5):** Is the English term brief and to-the-point?
        4.  **Standard Terminology (1-5):** Does the English term use widely accepted terminology within the medical/scientific domain?

        Provide your evaluation in a JSON format with the following keys:
        -   `accuracy_score`: integer (1-5)
        -   `fluency_score`: integer (1-5)
        -   `conciseness_score`: integer (1-5)
        -   `standard_terminology_score`: integer (1-5)
        -   `comments`: string (optional, brief explanation for scores)

        ---
        Chinese Entity: {source_text}
        English Translation: {target_text}
        ---

        JSON Evaluation:
        """
    }]

    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }

    payload = {
        "model": MISTRAL_MODEL_NAME,
        "messages": prompt_messages,
        "response_format": {"type": "json_object"},
        "temperature": 0.0
    }

    try:
        response = requests.post(MISTRAL_API_BASE_URL, headers=headers, json=payload, timeout=70)
        response.raise_for_status()
        llm_output = response.json()['choices'][0]['message']['content']
        parsed = json.loads(llm_output)
    except requests.exceptions.RequestException as e:
        print(f"Request Error calling Mistral API for '{source_text}': {e}")
        return _failed_evaluation(f"API Request Error: {e}")
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error from Mistral API for '{source_text}': {e}")
        return _failed_evaluation(f"JSON Parsing Error: {e}")
    except Exception as e:
        # Best-effort boundary: one malformed reply must not abort a long batch.
        print(f"Unexpected Error calling Mistral API for '{source_text}': {e}")
        return _failed_evaluation(f"Unexpected LLM API Error: {e}")

    # Valid JSON is not necessarily an object; callers rely on dict access.
    if not isinstance(parsed, dict):
        kind = type(parsed).__name__
        print(f"JSON Decode Error from Mistral API for '{source_text}': expected a JSON object, got {kind}")
        return _failed_evaluation(f"JSON Parsing Error: expected a JSON object, got {kind}")

    # Normalise the reply so every key is present and every score is numeric.
    evaluation = {key: _coerce_score(parsed.get(key)) for key in _SCORE_KEYS}
    comments = parsed.get("comments")
    evaluation["comments"] = "" if comments is None else str(comments)
    return evaluation

# --- Main Script Execution ---
if __name__ == "__main__":
    # Script flow: load the CSV, sample SAMPLE_PERCENTAGE of its rows, have the
    # LLM score each sampled translation, save the per-row scores to
    # OUTPUT_CSV_PATH and print the average of each score.

    # The API key is a secret and must not be committed with the notebook, so
    # it is read from the environment. In Colab, export the stored secret into
    # os.environ["MISTRAL_API_KEY"] in an earlier cell before running this one.
    MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "").strip()

    # Errors stop the run with `raise SystemExit(1)`: the interactive `exit()`
    # helper is not meant for programs and may not halt a notebook cell.
    if not MISTRAL_API_KEY:
        print("Error: MISTRAL_API_KEY environment variable not set.")
        print("Please add it as a secret in your Colab environment.")
        raise SystemExit(1)

    # --- Load Data ---
    if not os.path.exists(CSV_FILE_PATH):
        print(f"Error: CSV file not found at '{CSV_FILE_PATH}'.")
        print("Please ensure the path is correct and your Google Drive is mounted.")
        raise SystemExit(1)

    try:
        df = pd.read_csv(CSV_FILE_PATH)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        raise SystemExit(1) from e
    print(f"Loaded {len(df)} rows from '{CSV_FILE_PATH}'.")

    if SOURCE_COLUMN not in df.columns or TARGET_COLUMN not in df.columns:
        print(f"Error: CSV must contain '{SOURCE_COLUMN}' and '{TARGET_COLUMN}' columns.")
        print(f"Available columns are: {df.columns.to_list()}")
        raise SystemExit(1)

    # Rows with a missing source or target would be stringified to "nan" and
    # sent to the API as if they were real terms; drop them before sampling.
    missing_mask = df[[SOURCE_COLUMN, TARGET_COLUMN]].isna().any(axis=1)
    if missing_mask.any():
        print(f"Skipping {int(missing_mask.sum())} rows with a missing source or target value.")
        df = df[~missing_mask]

    # --- Sample Data ---
    if df.empty:
        print("No data to sample.")
        raise SystemExit(1)

    # Always evaluate at least one row, even for very small inputs.
    sample_size = max(1, int(len(df) * SAMPLE_PERCENTAGE))

    # Fixed seed so the same rows are selected on every run.
    sampled_df = df.sample(n=sample_size, random_state=42)
    print(f"Selected {len(sampled_df)} rows ({SAMPLE_PERCENTAGE*100:.0f}%) for evaluation.")

    # --- Evaluate Translations with LLM ---
    results = []
    for _, row in tqdm(sampled_df.iterrows(), total=len(sampled_df), desc="Evaluating translations"):
        source_text = str(row[SOURCE_COLUMN])
        target_text = str(row[TARGET_COLUMN])

        evaluation = call_mistral_api(source_text, target_text, MISTRAL_API_KEY)
        # Guard against a reply that is valid JSON but not an object.
        if not isinstance(evaluation, dict):
            evaluation = {'comments': f"Unexpected evaluation payload: {evaluation!r}"}

        results.append({
            SOURCE_COLUMN: source_text,
            TARGET_COLUMN: target_text,
            'LLM_Accuracy_Score': evaluation.get('accuracy_score', 0),
            'LLM_Fluency_Score': evaluation.get('fluency_score', 0),
            'LLM_Conciseness_Score': evaluation.get('conciseness_score', 0),
            'LLM_Standard_Terminology_Score': evaluation.get('standard_terminology_score', 0),
            'LLM_Comments': evaluation.get('comments', '')
        })
        time.sleep(0.5)  # Avoid hitting API rate limits

    results_df = pd.DataFrame(results)

    # The model may return a score as text or omit it; force the score columns
    # to numbers (unparseable -> 0) so the filter and the averages below work.
    score_columns = [
        'LLM_Accuracy_Score',
        'LLM_Fluency_Score',
        'LLM_Conciseness_Score',
        'LLM_Standard_Terminology_Score',
    ]
    results_df[score_columns] = (
        results_df[score_columns].apply(pd.to_numeric, errors="coerce").fillna(0)
    )

    # --- Save and Summarize Results ---
    results_df.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"\nEvaluation results saved to '{OUTPUT_CSV_PATH}'.")
    print("\n--- Evaluation Summary ---")
    # A score of 0 marks a failed evaluation; only rows where all four scores
    # are positive contribute to the averages.
    valid_results_df = results_df[(results_df[score_columns] > 0).all(axis=1)]

    if not valid_results_df.empty:
        avg_accuracy = valid_results_df['LLM_Accuracy_Score'].mean()
        avg_fluency = valid_results_df['LLM_Fluency_Score'].mean()
        avg_conciseness = valid_results_df['LLM_Conciseness_Score'].mean()
        avg_standard_terminology = valid_results_df['LLM_Standard_Terminology_Score'].mean()

        print(f"Average Accuracy Score (1-5): {avg_accuracy:.2f}")
        print(f"Average Fluency Score (1-5): {avg_fluency:.2f}")
        print(f"Average Conciseness Score (1-5): {avg_conciseness:.2f}")
        print(f"Average Standard Terminology Score (1-5): {avg_standard_terminology:.2f}")
    else:
        print("No valid LLM evaluation results to summarize.")

    error_count = len(results_df) - len(valid_results_df)
    if error_count > 0:
        print(f"\nWarning: {error_count} samples could not be evaluated due to API errors.")

Loaded 4920 rows from '/content/drive/MyDrive/THESIS/translation-check.csv'.
Selected 984 rows (20%) for evaluation.


Evaluating translations:   0%|          | 0/984 [00:00<?, ?it/s]

Request Error calling Mistral API for '中长效促泌剂': HTTPSConnectionPool(host='api.mistral.ai', port=443): Read timed out. (read timeout=70)

Evaluation results saved to '/content/drive/MyDrive/THESIS/entity_translation_evaluation_mistral.csv'.

--- Evaluation Summary ---
Average Accuracy Score (1-5): 4.84
Average Fluency Score (1-5): 4.93
Average Conciseness Score (1-5): 4.91
Average Standard Terminology Score (1-5): 4.78

