In [None]:
!pip install nltk rouge-score matplotlib seaborn

In [None]:
# Add this cell at the beginning of your notebook

!pip install sacrebleu bert-score torchmetrics nltk rouge-score datasets transformers groq pandas tqdm matplotlib seaborn

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

# Optional: report the installed version of each dependency.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API, so this check does not depend on setuptools being present.

from importlib import metadata

packages = ['sacrebleu', 'bert_score', 'torchmetrics', 'nltk', 'rouge_score',
            'datasets', 'transformers', 'groq', 'pandas', 'tqdm']

for package in packages:
    try:
        version = metadata.version(package)
        print(f"{package}: {version}")
    except metadata.PackageNotFoundError:
        # Keep going so that one missing package does not hide the others.
        print(f"{package}: Not installed")

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from bert_score import BERTScorer
from datasets import load_dataset
from groq import Groq
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU, CHRF, TER
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from tqdm import tqdm

# Suppress all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

# Initialize the Groq client. The key is read from the GROQ_API_KEY environment
# variable so that it never has to be saved inside the notebook; the
# placeholder is only a fallback and is not a usable key.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", "YOUR_API_KEY"))

# Models to evaluate, keyed by the model id passed to the Groq API.
# "provider" is used for labelling results; "context_length" is read by
# translate_text to bound the size of the input it sends.
MODELS = {
    "gemma2-9b-it": {"provider": "Google", "context_length": 8192},
    "gemma-7b-it": {"provider": "Google", "context_length": 8192},
    "llama3-groq-70b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama3-groq-8b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama-3.1-70b-versatile": {"provider": "Meta", "context_length": 8192},
    "llama-3.1-8b-instant": {"provider": "Meta", "context_length": 8192},
    "mixtral-8x7b-32768": {"provider": "Mistral", "context_length": 32768},
    "llama-3.2-90b-vision-preview": {"provider": "Meta", "context_length": 128000}
}

def load_translation_data(language_pair, num_samples=20):
    """Return the first ``num_samples`` examples of a WMT19 language pair.

    The validation split is preferred. If loading it raises ``ValueError``,
    the train split is loaded instead. When the split holds fewer than
    ``num_samples`` examples, all of them are returned.
    """
    def _fetch(split_name):
        return load_dataset("wmt19", language_pair, split=split_name)

    try:
        corpus = _fetch("validation")
    except ValueError:
        # No usable validation split for this pair: fall back to train.
        corpus = _fetch("train")

    # Never ask for more rows than the split actually contains.
    sample_count = min(num_samples, len(corpus))
    return corpus.select(range(sample_count))

def translate_text(text, model_name, source_lang, target_lang):
    """Translate ``text`` using the specified model via the Groq API.

    Args:
        text: Source-language text to translate.
        model_name: Key into ``MODELS``; also sent to the API as the model id.
        source_lang: Language code of ``text`` (a key of ``language_names``).
        target_lang: Language code to translate into (a key of ``language_names``).

    Returns:
        The content of the first completion choice, stripped of surrounding
        whitespace.

    Raises:
        KeyError: If a language code is not in ``language_names`` or
            ``model_name`` is not in ``MODELS``.
    """
    language_names = {
    'cs': 'Czech',
    'en': 'English',
    'de': 'German',
    'fi': 'Finnish',
    'fr': 'French',
    'gu': 'Gujarati',
    'kk': 'Kazakh',
    'lt': 'Lithuanian',
    'ru': 'Russian',
    'zh': 'Chinese'
    }

    source_lang_name = language_names[source_lang]
    target_lang_name = language_names[target_lang]

    # Get model's max context length
    max_length = MODELS[model_name]["context_length"]

    # Truncate the input BEFORE it is interpolated into the prompt; doing it
    # afterwards would leave the full text in the request. The limit is applied
    # to characters, used here as a rough stand-in for the token budget.
    # Clamp at zero so a small context length never yields a negative slice.
    safe_length = max(max_length - 500, 0)  # Reserve room for prompt and response
    if len(text) > safe_length:
        text = text[:safe_length] + "..."

    prompt = f"""Translate the following {source_lang_name} text to {target_lang_name}:
    {text}
    Translation:"""

    chat_completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
    )
    return chat_completion.choices[0].message.content.strip()

def calculate_metrics(references, hypotheses):
    """Score a list of hypotheses against their references.

    Args:
        references: Reference translations, one string per example.
        hypotheses: System translations, aligned with ``references``.

    Returns:
        A dict with the keys BLEU, chrF, TER, BERTScore, WER, CER, ROUGE-1,
        ROUGE-2 and ROUGE-L. BERTScore is the mean F1; each ROUGE entry is the
        F-measure averaged over the examples.

    NOTE(review): BERTScore is configured with ``lang="en"``, so values for a
    non-English target language should be read with care.
    """
    # sacrebleu expects a list of reference streams; there is exactly one here.
    reference_streams = [references]
    bleu_score = BLEU().corpus_score(hypotheses, reference_streams).score
    chrf_score = CHRF().corpus_score(hypotheses, reference_streams).score
    ter_score = TER().corpus_score(hypotheses, reference_streams).score

    # BERTScore: only the F1 component is reported.
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    _, _, f1_values = scorer.score(hypotheses, references)
    bert_f1 = f1_values.mean().item()

    # Word- and character-level error rates (predictions first, targets second).
    wer_score = WordErrorRate()(hypotheses, references).item()
    cer_score = CharErrorRate()(hypotheses, references).item()

    # ROUGE is computed per example and averaged afterwards.
    rouge_keys = ('rouge1', 'rouge2', 'rougeL')
    rouge = rouge_scorer.RougeScorer(list(rouge_keys), use_stemmer=True)
    totals = dict.fromkeys(rouge_keys, 0)
    for hypothesis, reference in zip(hypotheses, references):
        pair_scores = rouge.score(reference, hypothesis)
        for key in rouge_keys:
            totals[key] += pair_scores[key].fmeasure

    example_count = len(hypotheses)
    averages = {key: total / example_count for key, total in totals.items()}

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_f1,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": averages['rouge1'],
        "ROUGE-2": averages['rouge2'],
        "ROUGE-L": averages['rougeL'],
    }

def evaluate_models(dataset, source_lang, target_lang):
    """Translate ``dataset`` with every model in ``MODELS`` and score the output.

    Each example is expected to carry a ``'translation'`` mapping keyed by
    language code. Examples whose translation call raises are reported and
    left out. For each model with at least one successful translation, the
    source/reference/translation triples are written to
    ``translations_<model>_<src>-<tgt>.csv``.

    Returns:
        A DataFrame with one row per evaluated model (labelled
        ``"<model> (<provider>)"``) and one column per metric.
    """
    metrics_by_model = {}

    for model_name, model_info in MODELS.items():
        provider = model_info['provider']
        print(f"\nEvaluating {model_name} ({provider})...")

        # One (source, reference, translation) triple per successful call.
        completed = []
        for example in tqdm(dataset):
            pair = example['translation']
            source_text, reference = pair[source_lang], pair[target_lang]

            try:
                translation = translate_text(source_text, model_name, source_lang, target_lang)
            except Exception as e:
                # Best effort: report the failure and move on to the next example.
                print(f"Error with {model_name} on text: {str(e)}")
            else:
                completed.append((source_text, reference, translation))

        if not completed:
            # Nothing was translated, so there is nothing to score or save.
            continue

        source_texts, references, translations = (list(column) for column in zip(*completed))

        metrics_by_model[f"{model_name} ({provider})"] = calculate_metrics(references, translations)

        # Keep the raw texts next to the scores for manual review.
        review_frame = pd.DataFrame({
            'Source': source_texts,
            'Reference': references,
            'Translation': translations,
        })
        review_frame.to_csv(f'translations_{model_name}_{source_lang}-{target_lang}.csv', index=False)

    return pd.DataFrame(metrics_by_model).T

def visualize_results(results, pair_name):
    """Save a heatmap of the evaluation results for one language pair.

    Args:
        results: DataFrame of metric values, one row per model and one column
            per metric.
        pair_name: Label used in the figure title and the output file name.

    Returns:
        None. The figure is written to ``mt_evaluation_heatmap_<pair_name>.png``;
        nothing is written when ``results`` is empty.
    """
    # An empty frame (e.g. every translation call failed) has nothing to draw,
    # so skip it instead of failing the whole run.
    if results.empty:
        print(f"No results to visualize for {pair_name}; skipping heatmap.")
        return

    # Explicit figure/axes keep this plot independent of pyplot's global state.
    fig, ax = plt.subplots(figsize=(20, 10))
    sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f', ax=ax)
    ax.set_title(f'Translation Metrics Comparison for {pair_name}')
    ax.set_ylabel('Models')
    ax.set_xlabel('Metrics')
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()
    fig.savefig(f'mt_evaluation_heatmap_{pair_name}.png')
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

def main(language_pairs=None, num_samples=10000):
    """Evaluate every model on every language pair and save all artifacts.

    For each pair this loads the data, evaluates the models, writes
    ``mt_evaluation_results_<pair_code>.csv``, saves a per-pair heatmap and
    prints the results. Finally a combined heatmap figure is written to
    ``combined_results.png``.

    Args:
        language_pairs: Optional list of ``(pair_code, pair_name)`` tuples,
            where ``pair_code`` has the form ``"<source>-<target>"``. Defaults
            to the nine pairs listed below.
        num_samples: Maximum number of examples to load per language pair.
    """
    if language_pairs is None:
        # Language pairs to evaluate
        language_pairs = [
            ("cs-en", "Czech-English"),
            ("de-en", "German-English"),
            ("fi-en", "Finnish-English"),
            ("fr-de", "French-German"),
            ("gu-en", "Gujarati-English"),
            ("kk-en", "Kazakh-English"),
            ("lt-en", "Lithuanian-English"),
            ("ru-en", "Russian-English"),
            ("zh-en", "Chinese-English"),
        ]

    all_results = {}

    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")

        # Load dataset
        dataset = load_translation_data(pair_code, num_samples=num_samples)

        # Get source and target language codes
        source_lang, target_lang = pair_code.split("-")

        # Evaluate models
        results = evaluate_models(dataset, source_lang, target_lang)

        # Save results
        results.to_csv(f"mt_evaluation_results_{pair_code}.csv")

        # Create visualizations; an empty frame has nothing to draw.
        if results.empty:
            print(f"No results to visualize for {pair_name}.")
        else:
            visualize_results(results, pair_name)

        # Store results
        all_results[pair_name] = results

        # Print results
        print(f"\nResults for {pair_name}:")
        print(results)

    # Create combined visualization from the pairs that produced results.
    plottable = [(name, frame) for name, frame in all_results.items() if not frame.empty]
    if not plottable:
        print("No results available; skipping combined visualization.")
        return

    # Size the grid from the number of pairs: a fixed 1x2 grid cannot hold
    # more than two heatmaps. Two columns keeps the original side-by-side look.
    n_cols = 2
    n_rows = (len(plottable) + n_cols - 1) // n_cols
    fig = plt.figure(figsize=(25, 12 * n_rows))
    for position, (pair_name, results) in enumerate(plottable, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f', ax=ax)
        ax.set_title(f'Results for {pair_name}')
        ax.set_ylabel('Models')
        ax.set_xlabel('Metrics')
        plt.setp(ax.get_xticklabels(), rotation=45)
        plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()
    fig.savefig('combined_results.png')
    plt.close(fig)

# Entry point: run the full evaluation when this code executes as the main module.
if __name__ == "__main__":
    main()