In [1]:
# Add this cell at the beginning of your notebook
# !pip install sacrebleu bert-score torchmetrics nltk rouge-score datasets transformers groq pandas tqdm matplotlib seaborn

In [2]:
import nltk
# Fetch the NLTK resources named 'wordnet' and 'omw-1.4' into the local
# nltk_data directory.
# NOTE(review): none of the visible cells reads these corpora (presumably they
# were meant for a METEOR-style metric) - confirm they are still needed.
nltk.download('wordnet')
nltk.download('omw-1.4')

# Optional: if you want to check installed versions
# import pkg_resources
# packages = ['sacrebleu', 'bert_score', 'torchmetrics', 'nltk', 'rouge_score', 
#            'datasets', 'transformers', 'groq', 'pandas', 'tqdm']
#            
# for package in packages:
#     try:
#         version = pkg_resources.get_distribution(package).version
#         print(f"{package}: {version}")
#     except pkg_resources.DistributionNotFound:
#         print(f"{package}: Not installed")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\subhe\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\subhe\AppData\Roaming\nltk_data...


True

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from groq import Groq
import torch
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

import os

# Silence every Python warning for the rest of the session so library chatter
# does not drown out the evaluation progress output.
warnings.filterwarnings('ignore')

# Build the Groq client. The key is read from the GROQ_API_KEY environment
# variable when it is set, so a real credential never has to be typed into the
# notebook; the original placeholder string remains the fallback.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", "YOUR API KEY"))

# Models to evaluate, keyed by the model id that is sent to the Groq API.
# "provider" is only used in display labels; "context_length" is read by
# translate_text to bound the size of the source text it sends.
MODELS = {
    "gemma2-9b-it": {"provider": "Google", "context_length": 8192},
    "gemma-7b-it": {"provider": "Google", "context_length": 8192},
    "llama3-groq-70b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama3-groq-8b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama-3.1-70b-versatile": {"provider": "Meta", "context_length": 8192},
    "llama-3.1-8b-instant": {"provider": "Meta", "context_length": 8192},
    "mixtral-8x7b-32768": {"provider": "Mistral", "context_length": 32768},
    "llama-3.2-90b-vision-preview": {"provider": "Meta", "context_length": 128000}
}

def load_translation_data(language_pair, num_samples=20):
    """Return the leading examples of a WMT19 language pair.

    The validation split is requested first; if that request raises
    ``ValueError`` the train split is loaded instead.

    Args:
        language_pair: WMT19 configuration name, e.g. ``"de-en"``.
        num_samples: Upper bound on the number of examples to keep.

    Returns:
        The loaded split restricted to its first
        ``min(num_samples, len(split))`` rows.
    """
    corpus_name = "wmt19"
    try:
        corpus = load_dataset(corpus_name, language_pair, split="validation")
    except ValueError:
        # No usable validation split: fall back to the training data.
        corpus = load_dataset(corpus_name, language_pair, split="train")

    keep = min(num_samples, len(corpus))
    return corpus.select(range(keep))

def translate_text(text, model_name, source_lang, target_lang):
    """Translate ``text`` with a Groq-hosted chat model.

    Args:
        text: Source-language text to translate.
        model_name: Key of ``MODELS``; also the model id sent to the API.
        source_lang: Language code of ``text``; must be a key of the
            ``language_names`` table below.
        target_lang: Language code to translate into; same constraint.

    Returns:
        The content of the first completion choice with surrounding
        whitespace stripped.

    Raises:
        KeyError: If a language code or ``model_name`` is unknown.
        Exception: Any error raised by the Groq client propagates unchanged.
    """
    language_names = {
        'cs': 'Czech',
        'en': 'English',
        'de': 'German',
        'fi': 'Finnish',
        'fr': 'French',
        'gu': 'Gujarati',
        'kk': 'Kazakh',
        'lt': 'Lithuanian',
        'ru': 'Russian',
        'zh': 'Chinese'
    }

    source_lang_name = language_names[source_lang]
    target_lang_name = language_names[target_lang]

    # Get model's max context length
    max_length = MODELS[model_name]["context_length"]

    # Truncate BEFORE the prompt is built. The prompt used to be formatted
    # first, so the shortened text was never actually sent to the model.
    # NOTE(review): len(text) counts characters while context_length is
    # presumably a token count, so this is only a rough bound.
    safe_length = max_length - 500  # Reserve room for prompt and response
    if len(text) > safe_length:
        text = text[:safe_length] + "..."

    prompt = f"""Translate the following {source_lang_name} text to {target_lang_name}:
    {text}
    Translation:"""

    chat_completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
    )
    return chat_completion.choices[0].message.content.strip()

def calculate_metrics(references, hypotheses, target_lang):
    """Score system translations against references with several MT metrics.

    Args:
        references: List of reference translations, one string per example.
        hypotheses: List of system translations aligned with ``references``.
        target_lang: Language code of the translations. Selects the BLEU
            tokenizer ("zh" for Chinese, "13a" otherwise) and the language
            passed to BERTScore.

    Returns:
        Dict with keys "BLEU", "chrF", "TER", "BERTScore", "WER", "CER",
        "ROUGE-1", "ROUGE-2" and "ROUGE-L". BLEU/chrF/TER are the ``.score``
        of sacreBLEU corpus scores; BERTScore is the mean F1; the ROUGE
        entries are the mean F-measure over all pairs.

    Raises:
        ValueError: If ``hypotheses`` is empty or the two lists differ in
            length.
    """
    if not hypotheses:
        raise ValueError("calculate_metrics needs at least one hypothesis")
    if len(references) != len(hypotheses):
        raise ValueError(
            f"got {len(references)} references for {len(hypotheses)} hypotheses"
        )

    bleu_tokenizer = "zh" if target_lang == "zh" else "13a"
    # sacreBLEU's keyword is `tokenize`; `tokenizer=` raises TypeError.
    bleu = BLEU(tokenize=bleu_tokenizer)
    chrf = CHRF()
    ter_metric = TER()
    # Score in the language of the translations rather than always English
    # (the fr-de pair has German output).
    # NOTE(review): rescale_with_baseline needs a baseline file shipped by
    # bert-score for this language - confirm for any newly added target.
    bert_scorer = BERTScorer(lang=target_lang, rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Corpus-level sacreBLEU metrics: hypotheses plus a list of reference streams
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score

    # BERTScore returns precision, recall and F1 tensors; only F1 is reported
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()

    # Word and character error rates (predictions first, targets second)
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()

    # ROUGE is computed per pair and averaged
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure

    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def evaluate_models(dataset, source_lang, target_lang):
    """Translate ``dataset`` with every model in ``MODELS`` and score the output.

    For each model, every example is sent to ``translate_text``; examples
    whose request raises are reported and skipped. Metrics are therefore
    computed only over the examples that were translated successfully, and a
    per-model CSV of source/reference/translation triples is written to the
    working directory.

    Args:
        dataset: Iterable of examples shaped like
            ``{"translation": {source_lang: ..., target_lang: ...}}``.
        source_lang: Language code of the source side.
        target_lang: Language code of the reference side.

    Returns:
        DataFrame with one row per model (labelled "<model> (<provider>)")
        and one column per metric. Models with no successful translation are
        omitted.
    """
    results = {}

    for model_name, model_info in MODELS.items():
        print(f"\nEvaluating {model_name} ({model_info['provider']})...")
        translations = []
        references = []
        source_texts = []

        for example in tqdm(dataset):
            source_text = example['translation'][source_lang]
            reference = example['translation'][target_lang]

            try:
                translation = translate_text(source_text, model_name, source_lang, target_lang)

                source_texts.append(source_text)
                translations.append(translation)
                references.append(reference)
            except Exception as e:
                # Best effort: report the failed request and move on.
                print(f"Error with {model_name} on text: {str(e)}")
                continue

        if translations:  # Only calculate metrics if we have translations
            # calculate_metrics requires the target language; omitting it
            # raised TypeError before any metric was produced.
            metrics = calculate_metrics(references, translations, target_lang)
            results[f"{model_name} ({model_info['provider']})"] = metrics

            # Save translations and source texts for review
            pd.DataFrame({
                'Source': source_texts,
                'Reference': references,
                'Translation': translations
            }).to_csv(f'translations_{model_name}_{source_lang}-{target_lang}.csv', index=False)

    return pd.DataFrame(results).T

def visualize_results(results, pair_name):
    """Render the metric table as an annotated heatmap and save it as a PNG.

    Args:
        results: DataFrame of scores (models as rows, metrics as columns).
        pair_name: Human-readable language-pair name; used in the title and
            in the output filename ``mt_evaluation_heatmap_<pair_name>.png``.

    Returns:
        None. The image is written to the working directory.
    """
    # Uses the module-level plt / sns imports; the function-local re-imports
    # were redundant.
    fig = plt.figure(figsize=(20, 10))
    try:
        sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
        plt.title(f'Translation Metrics Comparison for {pair_name}')
        plt.ylabel('Models')
        plt.xlabel('Metrics')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig(f'mt_evaluation_heatmap_{pair_name}.png')
    finally:
        # Release the figure even if plotting or saving raised, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

def main():
    """Run the evaluation for every configured WMT19 language pair.

    For each pair: load 100 examples, evaluate all models, write the metric
    table to ``mt_evaluation_results_<pair_code>.csv``, save a heatmap and
    print the table. Pairs for which no model produced a translation are
    reported and skipped.
    """
    # Language pairs to evaluate: (dataset config / "<src>-<tgt>", display name)
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]

    # Currently only consumed by the commented-out combined plot after this
    # function; kept so that code can be re-enabled.
    all_results = {}

    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")

        # Load dataset
        dataset = load_translation_data(pair_code, num_samples=100)

        # Get source and target language codes
        source_lang, target_lang = pair_code.split("-")

        # Evaluate models
        results = evaluate_models(dataset, source_lang, target_lang)

        if results.empty:
            # Every request failed for every model; an empty frame cannot be
            # drawn as a heatmap, so skip this pair instead of aborting the run.
            print(f"\nNo translations were produced for {pair_name}; skipping.")
            continue

        # Save results
        results.to_csv(f"mt_evaluation_results_{pair_code}.csv")

        # Create visualizations
        visualize_results(results, pair_name)

        # Store results
        all_results[pair_name] = results

        # Print results
        print(f"\nResults for {pair_name}:")
        print(results)
    
    # Create combined visualization
#     plt.figure(figsize=(25, 12))
#     for idx, (pair_name, results) in enumerate(all_results.items()):
#         plt.subplot(1, 2, idx+1)
#         sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
#         plt.title(f'Results for {pair_name}')
#         plt.ylabel('Models')
#         plt.xlabel('Metrics')
#         plt.xticks(rotation=45)
#         plt.yticks(rotation=0)
#     plt.tight_layout()
#     plt.savefig('combined_results.png')
#     plt.close()

# Run the full evaluation when this cell (or the exported script) is executed
# directly.
if __name__ == "__main__":
    main()


Evaluating Czech-English translations...


Downloading readme: 100%|██████████| 11.3k/11.3k [00:00<00:00, 5.65MB/s]
Downloading data: 100%|██████████| 195M/195M [01:10<00:00, 2.75MB/s] 
Downloading data:  48%|████▊     | 105M/216M [00:36<00:43, 2.55MB/s] 

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from groq import Groq
import torch
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

import os

# Silence every Python warning for the rest of the session so library chatter
# does not drown out the evaluation progress output.
warnings.filterwarnings('ignore')

# Build the Groq client. The key is read from the GROQ_API_KEY environment
# variable when it is set, so a real credential never has to be typed into the
# notebook; the original placeholder string remains the fallback.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", "YOUR API KEY"))

# Models to evaluate, keyed by the model id that is sent to the Groq API.
# "provider" is only used in display labels; "context_length" is read by
# translate_text to bound the size of the source text it sends.
MODELS = {
    "gemma2-9b-it": {"provider": "Google", "context_length": 8192},
    "gemma-7b-it": {"provider": "Google", "context_length": 8192},
    "llama3-groq-70b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama3-groq-8b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama-3.1-70b-versatile": {"provider": "Meta", "context_length": 8192},
    "llama-3.1-8b-instant": {"provider": "Meta", "context_length": 8192},
    "mixtral-8x7b-32768": {"provider": "Mistral", "context_length": 32768},
    "llama-3.2-90b-vision-preview": {"provider": "Meta", "context_length": 128000}
}

def load_translation_data(language_pair, num_samples=20):
    """Return the leading examples of a WMT18 language pair.

    The validation split is requested first; if that request raises
    ``ValueError`` the train split is loaded instead.

    Args:
        language_pair: WMT18 configuration name, e.g. ``"et-en"``.
        num_samples: Upper bound on the number of examples to keep.

    Returns:
        The loaded split restricted to its first
        ``min(num_samples, len(split))`` rows.
    """
    corpus_name = "wmt18"
    try:
        corpus = load_dataset(corpus_name, language_pair, split="validation")
    except ValueError:
        # No usable validation split: fall back to the training data.
        corpus = load_dataset(corpus_name, language_pair, split="train")

    keep = min(num_samples, len(corpus))
    return corpus.select(range(keep))

def translate_text(text, model_name, source_lang, target_lang):
    """Translate ``text`` with a Groq-hosted chat model.

    Args:
        text: Source-language text to translate.
        model_name: Key of ``MODELS``; also the model id sent to the API.
        source_lang: Language code of ``text``; must be a key of the
            ``language_names`` table below.
        target_lang: Language code to translate into; same constraint.

    Returns:
        The content of the first completion choice with surrounding
        whitespace stripped.

    Raises:
        KeyError: If a language code or ``model_name`` is unknown.
        Exception: Any error raised by the Groq client propagates unchanged.
    """
    language_names = {
        'en': 'English',
        'et': 'Estonian',
        'tr': 'Turkish'
    }

    source_lang_name = language_names[source_lang]
    target_lang_name = language_names[target_lang]

    # Get model's max context length
    max_length = MODELS[model_name]["context_length"]

    # Truncate BEFORE the prompt is built. The prompt used to be formatted
    # first, so the shortened text was never actually sent to the model.
    # NOTE(review): len(text) counts characters while context_length is
    # presumably a token count, so this is only a rough bound.
    safe_length = max_length - 500  # Reserve room for prompt and response
    if len(text) > safe_length:
        text = text[:safe_length] + "..."

    prompt = f"""Translate the following {source_lang_name} text to {target_lang_name}:
    {text}
    Translation:"""

    chat_completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,
    )
    return chat_completion.choices[0].message.content.strip()

def calculate_metrics(references, hypotheses):
    """Score system translations against references with several MT metrics.

    Args:
        references: List of reference translations, one string per example.
        hypotheses: List of system translations aligned with ``references``.

    Returns:
        Dict with keys "BLEU", "chrF", "TER", "BERTScore", "WER", "CER",
        "ROUGE-1", "ROUGE-2" and "ROUGE-L". BLEU/chrF/TER are the ``.score``
        of sacreBLEU corpus scores; BERTScore is the mean F1 (English model,
        baseline-rescaled); the ROUGE entries are mean F-measures.
    """
    # Build every scorer up front, exactly as configured before.
    bleu_metric = BLEU()
    chrf_metric = CHRF()
    ter_metric = TER()
    bert_metric = BERTScorer(lang="en", rescale_with_baseline=True)
    wer_metric = WordErrorRate()
    cer_metric = CharErrorRate()
    rouge_keys = ('rouge1', 'rouge2', 'rougeL')
    rouge_metric = rouge_scorer.RougeScorer(list(rouge_keys), use_stemmer=True)

    # sacreBLEU corpus metrics: hypotheses plus a list of reference streams.
    corpus_scores = {
        label: metric.corpus_score(hypotheses, [references]).score
        for label, metric in (("BLEU", bleu_metric), ("chrF", chrf_metric), ("TER", ter_metric))
    }

    # BERTScore yields (precision, recall, F1); only the mean F1 is kept.
    f1_tensor = bert_metric.score(hypotheses, references)[2]
    bert_f1 = f1_tensor.mean().item()

    # Error rates take predictions first, targets second.
    word_error = wer_metric(hypotheses, references).item()
    char_error = cer_metric(hypotheses, references).item()

    # Accumulate per-pair ROUGE F-measures, then average over the pairs.
    rouge_totals = dict.fromkeys(rouge_keys, 0)
    for hypothesis, reference in zip(hypotheses, references):
        pair_scores = rouge_metric.score(reference, hypothesis)
        for key in rouge_keys:
            rouge_totals[key] += pair_scores[key].fmeasure

    for key in rouge_totals:
        rouge_totals[key] /= len(hypotheses)

    return {
        "BLEU": corpus_scores["BLEU"],
        "chrF": corpus_scores["chrF"],
        "TER": corpus_scores["TER"],
        "BERTScore": bert_f1,
        "WER": word_error,
        "CER": char_error,
        "ROUGE-1": rouge_totals['rouge1'],
        "ROUGE-2": rouge_totals['rouge2'],
        "ROUGE-L": rouge_totals['rougeL']
    }

def evaluate_models(dataset, source_lang, target_lang):
    """Translate ``dataset`` with every model in ``MODELS`` and score the output.

    Examples whose translation request raises are reported and skipped, so
    each model is scored only on the examples it translated. A per-model CSV
    of source/reference/translation triples is written alongside.

    Args:
        dataset: Iterable of examples shaped like
            ``{"translation": {source_lang: ..., target_lang: ...}}``.
        source_lang: Language code of the source side.
        target_lang: Language code of the reference side.

    Returns:
        DataFrame with one row per model (labelled "<model> (<provider>)")
        and one column per metric; models with no successful translation are
        omitted.
    """
    scores_by_model = {}

    for model_name, model_info in MODELS.items():
        label = f"{model_name} ({model_info['provider']})"
        print(f"\nEvaluating {label}...")

        # (source, reference, translation) for every request that succeeded
        completed = []
        for example in tqdm(dataset):
            pair = example['translation']
            source_text = pair[source_lang]
            reference = pair[target_lang]

            try:
                translation = translate_text(source_text, model_name, source_lang, target_lang)
            except Exception as e:
                print(f"Error with {model_name} on text: {str(e)}")
                continue
            completed.append((source_text, reference, translation))

        if not completed:
            # Nothing to score or save for this model.
            continue

        source_texts = [row[0] for row in completed]
        references = [row[1] for row in completed]
        translations = [row[2] for row in completed]

        scores_by_model[label] = calculate_metrics(references, translations)

        # Keep the raw triples on disk for manual review.
        review_table = pd.DataFrame({
            'Source': source_texts,
            'Reference': references,
            'Translation': translations
        })
        review_table.to_csv(f'translations_{model_name}_{source_lang}-{target_lang}.csv', index=False)

    return pd.DataFrame(scores_by_model).T

def visualize_results(results, pair_name):
    """Render the metric table as an annotated heatmap and save it as a PNG.

    Args:
        results: DataFrame of scores (models as rows, metrics as columns).
        pair_name: Human-readable language-pair name; used in the title and
            in the output filename ``mt_evaluation_heatmap_<pair_name>.png``.

    Returns:
        None. The image is written to the working directory.
    """
    # Uses the module-level plt / sns imports; the function-local re-imports
    # were redundant.
    fig = plt.figure(figsize=(20, 10))
    try:
        sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
        plt.title(f'Translation Metrics Comparison for {pair_name}')
        plt.ylabel('Models')
        plt.xlabel('Metrics')
        plt.xticks(rotation=45)
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig(f'mt_evaluation_heatmap_{pair_name}.png')
    finally:
        # Release the figure even if plotting or saving raised, so repeated
        # calls do not accumulate open figures.
        plt.close(fig)

def main():
    """Run the evaluation for every configured WMT18 language pair.

    For each pair: load 100 examples, evaluate all models, write the metric
    table to ``mt_evaluation_results_<pair_code>.csv``, save a heatmap and
    print the table. Pairs for which no model produced a translation are
    reported and skipped.
    """
    # Language pairs to evaluate: (dataset config / "<src>-<tgt>", display name)
    language_pairs = [
        ("et-en", "Estonian-English"),
        ("tr-en", "Turkish-English"),
    ]

    # Currently only consumed by the commented-out combined plot after this
    # function; kept so that code can be re-enabled.
    all_results = {}

    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")

        # Load dataset
        dataset = load_translation_data(pair_code, num_samples=100)

        # Get source and target language codes
        source_lang, target_lang = pair_code.split("-")

        # Evaluate models
        results = evaluate_models(dataset, source_lang, target_lang)

        if results.empty:
            # Every request failed for every model; an empty frame cannot be
            # drawn as a heatmap, so skip this pair instead of aborting the run.
            print(f"\nNo translations were produced for {pair_name}; skipping.")
            continue

        # Save results
        results.to_csv(f"mt_evaluation_results_{pair_code}.csv")

        # Create visualizations
        visualize_results(results, pair_name)

        # Store results
        all_results[pair_name] = results

        # Print results
        print(f"\nResults for {pair_name}:")
        print(results)
    
    # Create combined visualization
#     plt.figure(figsize=(25, 12))
#     for idx, (pair_name, results) in enumerate(all_results.items()):
#         plt.subplot(1, 2, idx+1)
#         sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
#         plt.title(f'Results for {pair_name}')
#         plt.ylabel('Models')
#         plt.xlabel('Metrics')
#         plt.xticks(rotation=45)
#         plt.yticks(rotation=0)
#     plt.tight_layout()
#     plt.savefig('combined_results.png')
#     plt.close()

# Run the full evaluation when this cell (or the exported script) is executed
# directly.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm



Evaluating Estonian-English translations...


Downloading readme: 100%|██████████| 11.7k/11.7k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 145M/145M [01:04<00:00, 2.26MB/s] 
Downloading data: 100%|██████████| 138M/138M [01:24<00:00, 1.64MB/s] 
Downloading data: 100%|██████████| 310k/310k [00:01<00:00, 267kB/s]
Downloading data: 100%|██████████| 324k/324k [00:01<00:00, 248kB/s]
Generating train split: 100%|██████████| 2175873/2175873 [00:02<00:00, 805169.66 examples/s]
Generating validation split: 100%|██████████| 2000/2000 [00:00<00:00, 330507.39 examples/s]
Generating test split: 100%|██████████| 2000/2000 [00:00<00:00, 241663.06 examples/s]



Evaluating gemma2-9b-it (Google)...


100%|██████████| 100/100 [03:17<00:00,  1.98s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating gemma-7b-it (Google)...


 55%|█████▌    | 55/100 [01:34<03:06,  4.14s/it]

Error with gemma-7b-it on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


100%|██████████| 100/100 [03:36<00:00,  2.16s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-70b-8192-tool-use-preview (Groq)...


 23%|██▎       | 23/100 [00:54<17:28, 13.62s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 24%|██▍       | 24/100 [01:26<24:33, 19.39s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 25%|██▌       | 25/100 [01:59<29:11, 23.35s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 26%|██▌       | 26/100 [02:32<32:18, 26.19s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 27%|██▋       | 27/100 [03:05<34:15, 28.16s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 28%|██▊       | 28/100 [03:38<35:31, 29.60s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 29%|██▉       | 29/100 [04:10<36:12, 30.59s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 30%|███       | 30/100 [04:43<36:22, 31.19s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 31%|███       | 31/100 [05:16<36:27, 31.70s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 32%|███▏      | 32/100 [05:48<36:13, 31.96s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 33%|███▎      | 33/100 [06:21<36:00, 32.25s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 34%|███▍      | 34/100 [06:54<35:33, 32.33s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Request timed out.


 68%|██████▊   | 68/100 [07:52<02:06,  3.94s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 69%|██████▉   | 69/100 [07:53<01:28,  2.84s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 70%|███████   | 70/100 [07:53<01:02,  2.08s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 71%|███████   | 71/100 [07:56<01:04,  2.22s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 72%|███████▏  | 72/100 [07:58<01:04,  2.32s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 73%|███████▎  | 73/100 [08:01<01:04,  2.39s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 74%|███████▍  | 74/100 [08:03<01:03,  2.44s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 75%|███████▌  | 75/100 [08:06<01:01,  2.47s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 76%|███████▌  | 76/100 [08:08<00:59,  2.49s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 77%|███████▋  | 77/100 [08:11<00:57,  2.51s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 78%|███████▊  | 78/100 [08:13<00:55,  2.52s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 79%|███████▉  | 79/100 [08:16<00:53,  2.53s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 80%|████████  | 80/100 [08:18<00:50,  2.54s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 81%|████████  | 81/100 [08:21<00:48,  2.54s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 82%|████████▏ | 82/100 [08:24<00:45,  2.54s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 83%|████████▎ | 83/100 [08:26<00:43,  2.55s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 84%|████████▍ | 84/100 [08:29<00:40,  2.55s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 85%|████████▌ | 85/100 [08:31<00:38,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 86%|████████▌ | 86/100 [08:34<00:35,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 87%|████████▋ | 87/100 [08:36<00:33,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 88%|████████▊ | 88/100 [08:39<00:30,  2.55s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 89%|████████▉ | 89/100 [08:41<00:28,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 90%|█████████ | 90/100 [08:44<00:25,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 91%|█████████ | 91/100 [08:47<00:23,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 92%|█████████▏| 92/100 [08:49<00:20,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 93%|█████████▎| 93/100 [08:52<00:17,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 94%|█████████▍| 94/100 [08:54<00:15,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 95%|█████████▌| 95/100 [08:57<00:12,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 96%|█████████▌| 96/100 [08:59<00:10,  2.55s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 97%|█████████▋| 97/100 [09:02<00:07,  2.55s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 98%|█████████▊| 98/100 [09:05<00:05,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


 99%|█████████▉| 99/100 [09:07<00:02,  2.56s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


100%|██████████| 100/100 [09:10<00:00,  5.50s/it]

Error with llama3-groq-70b-8192-tool-use-preview on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-8b-8192-tool-use-preview (Groq)...


100%|██████████| 100/100 [03:14<00:00,  1.94s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-70b-versatile (Meta)...


100%|██████████| 100/100 [03:21<00:00,  2.01s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-8b-instant (Meta)...


100%|██████████| 100/100 [03:15<00:00,  1.95s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating mixtral-8x7b-32768 (Mistral)...


100%|██████████| 100/100 [03:19<00:00,  2.00s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.2-90b-vision-preview (Meta)...


100%|██████████| 100/100 [06:48<00:00,  4.08s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Estonian-English:
                                                   BLEU       chrF  \
gemma2-9b-it (Google)                         22.692077  54.284388   
gemma-7b-it (Google)                          14.305919  39.535810   
llama3-groq-70b-8192-tool-use-preview (Groq)  31.177287  55.382670   
llama3-groq-8b-8192-tool-use-preview (Groq)   20.639136  48.178823   
llama-3.1-70b-versatile (Meta)                33.637959  58.032144   
llama-3.1-8b-instant (Meta)                   20.354680  50.746537   
mixtral-8x7b-32768 (Mistral)                   8.271410  37.085844   
llama-3.2-90b-vision-preview (Meta)           33.042196  57.815897   

                                                     TER  BERTScore       WER  \
gemma2-9b-it (Google)                         109.499683   0.552166  1.129196   
gemma-7b-it (Google)                           91.326861   0.454517  0.936570   
llama3-groq-70b-8192-tool-use-preview (Groq)   56.453423   0.738808  0.604938   
llama3-groq-8b

Downloading data: 100%|██████████| 36.8M/36.8M [00:14<00:00, 2.48MB/s]
Downloading data: 100%|██████████| 469k/469k [00:01<00:00, 466kB/s]
Downloading data: 100%|██████████| 493k/493k [00:01<00:00, 485kB/s]
Generating train split: 100%|██████████| 205756/205756 [00:00<00:00, 748609.02 examples/s]
Generating validation split: 100%|██████████| 3007/3007 [00:00<00:00, 376823.19 examples/s]
Generating test split: 100%|██████████| 3000/3000 [00:00<00:00, 397426.23 examples/s]



Evaluating gemma2-9b-it (Google)...


100%|██████████| 100/100 [03:20<00:00,  2.00s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating gemma-7b-it (Google)...


 94%|█████████▍| 94/100 [03:30<00:25,  4.17s/it]

Error with gemma-7b-it on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


100%|██████████| 100/100 [03:40<00:00,  2.21s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-70b-8192-tool-use-preview (Groq)...


100%|██████████| 100/100 [03:22<00:00,  2.02s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-8b-8192-tool-use-preview (Groq)...


100%|██████████| 100/100 [03:13<00:00,  1.93s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-70b-versatile (Meta)...


100%|██████████| 100/100 [03:22<00:00,  2.03s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-8b-instant (Meta)...


100%|██████████| 100/100 [03:15<00:00,  1.96s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating mixtral-8x7b-32768 (Mistral)...


100%|██████████| 100/100 [03:16<00:00,  1.96s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.2-90b-vision-preview (Meta)...


100%|██████████| 100/100 [06:48<00:00,  4.08s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Turkish-English:
                                                   BLEU       chrF  \
gemma2-9b-it (Google)                         16.070297  52.063664   
gemma-7b-it (Google)                          15.506035  46.181584   
llama3-groq-70b-8192-tool-use-preview (Groq)  25.592725  55.765616   
llama3-groq-8b-8192-tool-use-preview (Groq)   15.444460  49.193648   
llama-3.1-70b-versatile (Meta)                31.228095  59.341961   
llama-3.1-8b-instant (Meta)                   22.177494  54.300592   
mixtral-8x7b-32768 (Mistral)                  11.467470  45.508898   
llama-3.2-90b-vision-preview (Meta)           30.780356  59.038161   

                                                     TER  BERTScore       WER  \
gemma2-9b-it (Google)                         131.019037   0.501492  1.372900   
gemma-7b-it (Google)                           82.797518   0.563855  0.887197   
llama3-groq-70b-8192-tool-use-preview (Groq)   68.924972   0.680156  0.748600   
llama3-groq-8b-

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from bert_score import BERTScorer
from datasets import load_dataset
from groq import Groq
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU, CHRF, TER
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from tqdm import tqdm

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Initialize the Groq client. The key is taken from the GROQ_API_KEY environment
# variable so a real credential never has to be written into the notebook; the
# original placeholder string is kept as the fallback when the variable is unset.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", "YOUR API KEY"))

# Models to evaluate, keyed by the model identifier sent to the API.
#   provider       -> shown next to the model name in logs and result tables
#   context_length -> used by translate_text to cap the length of the input text
MODELS = {
    "gemma2-9b-it": {"provider": "Google", "context_length": 8192},
    "gemma-7b-it": {"provider": "Google", "context_length": 8192},
    "llama3-groq-70b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama3-groq-8b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
    "llama-3.1-70b-versatile": {"provider": "Meta", "context_length": 8192},
    "llama-3.1-8b-instant": {"provider": "Meta", "context_length": 8192},
    "mixtral-8x7b-32768": {"provider": "Mistral", "context_length": 32768},
    "llama-3.2-90b-vision-preview": {"provider": "Meta", "context_length": 128000}
}

def load_translation_data(language_pair, num_samples=20):
    """Return the first ``num_samples`` rows of the BanglaNMT corpus.

    The validation split is preferred; if requesting it raises ``ValueError``
    the train split is loaded instead.

    Args:
        language_pair: Pair code supplied by the caller. It is accepted for
            interface compatibility but not consulted here: the same corpus is
            loaded whatever its value.
        num_samples: Upper bound on the number of rows returned; the result is
            shorter when the split itself has fewer rows.

    Returns:
        The dataset restricted to its leading ``min(num_samples, len(split))`` rows.
    """
    corpus_id = "csebuetnlp/BanglaNMT"

    try:
        corpus = load_dataset(corpus_id, split="validation")
    except ValueError:
        # No usable validation split: fall back to the training data.
        corpus = load_dataset(corpus_id, split="train")

    keep = min(num_samples, len(corpus))
    return corpus.select(range(keep))

def translate_text(text, model_name, source_lang, target_lang):
    """Translate ``text`` with one of the configured models via the Groq chat API.

    Args:
        text: Source text to translate.
        model_name: Key of ``MODELS``; also passed as the model identifier.
        source_lang: Language code of ``text`` (``'en'`` or ``'bn'``).
        target_lang: Language code to translate into (``'en'`` or ``'bn'``).

    Returns:
        The message content of the first completion choice, with surrounding
        whitespace stripped.

    Raises:
        KeyError: If a language code or ``model_name`` is not configured.
        Exception: Anything raised by the Groq client propagates unchanged.
    """
    language_names = {
        'en': 'English',
        'bn': 'Bengali'
    }

    source_lang_name = language_names[source_lang]
    target_lang_name = language_names[target_lang]

    # Cap the input length BEFORE it is interpolated into the prompt. Doing this
    # after the f-string is built (as before) left the request untouched.
    # NOTE(review): context_length looks like a token budget but is compared with
    # a character count here, so this is only a rough cap; confirm that is acceptable.
    max_length = MODELS[model_name]["context_length"]
    safe_length = max_length - 500  # Reserve room for the prompt wrapper and the response
    if len(text) > safe_length:
        text = text[:safe_length] + "..."

    prompt = f"""Translate the following {source_lang_name} text to {target_lang_name}:
    {text}
    Translation:"""

    chat_completion = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.1,  # low temperature keeps the output close to deterministic
    )
    return chat_completion.choices[0].message.content.strip()

# Cache for the BERTScore scorer: building it loads a large pretrained model, so
# it is created once and reused instead of being rebuilt on every call.
_BERT_SCORER_CACHE = {}


def _get_bert_scorer():
    """Return the shared BERTScorer, constructing it on first use."""
    if "en" not in _BERT_SCORER_CACHE:
        _BERT_SCORER_CACHE["en"] = BERTScorer(lang="en", rescale_with_baseline=True)
    return _BERT_SCORER_CACHE["en"]


def calculate_metrics(references, hypotheses):
    """Score system translations against reference translations.

    Args:
        references: Reference translations, one string per segment.
        hypotheses: System translations, aligned one-to-one with ``references``.

    Returns:
        dict with the keys ``BLEU``, ``chrF``, ``TER`` (sacrebleu corpus scores),
        ``BERTScore`` (mean F1), ``WER``, ``CER`` (torchmetrics) and
        ``ROUGE-1``, ``ROUGE-2``, ``ROUGE-L`` (F-measure averaged over segments).

    Raises:
        ValueError: If the inputs are empty or differ in length.
    """
    # Fail early with a clear message instead of a late error from one of the
    # metric libraries or a division by zero in the ROUGE average.
    pair_count = len(hypotheses)
    if pair_count != len(references):
        raise ValueError(
            f"references ({len(references)}) and hypotheses ({pair_count}) must have the same length"
        )
    if pair_count == 0:
        raise ValueError("calculate_metrics needs at least one hypothesis/reference pair")

    # Initialize metrics
    bleu = BLEU()
    chrf = CHRF()
    ter_metric = TER()
    # NOTE(review): the scorer is always built with lang="en", also when the
    # hypotheses are in another language; confirm this is intended.
    bert_scorer = _get_bert_scorer()
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Corpus-level scores; sacrebleu expects a list of reference streams.
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score

    # BERTScore: only the F1 component is reported.
    _, _, f1 = bert_scorer.score(hypotheses, references)
    bert_score = f1.mean().item()

    # Word and character error rates (predictions first, targets second).
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()

    # ROUGE is computed per segment and averaged.
    rouge_totals = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        for key in rouge_totals:
            rouge_totals[key] += scores[key].fmeasure
    rouge_scores = {key: total / pair_count for key, total in rouge_totals.items()}

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def evaluate_models(dataset, source_lang, target_lang):
    """Translate ``dataset`` with every configured model and score the output.

    For each entry of ``MODELS`` every example is translated from the
    ``source_lang`` column to ``target_lang``. Examples whose translation call
    raises are reported on stdout and left out. When at least one example
    succeeded, the metrics are computed and the source/reference/translation
    triples are written to ``translations_<model>_<src>-<tgt>.csv``.

    Args:
        dataset: Iterable of examples indexable by language code.
        source_lang: Column holding the text to translate.
        target_lang: Column holding the reference translation.

    Returns:
        DataFrame with one row per ``"<model> (<provider>)"`` label and one
        column per metric. Models with no successful translation are absent.
    """
    scores_by_model = {}

    for model_name, model_info in MODELS.items():
        label = f"{model_name} ({model_info['provider']})"
        print(f"\nEvaluating {label}...")

        # One (source, reference, translation) triple per successful example.
        rows = []
        for example in tqdm(dataset):
            source_text = example[source_lang]
            reference = example[target_lang]

            try:
                translation = translate_text(source_text, model_name, source_lang, target_lang)
            except Exception as e:
                # Best effort: report the failure and move on to the next example.
                print(f"Error with {model_name} on text: {str(e)}")
            else:
                rows.append((source_text, reference, translation))

        if not rows:
            # Nothing was translated for this model, so there is nothing to score.
            continue

        source_texts, references, translations = (list(column) for column in zip(*rows))

        scores_by_model[label] = calculate_metrics(references, translations)

        # Keep the raw texts next to the scores for manual review.
        review_table = pd.DataFrame({
            'Source': source_texts,
            'Reference': references,
            'Translation': translations
        })
        review_table.to_csv(f'translations_{model_name}_{source_lang}-{target_lang}.csv', index=False)

    return pd.DataFrame(scores_by_model).T

def visualize_results(results, pair_name):
    """Render the metric table as an annotated heatmap and save it as a PNG.

    Args:
        results: Table with one row per model and one column per metric.
        pair_name: Human-readable language pair; used in the plot title and in
            the output file name ``mt_evaluation_heatmap_<pair_name>.png``.

    Returns:
        None. The figure is written to disk and then closed.
    """
    fig = plt.figure(figsize=(20, 10))

    # seaborn draws on the current axes and hands that axes object back.
    ax = sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
    ax.set_title(f'Translation Metrics Comparison for {pair_name}')
    ax.set_ylabel('Models')
    ax.set_xlabel('Metrics')

    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()

    fig.savefig(f'mt_evaluation_heatmap_{pair_name}.png')
    plt.close(fig)

def main():
    """Run the evaluation for both translation directions.

    For each language pair: load 100 samples, evaluate every configured model,
    write the metric table to ``mt_evaluation_results_<pair>.csv``, save the
    heatmap, and print the table.
    """
    # (pair code, display name) for each direction to evaluate
    pairs_to_run = (
        ("en-bn", "English-Bengali"),
        ("bn-en", "Bengali-English"),
    )

    # Metric tables keyed by display name.
    all_results = {}

    for pair_code, pair_name in pairs_to_run:
        print(f"\nEvaluating {pair_name} translations...")

        # The pair code is "<source>-<target>".
        source_lang, target_lang = pair_code.split("-")

        samples = load_translation_data(pair_code, num_samples=100)
        scores = evaluate_models(samples, source_lang, target_lang)

        # Persist the numbers and the heatmap before reporting.
        scores.to_csv(f"mt_evaluation_results_{pair_code}.csv")
        visualize_results(scores, pair_name)
        all_results[pair_name] = scores

        print(f"\nResults for {pair_name}:")
        print(scores)
    
    # Create combined visualization
#     plt.figure(figsize=(25, 12))
#     for idx, (pair_name, results) in enumerate(all_results.items()):
#         plt.subplot(1, 2, idx+1)
#         sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
#         plt.title(f'Results for {pair_name}')
#         plt.ylabel('Models')
#         plt.xlabel('Metrics')
#         plt.xticks(rotation=45)
#         plt.yticks(rotation=0)
#     plt.tight_layout()
#     plt.savefig('combined_results.png')
#     plt.close()

# Entry point: run the full evaluation when this code is executed directly
# (as opposed to being imported as a module).
if __name__ == "__main__":
    main()


Evaluating English-Bengali translations...


0000.parquet:   0%|          | 0.00/271M [00:00<?, ?B/s]

0001.parquet:   0%|          | 0.00/79.5M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/189k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/118k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2379749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/597 [00:00<?, ? examples/s]


Evaluating gemma2-9b-it (Google)...


100%|██████████| 100/100 [02:55<00:00,  1.76s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating gemma-7b-it (Google)...


 12%|█▏        | 12/100 [00:17<04:14,  2.89s/it]

Error with gemma-7b-it on text: Error code: 503 - {'error': {'message': 'Service Unavailable', 'type': 'internal_server_error'}}


100%|██████████| 100/100 [03:15<00:00,  1.96s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-70b-8192-tool-use-preview (Groq)...


100%|██████████| 100/100 [03:21<00:00,  2.01s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-8b-8192-tool-use-preview (Groq)...


100%|██████████| 100/100 [02:47<00:00,  1.68s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-70b-versatile (Meta)...


100%|██████████| 100/100 [03:26<00:00,  2.06s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-8b-instant (Meta)...


100%|██████████| 100/100 [03:03<00:00,  1.83s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating mixtral-8x7b-32768 (Mistral)...


100%|██████████| 100/100 [03:03<00:00,  1.83s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.2-90b-vision-preview (Meta)...


100%|██████████| 100/100 [06:49<00:00,  4.10s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for English-Bengali:
                                                   BLEU       chrF  \
gemma2-9b-it (Google)                          4.438925  43.498924   
gemma-7b-it (Google)                           5.859950  41.545205   
llama3-groq-70b-8192-tool-use-preview (Groq)   6.563603  50.798672   
llama3-groq-8b-8192-tool-use-preview (Groq)    8.252401  47.519645   
llama-3.1-70b-versatile (Meta)                20.611771  62.359896   
llama-3.1-8b-instant (Meta)                    5.053935  43.872480   
mixtral-8x7b-32768 (Mistral)                   0.561554  16.602645   
llama-3.2-90b-vision-preview (Meta)           20.443994  62.428199   

                                                     TER  BERTScore       WER  \
gemma2-9b-it (Google)                         205.621096   0.644759  2.081194   
gemma-7b-it (Google)                           86.226283   0.721300  0.881940   
llama3-groq-70b-8192-tool-use-preview (Groq)  166.967384   0.769497  1.733518   
llama3-groq-8b-

100%|██████████| 100/100 [02:46<00:00,  1.66s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating gemma-7b-it (Google)...


100%|██████████| 100/100 [03:14<00:00,  1.95s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-70b-8192-tool-use-preview (Groq)...


100%|██████████| 100/100 [02:53<00:00,  1.74s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama3-groq-8b-8192-tool-use-preview (Groq)...


100%|██████████| 100/100 [02:43<00:00,  1.63s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-70b-versatile (Meta)...


100%|██████████| 100/100 [02:51<00:00,  1.71s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.1-8b-instant (Meta)...


100%|██████████| 100/100 [02:44<00:00,  1.65s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating mixtral-8x7b-32768 (Mistral)...


100%|██████████| 100/100 [02:48<00:00,  1.68s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluating llama-3.2-90b-vision-preview (Meta)...


100%|██████████| 100/100 [06:16<00:00,  3.76s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Bengali-English:
                                                   BLEU       chrF  \
gemma2-9b-it (Google)                         19.042866  60.409431   
gemma-7b-it (Google)                          19.646189  54.649268   
llama3-groq-70b-8192-tool-use-preview (Groq)  32.644674  65.805306   
llama3-groq-8b-8192-tool-use-preview (Groq)   22.501088  59.748951   
llama-3.1-70b-versatile (Meta)                40.706086  70.517376   
llama-3.1-8b-instant (Meta)                   24.188652  61.473872   
mixtral-8x7b-32768 (Mistral)                  19.222994  56.893622   
llama-3.2-90b-vision-preview (Meta)           39.878165  69.967829   

                                                     TER  BERTScore       WER  \
gemma2-9b-it (Google)                         123.041998   0.502867  1.290011   
gemma-7b-it (Google)                           78.490352   0.592894  0.851873   
llama3-groq-70b-8192-tool-use-preview (Groq)   56.072645   0.717561  0.648695   
llama3-groq-8b-