In [1]:
!pip install numpy pandas datasets sacrebleu bert_score torchmetrics rouge_score transformers tqdm matplotlib seaborn torch

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Building wheels for collected packages: roug

In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define models
model_names = ["Qwen/Qwen2.5-3B-Instruct"]

def load_translation_data(language_pair, num_samples=20):
    """Load dataset for specified language pair."""
    try:
        dataset = load_dataset("wmt19", language_pair, split="validation")
    except ValueError:
        dataset = load_dataset("wmt19", language_pair, split="train")
    return dataset.select(range(min(num_samples, len(dataset))))

def translate_text(text, source_lang, target_lang, model_name, tokenizer, model):
    """Translate text using the specified model."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
    # Generate translation with no gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def calculate_metrics(references, hypotheses):
    """Calculate various MT evaluation metrics."""
    bleu = BLEU()
    chrf = CHRF()
    ter_metric = TER()
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def plot_confusion_matrix(references, translations, model_name, pair_code):
    """Generate and save a confusion matrix."""
    matrix = np.zeros((len(references), len(translations)))
    for i, ref in enumerate(references):
        for j, hyp in enumerate(translations):
            matrix[i, j] = len(set(ref.split()) & set(hyp.split())) / len(set(ref.split()) | set(hyp.split()))
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", cbar=True)
    plt.title(f"Confusion Matrix for {model_name} ({pair_code})")
    plt.xlabel("Hypotheses")
    plt.ylabel("References")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{model_name}_{pair_code}.png")
    plt.close()

def evaluate_model(dataset, source_lang, target_lang, model_name):
    """Evaluate a specific model on the dataset."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    
    translations = []
    references = []
    source_texts = []
    
    for example in tqdm(dataset, desc=f"Translating with {model_name}"):
        source_text = example['translation'][source_lang]
        reference = example['translation'][target_lang]
        
        try:
            translation = translate_text(source_text, source_lang, target_lang, model_name, tokenizer, model)
            source_texts.append(source_text)
            translations.append(translation)
            references.append(reference)
        except Exception as e:
            print(f"Error during translation with {model_name}: {str(e)}")
            continue

    # Clear GPU memory after processing
    model.to("cpu")
    torch.cuda.empty_cache()

    if translations:
        metrics = calculate_metrics(references, translations)
        return source_texts, references, translations, metrics
    return [], [], [], {}

def main():
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]
    
    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")
        dataset = load_translation_data(pair_code, num_samples=50)
        source_lang, target_lang = pair_code.split("-")
        
        for model_name in model_names:
            # Original Direction
            print(f"Evaluating {model_name} in original direction ({source_lang} -> {target_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, source_lang, target_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_original.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_original.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_original")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in original direction:")
            print(results_df)

            # Reverse Direction
            print(f"Evaluating {model_name} in reverse direction ({target_lang} -> {source_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, target_lang, source_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_reverse.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_reverse.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_reverse")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in reverse direction:")
            print(results_df)



if __name__ == "__main__":
    main()

Using device: cuda

Evaluating Czech-English translations...


README.md:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/473k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7270695 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2983 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (cs -> en)...


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Translating with Qwen/Qwen2.5-3B-Instruct:   0%|          | 0/50 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [05:00<00:00,  6.00s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Czech-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  4.065102  28.156581  635.982814  -0.068864  6.385607   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.233159  0.183702  0.093547  0.159198  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> cs)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [07:14<00:00,  8.69s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Czech-English) in reverse direction:
                         BLEU       chrF        TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  1.082017  18.929349  826.59176  -0.093284  8.274656   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  7.261577  0.110577  0.033405  0.083133  

Evaluating German-English translations...


Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

train-00000-of-00016.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00001-of-00016.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00002-of-00016.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

train-00003-of-00016.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

train-00004-of-00016.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00005-of-00016.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

train-00006-of-00016.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

train-00007-of-00016.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00008-of-00016.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

train-00009-of-00016.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00010-of-00016.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00011-of-00016.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

train-00012-of-00016.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

train-00013-of-00016.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00014-of-00016.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

train-00015-of-00016.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/495k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34782245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2998 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (de -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [05:54<00:00,  7.09s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (German-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  4.550302  29.115449  684.275437   -0.01924  6.878726   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.964144  0.190299  0.109336  0.168401  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> de)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [07:10<00:00,  8.61s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (German-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  2.313882  25.559984  916.458853  -0.047912  9.183291   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  7.731013  0.129449  0.065872  0.115112  

Evaluating Finnish-English translations...


train-00000-of-00003.parquet:   0%|          | 0.00/350M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/177M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/445k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6587448 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (fi -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [05:19<00:00,  6.40s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Finnish-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  2.135567  23.070022  642.562432  -0.152893  6.451683   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.697603  0.160966  0.056326  0.134253  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> fi)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [07:16<00:00,  8.73s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Finnish-English) in reverse direction:
                         BLEU       chrF          TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.485504  19.004876  1094.850498  -0.156589  10.95515   

                          CER  ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  7.768223  0.05636  0.014404  0.051168  

Evaluating French-German translations...


train-00000-of-00005.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/272M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/263k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9824476 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1512 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (fr -> de)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [06:21<00:00,  7.63s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (French-German) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  2.344908  26.914463  738.383838  -0.115144  7.402918   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.198687  0.132945  0.061539  0.113429  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (de -> fr)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [06:11<00:00,  7.44s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (French-German) in reverse direction:
                         BLEU       chrF         TER  BERTScore      WER  \
Qwen2.5-3B-Instruct  2.865275  26.776206  667.810293  -0.123832  6.70333   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.419566  0.152408  0.082327  0.134374  

Evaluating Gujarati-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/361k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/370k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11670 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1998 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (gu -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct:   4%|▍         | 2/50 [00:08<02:56,  3.68s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct:  12%|█▏        | 6/50 [00:20<02:18,  3.14s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct:  78%|███████▊  | 39/50 [03:13<01:06,  6.08s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct:  84%|████████▍ | 42/50 [03:24<00:40,  5.06s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [03:54<00:00,  4.69s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Gujarati-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  1.062091  15.219199  332.374101  -0.717185  3.333813   

                         CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  3.11378  0.164092  0.031864  0.136155  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> gu)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [07:56<00:00,  9.54s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Gujarati-English) in reverse direction:
                         BLEU      chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.153213  7.232302  593.530239   0.143685  5.935302   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  5.327559  0.015428  0.003184  0.015428  

Evaluating Kazakh-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/462k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/126583 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2066 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (kk -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct:  26%|██▌       | 13/50 [01:30<04:15,  6.90s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [05:37<00:00,  6.76s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Kazakh-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.202638  10.945883  357.459095  -0.631855  3.586141   

                          CER  ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  3.793848  0.11511  0.009488  0.092388  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> kk)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [07:22<00:00,  8.84s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Kazakh-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.085055  13.632382  605.568445   0.140347  6.056845   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  4.807844  0.005691  0.000597  0.005691  

Evaluating Lithuanian-English translations...


train-00000-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2344893 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (lt -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [06:23<00:00,  7.67s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Lithuanian-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  1.750699  21.768814  635.490605  -0.240061  6.367432   

                          CER   ROUGE-1  ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.702392  0.139715  0.04899  0.109279  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> lt)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [07:09<00:00,  8.59s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Lithuanian-English) in reverse direction:
                         BLEU       chrF        TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.357164  17.401679  861.55914  -0.212066  8.620968   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.628376  0.065161  0.013157  0.055137  

Evaluating Russian-English translations...


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/28 [00:00<?, ?files/s]

train-00000-of-00028.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00001-of-00028.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

train-00002-of-00028.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00003-of-00028.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00004-of-00028.parquet:   0%|          | 0.00/152M [00:00<?, ?B/s]

train-00005-of-00028.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

train-00006-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00007-of-00028.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

train-00008-of-00028.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

train-00009-of-00028.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00010-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00011-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00012-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00013-of-00028.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00014-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00015-of-00028.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00016-of-00028.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

train-00017-of-00028.parquet:   0%|          | 0.00/262M [00:00<?, ?B/s]

train-00018-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00019-of-00028.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00020-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00021-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00022-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00023-of-00028.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

train-00024-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00025-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00026-of-00028.parquet:   0%|          | 0.00/275M [00:00<?, ?B/s]

train-00027-of-00028.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/611k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37492126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (ru -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [05:27<00:00,  6.55s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Russian-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  4.435978  31.235149  479.984837  -0.341326  4.830933   

                          CER  ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  4.771794  0.28061  0.134312  0.233252  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> ru)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [06:54<00:00,  8.30s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Russian-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  2.632447  27.107905  624.150597   0.288813  6.247934   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  5.107092  0.008457  0.001541  0.008457  

Evaluating Chinese-English translations...


train-00000-of-00013.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/342M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/728k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25984574 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3981 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (zh -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [05:35<00:00,  6.72s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Chinese-English) in original direction:
                         BLEU      chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  8.251641  39.54083  281.562882  -0.222626  2.869353   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  3.083601  0.359382  0.195788  0.295161  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> zh)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 50/50 [05:51<00:00,  7.02s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Chinese-English) in reverse direction:
                        BLEU       chrF          TER  BERTScore        WER  \
Qwen2.5-3B-Instruct  0.12628  13.712057  4587.341772   0.170533  45.873417   

                           CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  10.468085  0.023821  0.011114  0.022753  
