In [1]:
!pip install numpy pandas datasets sacrebleu bert_score torchmetrics rouge_score transformers tqdm matplotlib seaborn torch

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Building wheels for collected packages: roug

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define models
model_names = ["Qwen/Qwen2.5-0.5B-Instruct"]

def load_translation_data(language_pair, num_samples=100):
    """Load dataset for specified language pair."""
    try:
        dataset = load_dataset("wmt19", language_pair, split="validation")
    except ValueError:
        dataset = load_dataset("wmt19", language_pair, split="train")
    return dataset.select(range(min(num_samples, len(dataset))))

def translate_text(text, source_lang, target_lang, model_name, tokenizer, model):
    """Translate text using the specified model."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
    # Generate translation with no gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def calculate_metrics(references, hypotheses, target_lang):
    """Calculate various MT evaluation metrics."""
    bleu_tokenizer = "zh" if target_lang == "zh" else "13a"
    bleu = BLEU(tokenizer=bleu_tokenizer)
    chrf = CHRF()
    ter_metric = TER()
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def plot_confusion_matrix(references, translations, model_name, pair_code):
    """Generate and save a confusion matrix."""
    matrix = np.zeros((len(references), len(translations)))
    for i, ref in enumerate(references):
        for j, hyp in enumerate(translations):
            matrix[i, j] = len(set(ref.split()) & set(hyp.split())) / len(set(ref.split()) | set(hyp.split()))
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", cbar=True)
    plt.title(f"Confusion Matrix for {model_name} ({pair_code})")
    plt.xlabel("Hypotheses")
    plt.ylabel("References")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{model_name}_{pair_code}.png")
    plt.close()

def evaluate_model(dataset, source_lang, target_lang, model_name):
    """Evaluate a specific model on the dataset."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    
    translations = []
    references = []
    source_texts = []
    
    for example in tqdm(dataset, desc=f"Translating with {model_name}"):
        source_text = example['translation'][source_lang]
        reference = example['translation'][target_lang]
        
        try:
            translation = translate_text(source_text, source_lang, target_lang, model_name, tokenizer, model)
            source_texts.append(source_text)
            translations.append(translation)
            references.append(reference)
        except Exception as e:
            print(f"Error during translation with {model_name}: {str(e)}")
            continue

    # Clear GPU memory after processing
    model.to("cpu")
    torch.cuda.empty_cache()

    if translations:
        metrics = calculate_metrics(references, translations)
        return source_texts, references, translations, metrics
    return [], [], [], {}

def main():
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]
    
    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")
        dataset = load_translation_data(pair_code, num_samples=100)
        source_lang, target_lang = pair_code.split("-")
        
        for model_name in model_names:
            # Original Direction
            print(f"Evaluating {model_name} in original direction ({source_lang} -> {target_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, source_lang, target_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_original.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_original.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_original")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in original direction:")
            print(results_df)

            # Reverse Direction
            print(f"Evaluating {model_name} in reverse direction ({target_lang} -> {source_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, target_lang, source_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_reverse.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_reverse.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_reverse")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in reverse direction:")
            print(results_df)



if __name__ == "__main__":
    main()


Using device: cuda

Evaluating Czech-English translations...


README.md:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/473k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7270695 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2983 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (cs -> en)...


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Translating with Qwen/Qwen2.5-0.5B-Instruct:   0%|          | 0/100 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:14<00:00,  4.95s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Czech-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  0.839422  16.682168  862.492546  -0.235061  8.647585   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  8.607452  0.095563  0.021502  0.071452  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> cs)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:18<00:00,  5.59s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Czech-English) in reverse direction:
                           BLEU     chrF          TER  BERTScore        WER  \
Qwen2.5-0.5B-Instruct  0.104634  9.26703  1327.272727  -0.392382  13.280602   

                             CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  11.647971  0.034056  0.003753  0.025724  

Evaluating German-English translations...


Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

train-00000-of-00016.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00001-of-00016.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00002-of-00016.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

train-00003-of-00016.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

train-00004-of-00016.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00005-of-00016.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

train-00006-of-00016.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

train-00007-of-00016.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00008-of-00016.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

train-00009-of-00016.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00010-of-00016.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00011-of-00016.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

train-00012-of-00016.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

train-00013-of-00016.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00014-of-00016.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

train-00015-of-00016.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/495k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34782245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2998 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (de -> en)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:34<00:00,  5.14s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (German-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  2.451773  22.557548  764.271654  -0.121821  7.672244   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  7.741893  0.136667  0.057136  0.108906  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> de)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:10<00:00,  5.51s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (German-English) in reverse direction:
                          BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  1.04596  18.595145  989.617798  -0.189026  9.907587   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  7.948365  0.077633  0.026846  0.064214  

Evaluating Finnish-English translations...


train-00000-of-00003.parquet:   0%|          | 0.00/350M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/177M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/445k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6587448 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (fi -> en)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:16<00:00,  4.97s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Finnish-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  0.536817  16.655071  753.685311  -0.277551  7.555149   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  7.612884  0.088477  0.011138  0.065787  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> fi)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:16<00:00,  5.57s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Finnish-English) in reverse direction:
                           BLEU       chrF          TER  BERTScore        WER  \
Qwen2.5-0.5B-Instruct  0.153529  13.169384  1358.911672  -0.365386  13.590694   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  8.785488  0.024103  0.003202  0.020232  

Evaluating French-German translations...


train-00000-of-00005.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/272M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/263k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9824476 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1512 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (fr -> de)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:28<00:00,  5.08s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (French-German) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  1.847409  20.023817  721.344765  -0.299912  7.218863   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  5.892802  0.066405  0.029442  0.056855  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (de -> fr)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:22<00:00,  5.03s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (French-German) in reverse direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  2.219449  21.689881  676.889471  -0.201239  6.779817   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  6.487334  0.108872  0.043889  0.082758  

Evaluating Gujarati-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/361k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/370k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11670 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1998 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (gu -> en)...


Translating with Qwen/Qwen2.5-0.5B-Instruct:   2%|▏         | 2/100 [00:05<03:44,  2.29s/it]

Error during translation with Qwen/Qwen2.5-0.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-0.5B-Instruct:   6%|▌         | 6/100 [00:12<02:58,  1.90s/it]

Error during translation with Qwen/Qwen2.5-0.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-0.5B-Instruct:  39%|███▉      | 39/100 [02:02<03:50,  3.78s/it]

Error during translation with Qwen/Qwen2.5-0.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-0.5B-Instruct:  42%|████▏     | 42/100 [02:09<03:02,  3.14s/it]

Error during translation with Qwen/Qwen2.5-0.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-0.5B-Instruct:  78%|███████▊  | 78/100 [03:50<01:05,  2.98s/it]

Error during translation with Qwen/Qwen2.5-0.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [04:53<00:00,  2.93s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Gujarati-English) in original direction:
                           BLEU      chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  0.141033  7.803456  247.603196  -0.810341  2.478695   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  2.183885  0.111323  0.003647  0.094692  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> gu)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:25<00:00,  5.65s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Gujarati-English) in reverse direction:
                           BLEU      chrF          TER  BERTScore        WER  \
Qwen2.5-0.5B-Instruct  0.022506  0.251207  1291.528926   -0.83351  12.915289   

                             CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  12.398872  0.005014  0.001071  0.005014  

Evaluating Kazakh-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/462k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/126583 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2066 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (kk -> en)...


Translating with Qwen/Qwen2.5-0.5B-Instruct:  13%|█▎        | 13/100 [00:56<06:13,  4.29s/it]

Error during translation with Qwen/Qwen2.5-0.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [06:57<00:00,  4.17s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Kazakh-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  0.167927  11.786487  416.234968  -0.593519  4.175301   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  4.230723  0.110583  0.007142  0.086071  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> kk)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:14<00:00,  5.54s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Kazakh-English) in reverse direction:
                           BLEU     chrF          TER  BERTScore        WER  \
Qwen2.5-0.5B-Instruct  0.012978  0.80924  1095.521542  -0.651436  10.955782   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  8.598508  0.003285  0.001035  0.003142  

Evaluating Lithuanian-English translations...


train-00000-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2344893 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (lt -> en)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:21<00:00,  5.02s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Lithuanian-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  0.448329  17.437597  668.967229   -0.31964  6.708044   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  6.665042  0.089891  0.014329  0.066022  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> lt)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:21<00:00,  5.62s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Lithuanian-English) in reverse direction:
                           BLEU       chrF          TER  BERTScore        WER  \
Qwen2.5-0.5B-Instruct  0.112645  12.632424  1178.506494  -0.478395  11.787013   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  8.699214  0.029546  0.004985  0.025478  

Evaluating Russian-English translations...


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/28 [00:00<?, ?files/s]

train-00000-of-00028.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00001-of-00028.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

train-00002-of-00028.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00003-of-00028.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00004-of-00028.parquet:   0%|          | 0.00/152M [00:00<?, ?B/s]

train-00005-of-00028.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

train-00006-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00007-of-00028.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

train-00008-of-00028.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

train-00009-of-00028.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00010-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00011-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00012-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00013-of-00028.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00014-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00015-of-00028.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00016-of-00028.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

train-00017-of-00028.parquet:   0%|          | 0.00/262M [00:00<?, ?B/s]

train-00018-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00019-of-00028.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00020-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00021-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00022-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00023-of-00028.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

train-00024-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00025-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00026-of-00028.parquet:   0%|          | 0.00/275M [00:00<?, ?B/s]

train-00027-of-00028.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/611k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37492126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (ru -> en)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:10<00:00,  4.90s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Russian-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  1.913803  22.969138  673.679727  -0.406318  6.757666   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  6.518428  0.202246  0.069257  0.150153  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> ru)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:11<00:00,  5.52s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Russian-English) in reverse direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  0.553309  11.929676  912.642431  -0.042048  9.132393   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  7.428799  0.003595  0.000264  0.003501  

Evaluating Chinese-English translations...


train-00000-of-00013.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/342M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/728k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25984574 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3981 [00:00<?, ? examples/s]

Evaluating Qwen/Qwen2.5-0.5B-Instruct in original direction (zh -> en)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [09:01<00:00,  5.41s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Chinese-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-0.5B-Instruct  3.055662  28.867378  444.358974  -0.245966  4.483333   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-0.5B-Instruct  4.586035  0.219233  0.076213  0.155504  
Evaluating Qwen/Qwen2.5-0.5B-Instruct in reverse direction (en -> zh)...


Translating with Qwen/Qwen2.5-0.5B-Instruct: 100%|██████████| 100/100 [08:33<00:00,  5.14s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-0.5B-Instruct (Chinese-English) in reverse direction:
                           BLEU      chrF          TER  BERTScore        WER  \
Qwen2.5-0.5B-Instruct  0.042384  5.335408  9906.293706  -0.110036  99.062935   

                             CER   ROUGE-1   ROUGE-2  ROUGE-L  
Qwen2.5-0.5B-Instruct  17.551081  0.013815  0.003908  0.01372  


In [3]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define models
model_names = ["Qwen/Qwen2.5-1.5B-Instruct"]

def load_translation_data(language_pair, num_samples=100):
    """Load dataset for specified language pair."""
    try:
        dataset = load_dataset("wmt19", language_pair, split="validation")
    except ValueError:
        dataset = load_dataset("wmt19", language_pair, split="train")
    return dataset.select(range(min(num_samples, len(dataset))))

def translate_text(text, source_lang, target_lang, model_name, tokenizer, model):
    """Translate text using the specified model."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
    # Generate translation with no gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def calculate_metrics(references, hypotheses):
    """Calculate various MT evaluation metrics."""
    bleu = BLEU()
    chrf = CHRF()
    ter_metric = TER()
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def plot_confusion_matrix(references, translations, model_name, pair_code):
    """Generate and save a confusion matrix."""
    matrix = np.zeros((len(references), len(translations)))
    for i, ref in enumerate(references):
        for j, hyp in enumerate(translations):
            matrix[i, j] = len(set(ref.split()) & set(hyp.split())) / len(set(ref.split()) | set(hyp.split()))
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", cbar=True)
    plt.title(f"Confusion Matrix for {model_name} ({pair_code})")
    plt.xlabel("Hypotheses")
    plt.ylabel("References")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{model_name}_{pair_code}.png")
    plt.close()

def evaluate_model(dataset, source_lang, target_lang, model_name):
    """Evaluate a specific model on the dataset."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    
    translations = []
    references = []
    source_texts = []
    
    for example in tqdm(dataset, desc=f"Translating with {model_name}"):
        source_text = example['translation'][source_lang]
        reference = example['translation'][target_lang]
        
        try:
            translation = translate_text(source_text, source_lang, target_lang, model_name, tokenizer, model)
            source_texts.append(source_text)
            translations.append(translation)
            references.append(reference)
        except Exception as e:
            print(f"Error during translation with {model_name}: {str(e)}")
            continue

    # Clear GPU memory after processing
    model.to("cpu")
    torch.cuda.empty_cache()

    if translations:
        metrics = calculate_metrics(references, translations)
        return source_texts, references, translations, metrics
    return [], [], [], {}

def main():
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]
    
    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")
        dataset = load_translation_data(pair_code, num_samples=100)
        source_lang, target_lang = pair_code.split("-")
        
        for model_name in model_names:
            # Original Direction
            print(f"Evaluating {model_name} in original direction ({source_lang} -> {target_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, source_lang, target_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_original.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_original.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_original")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in original direction:")
            print(results_df)

            # Reverse Direction
            print(f"Evaluating {model_name} in reverse direction ({target_lang} -> {source_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, target_lang, source_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_reverse.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_reverse.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_reverse")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in reverse direction:")
            print(results_df)



if __name__ == "__main__":
    main()

Using device: cuda

Evaluating Czech-English translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (cs -> en)...


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/660 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [09:17<00:00,  5.58s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Czech-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  1.958305  20.818286  794.096601  -0.184207  7.970185   

                            CER   ROUGE-1  ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  8.002168  0.122494  0.04345  0.096913  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> cs)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [11:09<00:00,  6.69s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Czech-English) in reverse direction:
                           BLEU       chrF          TER  BERTScore        WER  \
Qwen2.5-1.5B-Instruct  0.353477  11.719229  1185.683608  -0.228267  11.863278   

                             CER   ROUGE-1  ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  10.588176  0.062857   0.0114  0.045379  

Evaluating German-English translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (de -> en)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [09:53<00:00,  5.93s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (German-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  3.101783  24.876158  774.311024  -0.042571  7.784449   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  7.931051  0.160224  0.078503  0.128105  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> de)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [10:45<00:00,  6.46s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (German-English) in reverse direction:
                           BLEU       chrF          TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  1.258332  19.544498  1005.362236  -0.150549  10.06218   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  8.222445  0.083936  0.034376  0.070087  

Evaluating Finnish-English translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (fi -> en)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [09:02<00:00,  5.42s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Finnish-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  1.260242  20.379253  713.800314   -0.18598  7.166754   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  7.467144  0.129997  0.034185  0.096656  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> fi)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [10:59<00:00,  6.59s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Finnish-English) in reverse direction:
                           BLEU       chrF          TER  BERTScore        WER  \
Qwen2.5-1.5B-Instruct  0.177209  14.053288  1361.671924  -0.320742  13.620663   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  8.925506  0.032796  0.006375  0.028814  

Evaluating French-German translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (fr -> de)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [10:07<00:00,  6.08s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (French-German) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  2.109827  21.542288  733.303249  -0.235032  7.339801   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  6.087678  0.082168  0.037345  0.069427  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (de -> fr)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [10:02<00:00,  6.02s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (French-German) in reverse direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  2.395744  22.510219  682.656182  -0.181641  6.840105   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  6.643724  0.109199  0.049942  0.088142  

Evaluating Gujarati-English translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (gu -> en)...


Translating with Qwen/Qwen2.5-1.5B-Instruct:   2%|▏         | 2/100 [00:06<04:27,  2.73s/it]

Error during translation with Qwen/Qwen2.5-1.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-1.5B-Instruct:   6%|▌         | 6/100 [00:15<03:35,  2.29s/it]

Error during translation with Qwen/Qwen2.5-1.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-1.5B-Instruct:  39%|███▉      | 39/100 [02:28<04:29,  4.42s/it]

Error during translation with Qwen/Qwen2.5-1.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-1.5B-Instruct:  42%|████▏     | 42/100 [02:37<03:36,  3.74s/it]

Error during translation with Qwen/Qwen2.5-1.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-1.5B-Instruct:  78%|███████▊  | 78/100 [04:39<01:18,  3.55s/it]

Error during translation with Qwen/Qwen2.5-1.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [05:55<00:00,  3.55s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Gujarati-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  0.791933  13.620543  358.455393  -0.726716  3.599867   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  3.329845  0.139039  0.017276  0.105821  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> gu)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [11:13<00:00,  6.74s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Gujarati-English) in reverse direction:
                           BLEU     chrF        TER  BERTScore        WER  \
Qwen2.5-1.5B-Instruct  0.036969  1.92329  1006.5427  -0.381221  10.065427   

                           CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  9.53161  0.007046  0.001506  0.006874  

Evaluating Kazakh-English translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (kk -> en)...


Translating with Qwen/Qwen2.5-1.5B-Instruct:  13%|█▎        | 13/100 [01:03<07:11,  4.96s/it]

Error during translation with Qwen/Qwen2.5-1.5B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [08:06<00:00,  4.87s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Kazakh-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  0.429551  16.460724  479.139685  -0.527514  4.813136   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  4.728486  0.139237  0.015782  0.097147  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> kk)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [11:13<00:00,  6.74s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Kazakh-English) in reverse direction:
                           BLEU      chrF          TER  BERTScore        WER  \
Qwen2.5-1.5B-Instruct  0.016317  3.386815  1032.199546   -0.35274  10.321996   

                            CER  ROUGE-1   ROUGE-2  ROUGE-L  
Qwen2.5-1.5B-Instruct  8.179684  0.00315  0.000575  0.00265  

Evaluating Lithuanian-English translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (lt -> en)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [09:28<00:00,  5.69s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Lithuanian-English) in original direction:
                           BLEU       chrF         TER  BERTScore      WER  \
Qwen2.5-1.5B-Instruct  1.109456  20.102395  655.958292  -0.260421  6.57994   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  6.755601  0.122428  0.029407  0.086521  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> lt)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [10:54<00:00,  6.54s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Lithuanian-English) in reverse direction:
                           BLEU      chrF     TER  BERTScore        WER  \
Qwen2.5-1.5B-Instruct  0.122183  13.06747  1165.0  -0.393417  11.652597   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  8.740262  0.032099  0.005596  0.026977  

Evaluating Russian-English translations...


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (ru -> en)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [09:17<00:00,  5.58s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Russian-English) in original direction:
                           BLEU       chrF         TER  BERTScore      WER  \
Qwen2.5-1.5B-Instruct  2.862261  24.756069  631.643952  -0.360664  6.35477   

                            CER  ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  6.297227  0.21915  0.092865  0.169792  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> ru)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [10:43<00:00,  6.44s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Russian-English) in reverse direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  0.910769  13.020258  942.539338  -0.080519  9.427021   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  7.782375  0.003668  0.000678  0.003668  

Evaluating Chinese-English translations...
Evaluating Qwen/Qwen2.5-1.5B-Instruct in original direction (zh -> en)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [10:10<00:00,  6.11s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Chinese-English) in original direction:
                           BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-1.5B-Instruct  4.941835  32.362379  351.474359  -0.288602  3.564423   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  3.941724  0.276056  0.120375  0.212749  
Evaluating Qwen/Qwen2.5-1.5B-Instruct in reverse direction (en -> zh)...


Translating with Qwen/Qwen2.5-1.5B-Instruct: 100%|██████████| 100/100 [09:56<00:00,  5.96s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-1.5B-Instruct (Chinese-English) in reverse direction:
                           BLEU     chrF          TER  BERTScore         WER  \
Qwen2.5-1.5B-Instruct  0.050823  4.32966  10720.27972  -0.227371  107.202797   

                             CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-1.5B-Instruct  18.483522  0.014194  0.005341  0.014099  


In [4]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define models
model_names = ["Qwen/Qwen2.5-3B-Instruct"]

def load_translation_data(language_pair, num_samples=100):
    """Load dataset for specified language pair."""
    try:
        dataset = load_dataset("wmt19", language_pair, split="validation")
    except ValueError:
        dataset = load_dataset("wmt19", language_pair, split="train")
    return dataset.select(range(min(num_samples, len(dataset))))

def translate_text(text, source_lang, target_lang, model_name, tokenizer, model):
    """Translate text using the specified model."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
    # Generate translation with no gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def calculate_metrics(references, hypotheses):
    """Calculate various MT evaluation metrics."""
    bleu = BLEU()
    chrf = CHRF()
    ter_metric = TER()
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def plot_confusion_matrix(references, translations, model_name, pair_code):
    """Generate and save a confusion matrix."""
    matrix = np.zeros((len(references), len(translations)))
    for i, ref in enumerate(references):
        for j, hyp in enumerate(translations):
            matrix[i, j] = len(set(ref.split()) & set(hyp.split())) / len(set(ref.split()) | set(hyp.split()))
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", cbar=True)
    plt.title(f"Confusion Matrix for {model_name} ({pair_code})")
    plt.xlabel("Hypotheses")
    plt.ylabel("References")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{model_name}_{pair_code}.png")
    plt.close()

def evaluate_model(dataset, source_lang, target_lang, model_name):
    """Evaluate a specific model on the dataset."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    
    translations = []
    references = []
    source_texts = []
    
    for example in tqdm(dataset, desc=f"Translating with {model_name}"):
        source_text = example['translation'][source_lang]
        reference = example['translation'][target_lang]
        
        try:
            translation = translate_text(source_text, source_lang, target_lang, model_name, tokenizer, model)
            source_texts.append(source_text)
            translations.append(translation)
            references.append(reference)
        except Exception as e:
            print(f"Error during translation with {model_name}: {str(e)}")
            continue

    # Clear GPU memory after processing
    model.to("cpu")
    torch.cuda.empty_cache()

    if translations:
        metrics = calculate_metrics(references, translations)
        return source_texts, references, translations, metrics
    return [], [], [], {}

def main():
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]
    
    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")
        dataset = load_translation_data(pair_code, num_samples=100)
        source_lang, target_lang = pair_code.split("-")
        
        for model_name in model_names:
            # Original Direction
            print(f"Evaluating {model_name} in original direction ({source_lang} -> {target_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, source_lang, target_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_original.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_original.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_original")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in original direction:")
            print(results_df)

            # Reverse Direction
            print(f"Evaluating {model_name} in reverse direction ({target_lang} -> {source_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, target_lang, source_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_reverse.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_reverse.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_reverse")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in reverse direction:")
            print(results_df)



if __name__ == "__main__":
    main()

Using device: cuda

Evaluating Czech-English translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (cs -> en)...


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [09:29<00:00,  5.69s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Czech-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  3.248358  26.111695  684.019082  -0.091067  6.875373   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.828964  0.175073  0.083167  0.149022  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> cs)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [13:55<00:00,  8.35s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Czech-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.961923  17.503307  946.886185  -0.086503  9.482463   

                          CER   ROUGE-1   ROUGE-2  ROUGE-L  
Qwen2.5-3B-Instruct  8.418118  0.109635  0.034861  0.08747  

Evaluating German-English translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (de -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [10:23<00:00,  6.23s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (German-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  4.870943  30.809862  621.948819  -0.006349  6.250984   

                          CER  ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.289469  0.20781  0.119773  0.182032  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> de)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [12:36<00:00,  7.57s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (German-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  2.768603  27.419971  769.652025  -0.037403  7.714775   

                         CER   ROUGE-1  ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.53409  0.147239  0.07263  0.128356  

Evaluating Finnish-English translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (fi -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [10:00<00:00,  6.00s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Finnish-English) in original direction:
                         BLEU      chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  2.716093  25.14421  596.915839  -0.139334  5.992159   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.155571  0.171118  0.067076  0.138023  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> fi)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [13:52<00:00,  8.33s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Finnish-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.435419  20.174824  998.501577  -0.128207  9.990537   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.934125  0.065687  0.018794  0.056256  

Evaluating French-German translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (fr -> de)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [11:59<00:00,  7.20s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (French-German) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  3.773717  29.256912  591.561372  -0.106928  5.926895   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  5.072568  0.150607  0.073785  0.131158  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (de -> fr)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [12:03<00:00,  7.23s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (French-German) in reverse direction:
                         BLEU       chrF        TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  3.846659  28.426504  591.56837  -0.114419  5.923547   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  5.771163  0.165255  0.092322  0.143762  

Evaluating Gujarati-English translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (gu -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct:   2%|▏         | 2/100 [00:04<03:42,  2.27s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct:   6%|▌         | 6/100 [00:16<04:19,  2.76s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct:  39%|███▉      | 39/100 [03:10<06:02,  5.94s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct:  42%|████▏     | 42/100 [03:21<04:48,  4.98s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct:  78%|███████▊  | 78/100 [05:57<01:38,  4.48s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [07:35<00:00,  4.56s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Gujarati-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  1.604138  14.618264  307.323569  -0.740366  3.087217   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  2.841098  0.152692  0.032318  0.126187  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> gu)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [14:41<00:00,  8.82s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Gujarati-English) in reverse direction:
                         BLEU    chrF        TER  BERTScore      WER  \
Qwen2.5-3B-Instruct  0.162421  7.5918  530.30303   0.190826  5.30303   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  4.778183  0.019287  0.005008  0.018708  

Evaluating Kazakh-English translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (kk -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct:  13%|█▎        | 13/100 [01:27<09:33,  6.59s/it]

Error during translation with Qwen/Qwen2.5-3B-Instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [10:44<00:00,  6.44s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Kazakh-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.387671  10.668713  348.473636  -0.623327  3.497687   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  3.724758  0.118923  0.012477  0.096299  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> kk)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [14:02<00:00,  8.43s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Kazakh-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.077044  13.168374  606.519274   0.165692  6.067461   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  4.851836  0.007999  0.002729  0.007999  

Evaluating Lithuanian-English translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (lt -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [11:40<00:00,  7.00s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Lithuanian-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  1.346549  21.011527  574.180735  -0.275586  5.761172   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  5.901659  0.126825  0.036996  0.098328  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> lt)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [13:33<00:00,  8.13s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Lithuanian-English) in reverse direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  0.354375  18.192187  806.688312  -0.196933  8.070779   

                          CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  6.359476  0.065781  0.013067  0.054411  

Evaluating Russian-English translations...


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (ru -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [09:13<00:00,  5.54s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Russian-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  4.953071  33.236137  492.333901  -0.298943  4.958262   

                         CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  4.84345  0.299247  0.155828  0.253642  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> ru)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [13:27<00:00,  8.07s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Russian-English) in reverse direction:
                         BLEU      chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  2.425557  25.12731  754.910472   0.235797  7.555073   

                          CER  ROUGE-1   ROUGE-2  ROUGE-L  
Qwen2.5-3B-Instruct  6.325459  0.00597  0.000999  0.00597  

Evaluating Chinese-English translations...
Evaluating Qwen/Qwen2.5-3B-Instruct in original direction (zh -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [11:25<00:00,  6.85s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Chinese-English) in original direction:
                         BLEU       chrF         TER  BERTScore       WER  \
Qwen2.5-3B-Instruct  6.591684  36.634471  333.685897  -0.226839  3.396154   

                          CER   ROUGE-1   ROUGE-2  ROUGE-L  
Qwen2.5-3B-Instruct  3.634372  0.308377  0.154974  0.24984  
Evaluating Qwen/Qwen2.5-3B-Instruct in reverse direction (en -> zh)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with Qwen/Qwen2.5-3B-Instruct: 100%|██████████| 100/100 [11:25<00:00,  6.85s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Qwen/Qwen2.5-3B-Instruct (Chinese-English) in reverse direction:
                         BLEU       chrF          TER  BERTScore        WER  \
Qwen2.5-3B-Instruct  0.114036  12.633133  5425.174825   0.137431  54.251747   

                           CER   ROUGE-1   ROUGE-2   ROUGE-L  
Qwen2.5-3B-Instruct  11.491944  0.025097  0.011507  0.024933  
