In [1]:
!pip install sacrebleu bert-score torchmetrics nltk rouge-score datasets transformers groq pandas tqdm matplotlib seaborn flash_attn
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting groq
  Downloading groq-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting flash_attn
  Downloading flash_attn-2.7.0.post2.tar.gz (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- \ | / done
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting einops (from flash_attn)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading s

True

In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define models
model_names = ["microsoft/Phi-3.5-mini-instruct"]

def load_translation_data(language_pair, num_samples=20):
    """Load dataset for specified language pair."""
    try:
        dataset = load_dataset("wmt19", language_pair, split="validation")
    except ValueError:
        dataset = load_dataset("wmt19", language_pair, split="train")
    return dataset.select(range(min(num_samples, len(dataset))))

def translate_text(text, source_lang, target_lang, model_name, tokenizer, model):
    """Translate text using the specified model."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
    # Generate translation with no gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def calculate_metrics(references, hypotheses):
    """Calculate various MT evaluation metrics."""
    bleu = BLEU()
    chrf = CHRF()
    ter_metric = TER()
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def plot_confusion_matrix(references, translations, model_name, pair_code):
    """Generate and save a confusion matrix."""
    matrix = np.zeros((len(references), len(translations)))
    for i, ref in enumerate(references):
        for j, hyp in enumerate(translations):
            matrix[i, j] = len(set(ref.split()) & set(hyp.split())) / len(set(ref.split()) | set(hyp.split()))
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", cbar=True)
    plt.title(f"Confusion Matrix for {model_name} ({pair_code})")
    plt.xlabel("Hypotheses")
    plt.ylabel("References")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{model_name}_{pair_code}.png")
    plt.close()

def evaluate_model(dataset, source_lang, target_lang, model_name):
    """Evaluate a specific model on the dataset."""
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)
    
    translations = []
    references = []
    source_texts = []
    
    for example in tqdm(dataset, desc=f"Translating with {model_name}"):
        source_text = example['translation'][source_lang]
        reference = example['translation'][target_lang]
        
        try:
            translation = translate_text(source_text, source_lang, target_lang, model_name, tokenizer, model)
            source_texts.append(source_text)
            translations.append(translation)
            references.append(reference)
        except Exception as e:
            print(f"Error during translation with {model_name}: {str(e)}")
            continue

    # Clear GPU memory after processing
    model.to("cpu")
    torch.cuda.empty_cache()

    if translations:
        metrics = calculate_metrics(references, translations)
        return source_texts, references, translations, metrics
    return [], [], [], {}

def main():
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]
    
    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")
        dataset = load_translation_data(pair_code, num_samples=50)
        source_lang, target_lang = pair_code.split("-")
        
        for model_name in model_names:
            print(f"Evaluating {model_name}...")
            source_texts, references, translations, metrics = evaluate_model(dataset, source_lang, target_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}.csv", index=False)
            
            # Plot and save confusion matrix
            plot_confusion_matrix(references, translations, model_name.split("/")[-1], pair_code)
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}):")
            print(results_df)

if __name__ == "__main__":
    main()

Using device: cuda

Evaluating Czech-English translations...


README.md:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/473k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7270695 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2983 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

Translating with microsoft/Phi-3.5-mini-instruct:   0%|          | 0/50 [00:00<?, ?it/s]The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [10:42<00:00, 12.85s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (Czech-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  2.855092  24.579535  680.773362  -0.159256  6.839957   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  6.944374  0.147451  0.064292  0.125495  

Evaluating German-English translations...


Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

train-00000-of-00016.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00001-of-00016.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00002-of-00016.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

train-00003-of-00016.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

train-00004-of-00016.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00005-of-00016.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

train-00006-of-00016.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

train-00007-of-00016.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00008-of-00016.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

train-00009-of-00016.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00010-of-00016.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00011-of-00016.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

train-00012-of-00016.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

train-00013-of-00016.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00014-of-00016.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

train-00015-of-00016.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/495k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34782245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2998 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [11:01<00:00, 13.24s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (German-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  6.048018  33.366536  495.066804   0.037158  5.002056   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  5.209791  0.263017  0.156249  0.231389  

Evaluating Finnish-English translations...


train-00000-of-00003.parquet:   0%|          | 0.00/350M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/177M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/445k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6587448 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [10:46<00:00, 12.94s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (Finnish-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  1.964264  22.814816  521.064061  -0.180719  5.236699   

                            CER   ROUGE-1   ROUGE-2  ROUGE-L  
Phi-3.5-mini-instruct  5.717581  0.164447  0.054478  0.13982  

Evaluating French-German translations...


train-00000-of-00005.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/272M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/263k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9824476 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1512 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [11:19<00:00, 13.59s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (French-German):
                           BLEU       chrF         TER  BERTScore      WER  \
Phi-3.5-mini-instruct  1.842651  23.709473  763.075196  -0.145473  7.63973   

                           CER   ROUGE-1  ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  6.41285  0.110018  0.05527  0.100294  

Evaluating Gujarati-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/361k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/370k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11670 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1998 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct:   2%|▏         | 1/50 [00:07<06:19,  7.75s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  10%|█         | 5/50 [00:13<01:46,  2.37s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  18%|█▊        | 9/50 [00:21<01:30,  2.20s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  22%|██▏       | 11/50 [00:22<00:58,  1.51s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  30%|███       | 15/50 [00:31<01:07,  1.92s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  70%|███████   | 35/50 [02:16<01:19,  5.31s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  78%|███████▊  | 39/50 [02:38<01:00,  5.46s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  84%|████████▍ | 42/50 [02:48<00:34,  4.30s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct:  92%|█████████▏| 46/50 [03:04<00:18,  4.73s/it]

Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.
Error during translation with microsoft/Phi-3.5-mini-instruct: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [03:05<00:00,  3.71s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (Gujarati-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  0.288182  10.775027  199.238579  -0.742477  1.994924   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  1.976486  0.136586  0.001837  0.116136  

Evaluating Kazakh-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/462k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/126583 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2066 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [08:57<00:00, 10.75s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (Kazakh-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  0.415659  18.895064  468.134958  -0.455021  4.699156   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  4.606499  0.131223  0.014041  0.103545  

Evaluating Lithuanian-English translations...


train-00000-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2344893 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [10:23<00:00, 12.47s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (Lithuanian-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  1.180614  21.720618  470.041754   -0.24214  4.720251   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  4.974621  0.151934  0.037285  0.116273  

Evaluating Russian-English translations...


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/28 [00:00<?, ?files/s]

train-00000-of-00028.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00001-of-00028.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

train-00002-of-00028.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00003-of-00028.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00004-of-00028.parquet:   0%|          | 0.00/152M [00:00<?, ?B/s]

train-00005-of-00028.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

train-00006-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00007-of-00028.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

train-00008-of-00028.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

train-00009-of-00028.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00010-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00011-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00012-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00013-of-00028.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00014-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00015-of-00028.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00016-of-00028.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

train-00017-of-00028.parquet:   0%|          | 0.00/262M [00:00<?, ?B/s]

train-00018-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00019-of-00028.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00020-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00021-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00022-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00023-of-00028.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

train-00024-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00025-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00026-of-00028.parquet:   0%|          | 0.00/275M [00:00<?, ?B/s]

train-00027-of-00028.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/611k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37492126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [10:29<00:00, 12.58s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (Russian-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  4.244894  30.819463  477.558757   -0.38695  4.812737   

                            CER   ROUGE-1   ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  5.017196  0.287558  0.138601  0.242065  

Evaluating Chinese-English translations...


train-00000-of-00013.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/342M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/728k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25984574 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3981 [00:00<?, ? examples/s]

Evaluating microsoft/Phi-3.5-mini-instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with microsoft/Phi-3.5-mini-instruct: 100%|██████████| 50/50 [08:58<00:00, 10.77s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for microsoft/Phi-3.5-mini-instruct (Chinese-English):
                           BLEU       chrF         TER  BERTScore       WER  \
Phi-3.5-mini-instruct  9.645664  43.208601  189.255189  -0.155542  1.944444   

                            CER   ROUGE-1  ROUGE-2   ROUGE-L  
Phi-3.5-mini-instruct  2.113674  0.399524   0.2107  0.336844  


In [3]:
# import numpy as np
# import pandas as pd
# from datasets import load_dataset
# from sacrebleu.metrics import BLEU, CHRF, TER
# from bert_score import BERTScorer
# from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
# from rouge_score import rouge_scorer
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from tqdm import tqdm
# import warnings
# import matplotlib.pyplot as plt
# import seaborn as sns
# import torch

# warnings.filterwarnings('ignore')

# # Check if GPU is available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Define models
# model_names = ["microsoft/Phi-3.5-mini-instruct"]

# def load_translation_data(language_pair, num_samples=20):
#     """Load dataset for specified language pair."""
#     try:
#         dataset = load_dataset("wmt19", language_pair, split="validation")
#     except ValueError:
#         dataset = load_dataset("wmt19", language_pair, split="train")
#     return dataset.select(range(min(num_samples, len(dataset))))

# def translate_text(text, source_lang, target_lang, model_name, tokenizer, model):
#     """Translate text using the specified model."""
#     prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
#     inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
#     # Generate translation with no gradients
#     with torch.no_grad():
#         outputs = model.generate(**inputs, max_length=256)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# def calculate_metrics(references, hypotheses):
#     """Calculate various MT evaluation metrics."""
#     bleu = BLEU()
#     chrf = CHRF()
#     ter_metric = TER()
#     bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
#     wer = WordErrorRate()
#     cer = CharErrorRate()
#     rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
#     bleu_score = bleu.corpus_score(hypotheses, [references]).score
#     chrf_score = chrf.corpus_score(hypotheses, [references]).score
#     ter_score = ter_metric.corpus_score(hypotheses, [references]).score
#     P, R, F1 = bert_scorer.score(hypotheses, references)
#     bert_score = F1.mean().item()
#     wer_score = wer(hypotheses, references).item()
#     cer_score = cer(hypotheses, references).item()
#     rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
#     for hyp, ref in zip(hypotheses, references):
#         scores = rouge_metrics.score(ref, hyp)
#         rouge_scores['rouge1'] += scores['rouge1'].fmeasure
#         rouge_scores['rouge2'] += scores['rouge2'].fmeasure
#         rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
#     for key in rouge_scores:
#         rouge_scores[key] /= len(hypotheses)

#     return {
#         "BLEU": bleu_score,
#         "chrF": chrf_score,
#         "TER": ter_score,
#         "BERTScore": bert_score,
#         "WER": wer_score,
#         "CER": cer_score,
#         "ROUGE-1": rouge_scores['rouge1'],
#         "ROUGE-2": rouge_scores['rouge2'],
#         "ROUGE-L": rouge_scores['rougeL']
#     }

# def evaluate_model(dataset, source_lang, target_lang, model_name):
#   """Evaluate a specific model on the dataset."""
#   tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
#   model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to(device)
    
#     translations = []
#     references = []
#     source_texts = []
    
#     for example in tqdm(dataset, desc=f"Translating with {model_name}"):
#         source_text = example['translation'][source_lang]
#         reference = example['translation'][target_lang]
        
#         try:
#             translation = translate_text(source_text, source_lang, target_lang, model_name, tokenizer, model)
#             source_texts.append(source_text)
#             translations.append(translation)
#             references.append(reference)
#         except Exception as e:
#             print(f"Error during translation with {model_name}: {str(e)}")
#             continue

#     # Clear GPU memory after processing
#     model.to("cpu")
#     torch.cuda.empty_cache()

#     if translations:
#         metrics = calculate_metrics(references, translations)
#         return translations, metrics
#     return [], {}

# def main():
#     language_pairs = [
#         ("cs-en", "Czech-English"),
#         ("de-en", "German-English"),
#         ("fi-en", "Finnish-English"),
#         ("fr-de", "French-German"),
#         ("gu-en", "Gujarati-English"),
#         ("kk-en", "Kazakh-English"),
#         ("lt-en", "Lithuanian-English"),
#         ("ru-en", "Russian-English"),
#         ("zh-en", "Chinese-English")
#     ]
    
#     for pair_code, pair_name in language_pairs:
#         print(f"\nEvaluating {pair_name} translations...")
#         dataset = load_translation_data(pair_code, num_samples=50)
#         source_lang, target_lang = pair_code.split("-")
        
#         results = {}
#         all_translations = {}
        
#         for model_name in model_names:
#             print(f"Evaluating {model_name}...")
#             translations, metrics = evaluate_model(dataset, source_lang, target_lang, model_name)
#             model_key = model_name.split("/")[-1]  # Use shorter model name for keys
#             results[model_key] = metrics
#             all_translations[model_key] = translations
        
#         # Save results
#         pd.DataFrame(results).to_csv(f"mt_evaluation_results_{pair_code}.csv")
        
#         # Save translations for review
#         translations_df = pd.DataFrame({
#             "Source": [example['translation'][source_lang] for example in dataset],
#             "Reference": [example['translation'][target_lang] for example in dataset],
#             **{f"Translation_{key}": all_translations[key] for key in results.keys()}
#         })
#         translations_df.to_csv(f"translations_comparison_{model}{pair_code}.csv", index=False)
        
#         # Print results
#         print(f"\nResults for {pair_name}:")
#         print(pd.DataFrame(results))

# if __name__ == "__main__":
#     main()