In [1]:
!pip install sacrebleu bert-score torchmetrics nltk rouge-score datasets transformers groq pandas tqdm matplotlib seaborn
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting groq
  Downloading groq-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading groq-0.12.

True

In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define Microsoft Phi model and tokenizer
MODEL_NAME = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(device)  # Move model to GPU

def load_translation_data(language_pair, num_samples=20):
    """Load dataset for specified language pair."""
    try:
        dataset = load_dataset("wmt19", language_pair, split="validation")
    except ValueError:
        dataset = load_dataset("wmt19", language_pair, split="train")
    return dataset.select(range(min(num_samples, len(dataset))))

def translate_text(text, source_lang, target_lang):
    """Translate text using the Microsoft Phi model."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True).to(device)  # Move tensors to GPU
    outputs = model.generate(**inputs, max_length=512)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def calculate_metrics(references, hypotheses, target_lang):
    """Calculate various MT evaluation metrics."""
    bleu_tokenizer = "zh" if target_lang == "zh" else "13a"
    bleu = BLEU(tokenizer=bleu_tokenizer)
    chrf = CHRF()
    ter_metric = TER()
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def evaluate_model(dataset, source_lang, target_lang):
    """Evaluate the Microsoft Phi model on the dataset."""
    translations = []
    references = []
    source_texts = []
    
    for example in tqdm(dataset):
        source_text = example['translation'][source_lang]
        reference = example['translation'][target_lang]
        
        try:
            translation = translate_text(source_text, source_lang, target_lang)
            source_texts.append(source_text)
            translations.append(translation)
            references.append(reference)
        except Exception as e:
            print(f"Error during translation: {str(e)}")
            continue
    
    if translations:
        metrics = calculate_metrics(references, translations)
        pd.DataFrame({
            'Source': source_texts,
            'Reference': references,
            'Translation': translations
        }).to_csv(f'translations_phi_{source_lang}-{target_lang}.csv', index=False)
        return metrics
    return {}

def visualize_results(results, pair_name):
    """Create visualizations for the evaluation results."""
    plt.figure(figsize=(12, 6))
    sns.barplot(data=pd.DataFrame(results, index=[0]).T, x=0, y=pd.DataFrame(results, index=[0]).T.index, palette="viridis")
    plt.title(f'Translation Metrics for {pair_name} using Microsoft Phi')
    plt.xlabel('Score')
    plt.ylabel('Metrics')
    plt.tight_layout()
    plt.savefig(f'mt_evaluation_bar_{pair_name}.png')
    plt.close()

def main():
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]
    
    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")
        dataset = load_translation_data(pair_code, num_samples=100)
        source_lang, target_lang = pair_code.split("-")
        results = evaluate_model(dataset, source_lang, target_lang)
        results_df = pd.DataFrame([results], index=["Microsoft Phi"])
        results_df.to_csv(f"mt_evaluation_results_phi_{pair_code}.csv")
        visualize_results(results, pair_name)
        print(f"\nResults for {pair_name}:")
        print(results_df)

if __name__ == "__main__":
    main()


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]


Evaluating Czech-English translations...


README.md:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/473k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7270695 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2983 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
  1%|          | 1/100 [00:08<13:47,  8.36s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:21<18:06, 11.09s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:29<15:23,  9.52s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:35<13:32,  8.47s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:50<17:04, 10.79s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:56<14:16,  9.11s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [01:05<14:15,  9.19s/i

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for Czech-English:
                   BLEU       chrF         TER  BERTScore       WER       CER  \
Microsoft Phi  0.483783  14.012148  968.157424   -0.25617  9.699463  9.311726   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.063692  0.011466  0.052139  

Evaluating German-English translations...


Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

train-00000-of-00016.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00001-of-00016.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00002-of-00016.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

train-00003-of-00016.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

train-00004-of-00016.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00005-of-00016.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

train-00006-of-00016.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

train-00007-of-00016.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00008-of-00016.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

train-00009-of-00016.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00010-of-00016.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00011-of-00016.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

train-00012-of-00016.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

train-00013-of-00016.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00014-of-00016.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

train-00015-of-00016.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/495k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34782245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2998 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:15<25:04, 15.20s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:20<15:13,  9.32s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:35<19:07, 11.83s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:39<13:56,  8.72s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:39<09:08,  5.78s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:44<08:27,  5.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [00:50<08:40,  5.59s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [00:58<09:43,  6.34s/it]Setting `pad_toke


Results for German-English:
                   BLEU       chrF         TER  BERTScore       WER      CER  \
Microsoft Phi  2.179013  20.955072  836.860236  -0.113943  8.408464  8.73829   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.166228  0.076531  0.136881  

Evaluating Finnish-English translations...


train-00000-of-00003.parquet:   0%|          | 0.00/350M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/177M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/445k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6587448 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:06<11:00,  6.67s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:21<18:40, 11.44s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:22<10:38,  6.59s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:23<07:25,  4.64s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:30<08:32,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:44<13:06,  8.36s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [00:47<10:05,  6.51s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [00:51<08:54,  5.80s/it]Setting `pad_toke


Results for Finnish-English:
                   BLEU       chrF         TER  BERTScore       WER       CER  \
Microsoft Phi  0.367346  13.165669  916.675379  -0.303051  9.184004  9.272766   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.062087  0.006795  0.050861  

Evaluating French-German translations...


train-00000-of-00005.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/272M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/263k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9824476 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1512 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:15<25:50, 15.66s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:20<15:04,  9.23s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:33<17:46, 10.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:43<17:10, 10.73s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:53<16:39, 10.52s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [01:03<16:11, 10.34s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [01:04<11:00,  7.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [01:17<13:47,  9.00s/it]Setting `pad_toke


Results for French-German:
                   BLEU       chrF         TER  BERTScore       WER       CER  \
Microsoft Phi  1.276273  16.779796  849.684116  -0.361385  8.489621  7.177238   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.056755  0.023802  0.051182  

Evaluating Gujarati-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/361k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/370k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11670 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1998 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:11<18:49, 11.41s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:15<11:44,  7.19s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Error during translation: Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


  4%|▍         | 4/100 [00:24<08:58,  5.61s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:33<10:28,  6.62s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:38<09:36,  6.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [00:40<07:36,  4.91s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [00:51<10:19,  6.73s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  9%|▉         | 9/100 [01:00<11:07,  7.33s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 10%|█         | 10/100 [01:06<10:24,  6.93s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 11%|█         | 11/100 [01:14<10:51,  7.32s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 12%|█▏        | 12/100 [01:22<10:45,  7.33s/it]Settin

Error during translation: Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


 41%|████      | 41/100 [05:39<05:56,  6.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 42%|████▏     | 42/100 [05:48<06:39,  6.89s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Error during translation: Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


 44%|████▍     | 44/100 [05:58<05:38,  6.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 45%|████▌     | 45/100 [06:06<05:59,  6.53s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 46%|████▌     | 46/100 [06:07<04:40,  5.19s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 47%|████▋     | 47/100 [06:14<04:56,  5.59s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 48%|████▊     | 48/100 [06:19<04:43,  5.46s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 49%|████▉     | 49/100 [06:24<04:35,  5.39s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 50%|█████     | 50/100 [06:33<05:12,  6.25s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 51%|█████     | 51/100 [06:41<05:27,  6.69s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 52%|█████▏    | 52/100 [06:46<05:02,  6.30s/it]

Error during translation: Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


 80%|████████  | 80/100 [10:14<01:31,  4.60s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 81%|████████  | 81/100 [10:26<02:01,  6.38s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 82%|████████▏ | 82/100 [10:35<02:10,  7.23s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 83%|████████▎ | 83/100 [10:36<01:34,  5.55s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 84%|████████▍ | 84/100 [10:48<01:56,  7.28s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 85%|████████▌ | 85/100 [10:55<01:46,  7.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 86%|████████▌ | 86/100 [11:04<01:47,  7.65s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 87%|████████▋ | 87/100 [11:13<01:46,  8.19s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 88%|████████▊ | 88/100 [11:14<01:14,  6.18s/it]


Results for Gujarati-English:
                   BLEU       chrF         TER  BERTScore       WER     CER  \
Microsoft Phi  0.083857  12.579439  546.557377  -0.690913  5.472131  5.0093   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.087328  0.002769  0.074398  

Evaluating Kazakh-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/462k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/126583 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2066 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:13<23:01, 13.96s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:24<19:56, 12.21s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:36<19:28, 12.05s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:43<16:12, 10.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:53<15:40,  9.90s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:54<10:53,  6.96s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [00:59<09:44,  6.28s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [01:00<06:47,  4.43s/it]Setting `pad_toke


Results for Kazakh-English:
                   BLEU       chrF         TER  BERTScore       WER      CER  \
Microsoft Phi  0.061698  13.190723  514.063927  -0.549312  5.151598  4.88502   

                ROUGE-1  ROUGE-2  ROUGE-L  
Microsoft Phi  0.083511  0.00384  0.07232  

Evaluating Lithuanian-English translations...


train-00000-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2344893 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:11<19:12, 11.64s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:24<20:39, 12.65s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:38<20:43, 12.82s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:43<15:40,  9.80s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:51<14:31,  9.17s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [01:04<16:30, 10.53s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [01:16<17:14, 11.12s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [01:31<18:41, 12.18s/it]Setting `pad_toke


Results for Lithuanian-English:
                   BLEU      chrF         TER  BERTScore       WER       CER  \
Microsoft Phi  0.285407  13.61733  929.543198  -0.325616  9.311817  9.450231   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.069702  0.010954  0.054421  

Evaluating Russian-English translations...


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/28 [00:00<?, ?files/s]

train-00000-of-00028.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00001-of-00028.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

train-00002-of-00028.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00003-of-00028.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00004-of-00028.parquet:   0%|          | 0.00/152M [00:00<?, ?B/s]

train-00005-of-00028.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

train-00006-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00007-of-00028.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

train-00008-of-00028.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

train-00009-of-00028.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00010-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00011-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00012-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00013-of-00028.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00014-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00015-of-00028.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00016-of-00028.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

train-00017-of-00028.parquet:   0%|          | 0.00/262M [00:00<?, ?B/s]

train-00018-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00019-of-00028.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00020-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00021-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00022-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00023-of-00028.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

train-00024-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00025-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00026-of-00028.parquet:   0%|          | 0.00/275M [00:00<?, ?B/s]

train-00027-of-00028.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/611k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37492126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:09<14:52,  9.02s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:10<07:05,  4.34s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:11<04:32,  2.81s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:12<03:25,  2.14s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:14<03:16,  2.07s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:14<02:25,  1.55s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [00:18<03:21,  2.17s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [00:23<05:00,  3.27s/it]Setting `pad_toke

Error during translation: Input length of input_ids is 512, but `max_length` is set to 512. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


 21%|██        | 21/100 [00:49<02:37,  1.99s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 22%|██▏       | 22/100 [00:55<04:04,  3.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 23%|██▎       | 23/100 [00:56<03:15,  2.54s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 24%|██▍       | 24/100 [00:57<02:38,  2.09s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 25%|██▌       | 25/100 [00:59<02:32,  2.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 26%|██▌       | 26/100 [01:08<04:53,  3.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 27%|██▋       | 27/100 [01:08<03:36,  2.96s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 28%|██▊       | 28/100 [01:12<03:49,  3.19s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 29%|██▉       | 29/100 [01:26<07:32,  6.38s/it]


Results for Russian-English:
                   BLEU       chrF         TER  BERTScore       WER       CER  \
Microsoft Phi  1.034057  16.012403  258.622203  -0.460086  2.603774  2.469475   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.199448  0.041464  0.161081  

Evaluating Chinese-English translations...


train-00000-of-00013.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/342M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/728k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25984574 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3981 [00:00<?, ? examples/s]

  0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  1%|          | 1/100 [00:08<13:46,  8.35s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  2%|▏         | 2/100 [00:09<06:42,  4.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  3%|▎         | 3/100 [00:12<05:29,  3.40s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  4%|▍         | 4/100 [00:21<09:13,  5.76s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  5%|▌         | 5/100 [00:27<09:30,  6.01s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  6%|▌         | 6/100 [00:35<10:07,  6.46s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  7%|▋         | 7/100 [00:40<09:29,  6.13s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
  8%|▊         | 8/100 [00:54<12:55,  8.43s/it]Setting `pad_toke


Results for Chinese-English:
                   BLEU      chrF         TER  BERTScore       WER       CER  \
Microsoft Phi  2.083147  26.60148  323.397436  -0.257722  3.261859  3.070347   

                ROUGE-1   ROUGE-2   ROUGE-L  
Microsoft Phi  0.238125  0.077757  0.186024  


In [3]:
# import numpy as np
# import pandas as pd
# from datasets import load_dataset
# from sacrebleu.metrics import BLEU, CHRF, TER
# from bert_score import BERTScorer
# from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
# from rouge_score import rouge_scorer
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from tqdm import tqdm
# import warnings
# import matplotlib.pyplot as plt
# import seaborn as sns

# warnings.filterwarnings('ignore')

# # Define Microsoft Phi model and tokenizer
# MODEL_NAME = "microsoft/phi-2"
# # Load model directly
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# def load_translation_data(language_pair, num_samples=20):
#     """Load dataset for specified language pair."""
#     try:
#         # Try loading from validation set first
#         dataset = load_dataset("wmt19", language_pair, split="validation")
#     except ValueError:
#         # If validation not available, try train set
#         dataset = load_dataset("wmt19", language_pair, split="train")
    
#     # Select the specified number of samples
#     return dataset.select(range(min(num_samples, len(dataset))))

# def translate_text(text, source_lang, target_lang):
#     """Translate text using the Microsoft Phi model."""
#     prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
#     inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
#     outputs = model.generate(**inputs, max_length=512)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

# def calculate_metrics(references, hypotheses):
#     """Calculate various MT evaluation metrics."""
#     # Initialize metrics
#     bleu = BLEU()
#     chrf = CHRF()
#     ter_metric = TER()
#     bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
#     wer = WordErrorRate()
#     cer = CharErrorRate()
#     rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
#     # Calculate BLEU and variants
#     bleu_score = bleu.corpus_score(hypotheses, [references]).score
#     chrf_score = chrf.corpus_score(hypotheses, [references]).score
#     ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    
#     # Calculate BERTScore
#     P, R, F1 = bert_scorer.score(hypotheses, references)
#     bert_score = F1.mean().item()
    
#     # Calculate WER and CER
#     wer_score = wer(hypotheses, references).item()
#     cer_score = cer(hypotheses, references).item()
    
#     # Calculate ROUGE scores
#     rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
#     for hyp, ref in zip(hypotheses, references):
#         scores = rouge_metrics.score(ref, hyp)
#         rouge_scores['rouge1'] += scores['rouge1'].fmeasure
#         rouge_scores['rouge2'] += scores['rouge2'].fmeasure
#         rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
#     for key in rouge_scores:
#         rouge_scores[key] /= len(hypotheses)

#     return {
#         "BLEU": bleu_score,
#         "chrF": chrf_score,
#         "TER": ter_score,
#         "BERTScore": bert_score,
#         "WER": wer_score,
#         "CER": cer_score,
#         "ROUGE-1": rouge_scores['rouge1'],
#         "ROUGE-2": rouge_scores['rouge2'],
#         "ROUGE-L": rouge_scores['rougeL']
#     }

# def evaluate_model(dataset, source_lang, target_lang):
#     """Evaluate the Microsoft Phi model on the dataset."""
#     translations = []
#     references = []
#     source_texts = []
    
#     for example in tqdm(dataset):
#         source_text = example['translation'][source_lang]
#         reference = example['translation'][target_lang]
        
#         try:
#             translation = translate_text(source_text, source_lang, target_lang)
#             source_texts.append(source_text)
#             translations.append(translation)
#             references.append(reference)
#         except Exception as e:
#             print(f"Error during translation: {str(e)}")
#             continue
    
#     if translations:  # Only calculate metrics if we have translations
#         # Calculate all metrics
#         metrics = calculate_metrics(references, translations)
        
#         # Save translations and source texts for review
#         pd.DataFrame({
#             'Source': source_texts,
#             'Reference': references,
#             'Translation': translations
#         }).to_csv(f'translations_phi_{source_lang}-{target_lang}.csv', index=False)
        
#         return metrics
#     return {}

# def visualize_results(results, pair_name):
#     """Create visualizations for the evaluation results."""
#     plt.figure(figsize=(12, 6))
#     sns.barplot(data=pd.DataFrame(results, index=[0]).T, x=0, y=pd.DataFrame(results, index=[0]).T.index, palette="viridis")
#     plt.title(f'Translation Metrics for {pair_name} using Microsoft Phi')
#     plt.xlabel('Score')
#     plt.ylabel('Metrics')
#     plt.tight_layout()
#     plt.savefig(f'mt_evaluation_bar_{pair_name}.png')
#     plt.close()

# def main():
#     # Language pairs to evaluate
#     language_pairs = [
#         ("cs-en", "Czech-English"),
#         ("de-en", "German-English")
#     ]
    
#     for pair_code, pair_name in language_pairs:
#         print(f"\nEvaluating {pair_name} translations...")
        
#         # Load dataset
#         dataset = load_translation_data(pair_code, num_samples=100)
        
#         # Get source and target language codes
#         source_lang, target_lang = pair_code.split("-")
        
#         # Evaluate model
#         results = evaluate_model(dataset, source_lang, target_lang)
        
#         # Save results
#         results_df = pd.DataFrame([results], index=["Microsoft Phi"])
#         results_df.to_csv(f"mt_evaluation_results_phi_{pair_code}.csv")
        
#         # Create visualizations
#         visualize_results(results, pair_name)
        
#         # Print results
#         print(f"\nResults for {pair_name}:")
#         print(results_df)

# if __name__ == "__main__":
#     main()


In [4]:
# import numpy as np
# import pandas as pd
# from datasets import load_dataset
# from sacrebleu.metrics import BLEU, CHRF, TER
# from bert_score import BERTScorer
# from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
# from rouge_score import rouge_scorer
# from groq import Groq
# import torch
# from tqdm import tqdm
# import warnings
# import matplotlib.pyplot as plt
# import seaborn as sns

In [5]:
# from huggingface_hub import login

# login(token="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")

# warnings.filterwarnings('ignore')

# # Initialize Groq client with placeholder API key
# client = Groq(api_key="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")

In [6]:
# import torch
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# MODELS = {
# #     "gemma2-9b-it": {"provider": "Google", "context_length": 8192},
# #     "gemma-7b-it": {"provider": "Google", "context_length": 8192},
# #     "llama3-groq-70b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
# #     "llama3-groq-8b-8192-tool-use-preview": {"provider": "Groq", "context_length": 8192},
# #     "llama-3.1-70b-versatile": {"provider": "Meta", "context_length": 8192},
# #     "llama-3.1-8b-instant": {"provider": "Meta", "context_length": 8192},
# #     "mixtral-8x7b-32768": {"provider": "Mistral", "context_length": 32768},
# #     "llama-3.2-90b-vision-preview": {"provider": "Meta", "context_length": 128000},
#     "Qwen2.5-0.5B": {"provider": "Qwen", "context_length": 512, "model_name": "Qwen2.5-0.5B"},
#     "Phi-3.5-mini-instruct": {"provider": "microsoft", "context_length": 512, "model_name": "Phi-3.5-mini-instruct"},
# }

In [7]:
# def load_translation_data(language_pair, num_samples=20):
#     """Load dataset for specified language pair."""
#     try:
#         # Try loading from validation set first
#         dataset = load_dataset("wmt19", language_pair, split="validation")
#     except ValueError:
#         # If validation not available, try train set
#         dataset = load_dataset("wmt19", language_pair, split="train")
    
#     # Select the specified number of samples
#     return dataset.select(range(min(num_samples, len(dataset))))

In [8]:
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# def translate_text(text, model_name, source_lang, target_lang):
#     if MODELS[model_name]["provider"] == "Hugging Face":
#         # Load model and tokenizer
#         model = AutoModelForSeq2SeqLM.from_pretrained(MODELS[model_name]["model_name"])
#         tokenizer = AutoTokenizer.from_pretrained(MODELS[model_name]["model_name"])
        
#         # Prepare input
#         input_text = f"Translate the following {source_lang} text to {target_lang}: {text} Translation:"
#         inputs = tokenizer(input_text, return_tensors="pt", max_length=MODELS[model_name]["context_length"], truncation=True)
        
#         # Generate translation
#         output = model.generate(**inputs)
        
#         # Convert to text
#         translation = tokenizer.decode(output[0], skip_special_tokens=True)
        
#         return translation
    
#     else:
#         language_names = {
#             'cs': 'Czech', 'en': 'English', 'de': 'German', 'fi': 'Finnish', 'fr': 'French',
#             'gu': 'Gujarati', 'kk': 'Kazakh', 'lt': 'Lithuanian', 'ru': 'Russian', 'zh': 'Chinese'
#         }
#         source_lang_name = language_names[source_lang]
#         target_lang_name = language_names[target_lang]
#         prompt = f"""Translate the following {source_lang_name} text to {target_lang_name}: {text} Translation:"""
        
#         # Get model's max context length
#         max_length = MODELS[model_name]["context_length"]
        
#         # Truncate input if necessary to fit context length (leaving room for prompt and response)
#         safe_length = max_length - 500
#         if len(text) > safe_length:
#             text = text[:safe_length] + "..."
        
#         chat_completion = client.chat.completions.create(
#             model=model_name,
#             messages=[{"role": "user", "content": prompt}],
#             temperature=0.1,
#         )
        
#         return chat_completion.choices[0].message.content.strip()


In [9]:
# def calculate_metrics(references, hypotheses):
#     """Calculate various MT evaluation metrics."""
#     # Initialize metrics
#     bleu = BLEU()
#     chrf = CHRF()
#     ter_metric = TER()
#     bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
#     wer = WordErrorRate()
#     cer = CharErrorRate()
#     rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
#     # Calculate BLEU and variants
#     bleu_score = bleu.corpus_score(hypotheses, [references]).score
#     chrf_score = chrf.corpus_score(hypotheses, [references]).score
#     ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    
#     # Calculate BERTScore
#     P, R, F1 = bert_scorer.score(hypotheses, references)
#     bert_score = F1.mean().item()
    
#     # Calculate WER and CER
#     wer_score = wer(hypotheses, references).item()
#     cer_score = cer(hypotheses, references).item()
    
#     # Calculate ROUGE scores
#     rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
#     for hyp, ref in zip(hypotheses, references):
#         scores = rouge_metrics.score(ref, hyp)
#         rouge_scores['rouge1'] += scores['rouge1'].fmeasure
#         rouge_scores['rouge2'] += scores['rouge2'].fmeasure
#         rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
#     for key in rouge_scores:
#         rouge_scores[key] /= len(hypotheses)

#     return {
#         "BLEU": bleu_score,
#         "chrF": chrf_score,
#         "TER": ter_score,
#         "BERTScore": bert_score,
#         "WER": wer_score,
#         "CER": cer_score,
#         "ROUGE-1": rouge_scores['rouge1'],
#         "ROUGE-2": rouge_scores['rouge2'],
#         "ROUGE-L": rouge_scores['rougeL']
#     }

In [10]:
# def evaluate_models(dataset, source_lang, target_lang):
#     results = {}
#     for model_name, model_info in MODELS.items():
#         print(f"\nEvaluating {model_name} ({model_info['provider']})...")
        
#         translations = []
#         references = []
#         source_texts = []
        
#         for example in tqdm(dataset):
#             source_text = example['translation'][source_lang]
#             reference = example['translation'][target_lang]
            
#             try:
#                 translation = translate_text(source_text, model_name, source_lang, target_lang)
#                 source_texts.append(source_text)
#                 translations.append(translation)
#                 references.append(reference)
#             except Exception as e:
#                 print(f"Error with {model_name} on text: {str(e)}")
#                 continue
        
#         if translations:
#             # Only calculate metrics if we have translations
#             # Calculate all metrics
#             metrics = calculate_metrics(references, translations)
#             results[f"{model_name} ({model_info['provider']})"] = metrics
            
#             # Save translations and source texts for review
#             pd.DataFrame({
#                 'Source': source_texts,
#                 'Reference': references,
#                 'Translation': translations
#             }).to_csv(f'translations_{model_name}_{source_lang}-{target_lang}.csv', index=False)
    
#     return pd.DataFrame(results).T


In [11]:
# import matplotlib.pyplot as plt
# import seaborn as sns

In [12]:
# def visualize_results(results, pair_name):
#     """Create visualizations for the evaluation results."""
    
#     plt.figure(figsize=(20, 10))
#     sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
#     plt.title(f'Translation Metrics Comparison for {pair_name}')
#     plt.ylabel('Models')
#     plt.xlabel('Metrics')
#     plt.xticks(rotation=45)
#     plt.yticks(rotation=0)
#     plt.tight_layout()
#     plt.savefig(f'mt_evaluation_heatmap_{pair_name}.png')
#     plt.close()

In [13]:
# def main():
#     # Language pairs to evaluate
#     language_pairs = [
#     ("cs-en", "Czech-English"),
#     ("de-en", "German-English"),
#     ("fi-en", "Finnish-English"),
#     ("fr-de", "French-German"),
#     ("gu-en", "Gujarati-English"),
#     ("kk-en", "Kazakh-English"),
#     ("lt-en", "Lithuanian-English"),
#     ("ru-en", "Russian-English"),
#     ("zh-en", "Chinese-English")
#     ]
    
#     all_results = {}
    
#     for pair_code, pair_name in language_pairs:
#         print(f"\nEvaluating {pair_name} translations...")
        
#         # Load dataset
#         dataset = load_translation_data(pair_code, num_samples=100)
        
#         # Get source and target language codes
#         source_lang, target_lang = pair_code.split("-")
        
#         # Evaluate models
#         results = evaluate_models(dataset, source_lang, target_lang)
        
#         # Save results
#         results.to_csv(f"mt_evaluation_results_{pair_code}.csv")
        
#         # Create visualizations
#         visualize_results(results, pair_name)
        
#         # Store results
#         all_results[pair_name] = results
        
#         # Print results
#         print(f"\nResults for {pair_name}:")
#         print(results)
    
#     # Create combined visualization
# #     plt.figure(figsize=(25, 12))
# #     for idx, (pair_name, results) in enumerate(all_results.items()):
# #         plt.subplot(1, 2, idx+1)
# #         sns.heatmap(results, annot=True, cmap='YlOrRd', fmt='.3f')
# #         plt.title(f'Results for {pair_name}')
# #         plt.ylabel('Models')
# #         plt.xlabel('Metrics')
# #         plt.xticks(rotation=45)
# #         plt.yticks(rotation=0)
# #     plt.tight_layout()
# #     plt.savefig('combined_results.png')
# #     plt.close()

# if __name__ == "__main__":
#     main()

In [14]:
# from transformers import AutoModelForCausalLM, AutoTokenizer

# # Model and tokenizer names
# model_name = "microsoft/Phi-3.5-mini-instruct"

# # Load the model and tokenizer
# model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# # Function to generate text with long context
# def generate_text_with_long_context(prompt, max_length=100):
#     # Tokenize the prompt and ensure it fits within the context length
#     inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=128000)
#     outputs = model.generate(**inputs, max_length=max_length)
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# # Example usage with a longer prompt
# long_prompt = """
# What are you and what can you do?
# """
# generated_text = generate_text_with_long_context(long_prompt)
# print(generated_text)

In [15]:
# !pip install sacrebleu bert-score torchmetrics nltk rouge-score datasets transformers groq pandas tqdm matplotlib seaborn

In [16]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from datasets import load_dataset
# from sacrebleu import corpus_bleu

# # Model and tokenizer names
# model_name = "microsoft/Phi-3.5-mini-instruct"

# # Load the model and tokenizer
# model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# # Load the WMT19 dataset
# dataset = load_dataset("wmt19", "de-en")

# def preprocess_example(example):
#     source_text = example["translation"]["de"]
#     target_text = example["translation"]["en"]
#     input_prompt = f"<|system|>\nYou are a translator.\n<|end|>\n<|user|>\nTranslate the following text from English to German: {source_text}\n<|end|>\n<|assistant|>"
#     return {"input_prompt": input_prompt, "target_text": target_text}

# # Preprocess the dataset
# dataset = dataset.map(preprocess_example, batched=False)

# def generate_translation(example):
#     input_prompt = example["input_prompt"]
#     inputs = tokenizer(input_prompt, return_tensors="pt")
#     outputs = model.generate(**inputs, max_length=100)
#     generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return {"generated_text": generated_text}

# # Generate translations
# dataset = dataset.map(generate_translation, batched=True, batch_size=8)

# def evaluate_translations(dataset):
#     references = [example["target_text"] for example in dataset]
#     hypotheses = [example["generated_text"] for example in dataset]
#     bleu_score = corpus_bleu(hypotheses, [references], force=True)
#     return bleu_score

# # Evaluate the translations
# bleu_score = evaluate_translations(dataset["test"])
# print(f"BLEU Score: {bleu_score.score}")

In [17]:
# !pip install transformers datasets torch torchvision sacrebleu rouge-score
# !pip install rouge
# # Uninstall the previous installed nltk library
# !pip install -U nltk

# # This upgraded nltkto version 3.5 in which meteor_score is there.
# # !pip install nltk==3.5
# # !python -m nltk.downloader popular

In [18]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
# from datasets import load_dataset
# from sacrebleu import corpus_bleu
# # from nltk.translate import meteor
# # from nltk import word_tokenize
# from rouge import Rouge
# import torch
# from torch.nn import DataParallel

# # Check if GPUs are available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Model and tokenizer names
# model_name = "microsoft/Phi-3.5-mini-instruct"

# # Load the model and tokenizer
# model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# # Move the model to the GPU and use half precision
# if torch.cuda.is_available():
#     model.to(device)
#     model = model.half()
#     # Wrap the model with DataParallel
#     model = DataParallel(model, device_ids=[0, 1])

# # Load the WMT19 dataset
# dataset = load_dataset("wmt19", "de-en")

# def preprocess_example(example):
#     source_text = example["translation"]["de"]
#     target_text = example["translation"]["en"]
#     input_prompt = f"<|system|>\nYou are a translator.\n<|end|>\n<|user|>\nTranslate the following text from English to German: {source_text}\n<|end|>\n<|assistant|>"
#     return {"input_prompt": input_prompt, "target_text": target_text}

# # Preprocess the dataset
# dataset = dataset.map(preprocess_example, batched=False)

# def generate_translation(examples):
#     input_prompts = [example["input_prompt"] for example in examples]
#     inputs = tokenizer(input_prompts, return_tensors="pt", padding=True, truncation=True)
#     if torch.cuda.is_available():
#         inputs = {k: v.to(device) for k, v in inputs.items()}
#     outputs = model.generate(**inputs, max_length=100)
#     generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
#     return [{"generated_text": text} for text in generated_texts]

# # Generate translations in batches
# batch_size = 8
# dataset = dataset.map(generate_translation, batched=True, batch_size=batch_size)

# def evaluate_translations(dataset):
#     references = [example["target_text"] for example in dataset]
#     hypotheses = [example["generated_text"] for example in dataset]

#     # BLEU Score
#     bleu_score = corpus_bleu(hypotheses, [references], force=True)
#     print(f"BLEU Score: {bleu_score.score}")

#     # METEOR Score
# #     meteor_scores = [meteor([ref], hyp) for ref, hyp in zip(references, hypotheses)]
# #     meteor_score_avg = sum(meteor_scores) / len(meteor_scores)
# #     print(f"METEOR Score: {meteor_score_avg}")

#     # ROUGE Scores
#     rouge = Rouge()
#     rouge_scores = rouge.get_scores(hypotheses, references, avg=True)
#     print(f"ROUGE-1 Score: {rouge_scores['rouge-1']['f']}")
#     print(f"ROUGE-2 Score: {rouge_scores['rouge-2']['f']}")
#     print(f"ROUGE-L Score: {rouge_scores['rouge-l']['f']}")

# # Evaluate the translations
# evaluate_translations(dataset["test"])