# allenai/OLMo Models Machine Translation Performance

In [1]:
!pip install numpy pandas datasets sacrebleu bert_score torchmetrics rouge_score transformers tqdm matplotlib seaborn torch

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Building wheels for collected packages: roug

## allenai/OLMo-1B-0724-hf

In [2]:
import numpy as np
import pandas as pd
from datasets import load_dataset
from sacrebleu.metrics import BLEU, CHRF, TER
from bert_score import BERTScorer
from torchmetrics.text import TranslationEditRate, WordErrorRate, CharErrorRate
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import torch

warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define models
model_names = ["allenai/OLMo-1B-0724-hf"]

def load_translation_data(language_pair, num_samples=100):
    """Load dataset for specified language pair."""
    try:
        dataset = load_dataset("wmt19", language_pair, split="validation")
    except ValueError:
        dataset = load_dataset("wmt19", language_pair, split="train")
    return dataset.select(range(min(num_samples, len(dataset))))

def translate_text(text, source_lang, target_lang, model_name, tokenizer, model):
    """Translate text using the specified model."""
    prompt = f"Translate the following text from {source_lang} to {target_lang}: {text}"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=256, truncation=True).to(device)
    
    # Generate translation with no gradients
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256)
    return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()

def calculate_metrics(references, hypotheses):
    """Calculate various MT evaluation metrics."""
    bleu = BLEU()
    chrf = CHRF()
    ter_metric = TER()
    bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    wer = WordErrorRate()
    cer = CharErrorRate()
    rouge_metrics = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    
    bleu_score = bleu.corpus_score(hypotheses, [references]).score
    chrf_score = chrf.corpus_score(hypotheses, [references]).score
    ter_score = ter_metric.corpus_score(hypotheses, [references]).score
    P, R, F1 = bert_scorer.score(hypotheses, references)
    bert_score = F1.mean().item()
    wer_score = wer(hypotheses, references).item()
    cer_score = cer(hypotheses, references).item()
    rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}
    
    for hyp, ref in zip(hypotheses, references):
        scores = rouge_metrics.score(ref, hyp)
        rouge_scores['rouge1'] += scores['rouge1'].fmeasure
        rouge_scores['rouge2'] += scores['rouge2'].fmeasure
        rouge_scores['rougeL'] += scores['rougeL'].fmeasure
    
    for key in rouge_scores:
        rouge_scores[key] /= len(hypotheses)

    return {
        "BLEU": bleu_score,
        "chrF": chrf_score,
        "TER": ter_score,
        "BERTScore": bert_score,
        "WER": wer_score,
        "CER": cer_score,
        "ROUGE-1": rouge_scores['rouge1'],
        "ROUGE-2": rouge_scores['rouge2'],
        "ROUGE-L": rouge_scores['rougeL']
    }

def plot_confusion_matrix(references, translations, model_name, pair_code):
    """Generate and save a confusion matrix."""
    matrix = np.zeros((len(references), len(translations)))
    for i, ref in enumerate(references):
        for j, hyp in enumerate(translations):
            matrix[i, j] = len(set(ref.split()) & set(hyp.split())) / len(set(ref.split()) | set(hyp.split()))
    
    plt.figure(figsize=(12, 8))
    sns.heatmap(matrix, annot=False, cmap="coolwarm", cbar=True)
    plt.title(f"Confusion Matrix for {model_name} ({pair_code})")
    plt.xlabel("Hypotheses")
    plt.ylabel("References")
    plt.tight_layout()
    plt.savefig(f"confusion_matrix_{model_name}_{pair_code}.png")
    plt.close()

def evaluate_model(dataset, source_lang, target_lang, model_name):
    """Evaluate a specific model on the dataset."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    
    translations = []
    references = []
    source_texts = []
    
    for example in tqdm(dataset, desc=f"Translating with {model_name}"):
        source_text = example['translation'][source_lang]
        reference = example['translation'][target_lang]
        
        try:
            translation = translate_text(source_text, source_lang, target_lang, model_name, tokenizer, model)
            source_texts.append(source_text)
            translations.append(translation)
            references.append(reference)
        except Exception as e:
            print(f"Error during translation with {model_name}: {str(e)}")
            continue

    # Clear GPU memory after processing
    model.to("cpu")
    torch.cuda.empty_cache()

    if translations:
        metrics = calculate_metrics(references, translations)
        return source_texts, references, translations, metrics
    return [], [], [], {}

def main():
    language_pairs = [
        ("cs-en", "Czech-English"),
        ("de-en", "German-English"),
        ("fi-en", "Finnish-English"),
        ("fr-de", "French-German"),
        ("gu-en", "Gujarati-English"),
        ("kk-en", "Kazakh-English"),
        ("lt-en", "Lithuanian-English"),
        ("ru-en", "Russian-English"),
        ("zh-en", "Chinese-English")
    ]
    
    for pair_code, pair_name in language_pairs:
        print(f"\nEvaluating {pair_name} translations...")
        dataset = load_translation_data(pair_code, num_samples=100)
        source_lang, target_lang = pair_code.split("-")
        
        for model_name in model_names:
            # Original Direction
            print(f"Evaluating {model_name} in original direction ({source_lang} -> {target_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, source_lang, target_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_original.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_original.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_original")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in original direction:")
            print(results_df)

            # Reverse Direction
            print(f"Evaluating {model_name} in reverse direction ({target_lang} -> {source_lang})...")
            source_texts, references, translations, metrics = evaluate_model(dataset, target_lang, source_lang, model_name)
            
            # Save metrics
            results_df = pd.DataFrame([metrics], index=[model_name.split("/")[-1]])
            results_df.to_csv(f"mt_evaluation_results_{model_name.split('/')[-1]}_{pair_code}_reverse.csv")
            
            # Save translations for review
            translations_df = pd.DataFrame({
                "Source": source_texts,
                "Reference": references,
                "Translation": translations
            })
            translations_df.to_csv(f"translations_{model_name.split('/')[-1]}_{pair_code}_reverse.csv", index=False)
            
            # Plot and save confusion matrix
            # plot_confusion_matrix(references, translations, model_name.split("/")[-1], f"{pair_code}_reverse")
            
            # Print metrics
            print(f"\nResults for {model_name} ({pair_name}) in reverse direction:")
            print(results_df)



if __name__ == "__main__":
    main()

Using device: cuda

Evaluating Czech-English translations...


README.md:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/216M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/473k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7270695 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2983 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (cs -> en)...


tokenizer_config.json:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.71G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

Translating with allenai/OLMo-1B-0724-hf:   0%|          | 0/100 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:34<00:00,  1.55s/it]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Czech-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  2.486933  27.683637  387.298748  -0.146598  3.899225   

                      CER   ROUGE-1  ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  3.449525  0.169945  0.05894  0.139041  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> cs)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:35<00:00,  1.55s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Czech-English) in reverse direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  0.351135  15.622599  473.299928  -0.349635  4.735863   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  3.718557  0.072759  0.010805  0.062234  

Evaluating German-English translations...


Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

train-00000-of-00016.parquet:   0%|          | 0.00/384M [00:00<?, ?B/s]

train-00001-of-00016.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00002-of-00016.parquet:   0%|          | 0.00/102M [00:00<?, ?B/s]

train-00003-of-00016.parquet:   0%|          | 0.00/176M [00:00<?, ?B/s]

train-00004-of-00016.parquet:   0%|          | 0.00/282M [00:00<?, ?B/s]

train-00005-of-00016.parquet:   0%|          | 0.00/183M [00:00<?, ?B/s]

train-00006-of-00016.parquet:   0%|          | 0.00/251M [00:00<?, ?B/s]

train-00007-of-00016.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

train-00008-of-00016.parquet:   0%|          | 0.00/232M [00:00<?, ?B/s]

train-00009-of-00016.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

train-00010-of-00016.parquet:   0%|          | 0.00/196M [00:00<?, ?B/s]

train-00011-of-00016.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

train-00012-of-00016.parquet:   0%|          | 0.00/401M [00:00<?, ?B/s]

train-00013-of-00016.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00014-of-00016.parquet:   0%|          | 0.00/305M [00:00<?, ?B/s]

train-00015-of-00016.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/495k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34782245 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2998 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (de -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:28<00:00,  1.49s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (German-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  6.755226  35.968229  365.649606   0.058252  3.698819   

                     CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  3.55904  0.280643  0.145508  0.241056  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> de)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [01:48<00:00,  1.09s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (German-English) in reverse direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  2.391224  27.825015  362.863662  -0.088172  3.643468   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  2.536648  0.147625  0.060157  0.130818  

Evaluating Finnish-English translations...


train-00000-of-00003.parquet:   0%|          | 0.00/350M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/177M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/445k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6587448 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (fi -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:24<00:00,  1.44s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Finnish-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  4.246349  31.199937  326.346053  -0.101986  3.298484   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  3.301914  0.240526  0.094274  0.200092  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> fi)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [01:56<00:00,  1.16s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Finnish-English) in reverse direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  1.262637  28.859007  422.949527  -0.056775  4.240536   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  2.490413  0.132593  0.046267  0.120902  

Evaluating French-German translations...


train-00000-of-00005.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/163M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/214M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/272M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/263k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9824476 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1512 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (fr -> de)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [01:46<00:00,  1.06s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (French-German) in original direction:
                     BLEU       chrF         TER  BERTScore      WER  \
OLMo-1B-0724-hf  3.759546  26.291849  295.893502  -0.215097  2.96435   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  2.164786  0.077843  0.036908  0.073699  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (de -> fr)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:28<00:00,  1.48s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (French-German) in reverse direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  3.115089  26.072657  359.982525  -0.154834  3.604194   

                      CER   ROUGE-1   ROUGE-2  ROUGE-L  
OLMo-1B-0724-hf  3.174768  0.102594  0.040717  0.08944  

Evaluating Gujarati-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/361k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/370k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11670 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1998 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (gu -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf:   2%|▏         | 2/100 [00:03<02:19,  1.43s/it]

Error during translation with allenai/OLMo-1B-0724-hf: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with allenai/OLMo-1B-0724-hf:   6%|▌         | 6/100 [00:05<01:15,  1.24it/s]

Error during translation with allenai/OLMo-1B-0724-hf: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with allenai/OLMo-1B-0724-hf:  39%|███▉      | 39/100 [00:53<00:53,  1.15it/s]

Error during translation with allenai/OLMo-1B-0724-hf: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with allenai/OLMo-1B-0724-hf:  42%|████▏     | 42/100 [00:54<00:32,  1.80it/s]

Error during translation with allenai/OLMo-1B-0724-hf: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with allenai/OLMo-1B-0724-hf:  78%|███████▊  | 78/100 [01:31<00:25,  1.14s/it]

Error during translation with allenai/OLMo-1B-0724-hf: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [01:49<00:00,  1.09s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Gujarati-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  0.510326  16.808842  281.358189  -0.662484  2.828229   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  2.472316  0.146082  0.019478  0.123318  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> gu)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:06<00:00,  1.27s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Gujarati-English) in reverse direction:
                     BLEU     chrF         TER  BERTScore       WER       CER  \
OLMo-1B-0724-hf  0.066933  0.50359  429.201102  -0.778573  4.292011  3.865274   

                  ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  0.013765  0.002433  0.013239  

Evaluating Kazakh-English translations...


train-00000-of-00001.parquet:   0%|          | 0.00/5.30M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/462k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/126583 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2066 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (kk -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf:  13%|█▎        | 13/100 [00:19<02:15,  1.55s/it]

Error during translation with allenai/OLMo-1B-0724-hf: Input length of input_ids is 256, but `max_length` is set to 256. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.


Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:06<00:00,  1.26s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Kazakh-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  0.348725  16.008753  277.613321  -0.488635  2.791859   

                      CER   ROUGE-1   ROUGE-2  ROUGE-L  
OLMo-1B-0724-hf  2.549441  0.123386  0.012904   0.0998  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> kk)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [01:44<00:00,  1.04s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Kazakh-English) in reverse direction:
                     BLEU      chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  0.044847  0.475248  349.773243  -0.700356  3.497732   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  2.658183  0.006781  0.001304  0.006558  

Evaluating Lithuanian-English translations...


train-00000-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/351k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2344893 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (lt -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:31<00:00,  1.52s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Lithuanian-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  1.003353  22.576337  350.893744  -0.252745  3.527309   

                      CER  ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  3.307166  0.12904  0.028802  0.099732  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> lt)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:37<00:00,  1.57s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Lithuanian-English) in reverse direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  0.196464  16.452147  593.961039  -0.476847  5.941558   

                     CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  3.94131  0.043274  0.011044  0.041283  

Evaluating Russian-English translations...


Resolving data files:   0%|          | 0/28 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/28 [00:00<?, ?files/s]

train-00000-of-00028.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

train-00001-of-00028.parquet:   0%|          | 0.00/150M [00:00<?, ?B/s]

train-00002-of-00028.parquet:   0%|          | 0.00/130M [00:00<?, ?B/s]

train-00003-of-00028.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

train-00004-of-00028.parquet:   0%|          | 0.00/152M [00:00<?, ?B/s]

train-00005-of-00028.parquet:   0%|          | 0.00/158M [00:00<?, ?B/s]

train-00006-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00007-of-00028.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

train-00008-of-00028.parquet:   0%|          | 0.00/101M [00:00<?, ?B/s]

train-00009-of-00028.parquet:   0%|          | 0.00/245M [00:00<?, ?B/s]

train-00010-of-00028.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

train-00011-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00012-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00013-of-00028.parquet:   0%|          | 0.00/254M [00:00<?, ?B/s]

train-00014-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00015-of-00028.parquet:   0%|          | 0.00/258M [00:00<?, ?B/s]

train-00016-of-00028.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

train-00017-of-00028.parquet:   0%|          | 0.00/262M [00:00<?, ?B/s]

train-00018-of-00028.parquet:   0%|          | 0.00/266M [00:00<?, ?B/s]

train-00019-of-00028.parquet:   0%|          | 0.00/261M [00:00<?, ?B/s]

train-00020-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00021-of-00028.parquet:   0%|          | 0.00/268M [00:00<?, ?B/s]

train-00022-of-00028.parquet:   0%|          | 0.00/270M [00:00<?, ?B/s]

train-00023-of-00028.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

train-00024-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00025-of-00028.parquet:   0%|          | 0.00/269M [00:00<?, ?B/s]

train-00026-of-00028.parquet:   0%|          | 0.00/275M [00:00<?, ?B/s]

train-00027-of-00028.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/611k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/37492126 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (ru -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [03:09<00:00,  1.90s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Russian-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  4.892023  32.049545  322.189097  -0.331952  3.276831   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  3.042578  0.333078  0.161208  0.286455  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> ru)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:22<00:00,  1.43s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Russian-English) in reverse direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  2.194523  20.228343  348.562127   0.084692  3.489962   

                      CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  2.711936  0.007901  0.001326  0.007901  

Evaluating Chinese-English translations...


train-00000-of-00013.parquet:   0%|          | 0.00/212M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/284M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/288M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/289M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/307M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/291M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/185M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/342M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/263M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/728k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25984574 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3981 [00:00<?, ? examples/s]

Evaluating allenai/OLMo-1B-0724-hf in original direction (zh -> en)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:05<00:00,  1.26s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Chinese-English) in original direction:
                     BLEU       chrF         TER  BERTScore       WER  \
OLMo-1B-0724-hf  9.159839  35.853844  136.955128  -0.204009  1.415064   

                     CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  1.33956  0.389634  0.172463  0.320696  
Evaluating allenai/OLMo-1B-0724-hf in reverse direction (en -> zh)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Translating with allenai/OLMo-1B-0724-hf: 100%|██████████| 100/100 [02:54<00:00,  1.75s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Results for allenai/OLMo-1B-0724-hf (Chinese-English) in reverse direction:
                    BLEU      chrF          TER  BERTScore        WER  \
OLMo-1B-0724-hf  0.09082  5.206652  4616.783217  -0.094754  46.167831   

                     CER   ROUGE-1   ROUGE-2   ROUGE-L  
OLMo-1B-0724-hf  7.95368  0.023033  0.008473  0.023033  
