In [11]:
!pip install sacrebleu sacremoses



In [12]:
import time
from transformers import MarianMTModel, MarianTokenizer
import torch

# Fine-tuned en->ar Marian checkpoint; both tokenizer and weights are read from it.
model_name = "/kaggle/input/full-train-inshallah/opus-mt-tc-big-en-ar-finetuned-en-to-ar/checkpoint-14062"

# Run on CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the weights, move them to the chosen device and switch to eval mode.
model = MarianMTModel.from_pretrained(model_name).to(device).eval()

# Generation and batching settings shared by every translation cell below.
# NOTE(review): num_beams = 1 keeps a single hypothesis, i.e. presumably greedy
# decoding rather than beam search, in which case length_penalty has no
# effect - confirm against the transformers generation docs.
batch_size = 32
num_beams = 1
length_penalty = 1.0

def chunkify(lst, n):
    """Yield successive n-sized chunks from a list.

    Args:
        lst: Sequence to split; it must support ``len()`` and slicing.
        n: Maximum number of items per chunk; must be a positive integer.

    Yields:
        Consecutive slices of ``lst`` of length ``n``; the final slice may be
        shorter. Nothing is yielded for an empty ``lst``.

    Raises:
        ValueError: If ``n`` is not positive. Because this is a generator the
            error surfaces when iteration starts, not when it is called.
    """
    # A negative step would make range() empty and silently drop every item;
    # a zero step fails with an unrelated-looking range() error. Reject both.
    if n <= 0:
        raise ValueError(f"chunk size must be a positive integer, got {n!r}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

In [13]:
# Elapsed seconds per test set, filled in by the three translation cells
# (index 0, 1, 2 in the order they run). NaN marks "not measured yet": it still
# formats with ":.2f" in the results table but cannot be mistaken for a timing.
times = [float("nan")] * 3

In [14]:
input_file = "/kaggle/input/bleu-test/test.txt"
output_file = "/kaggle/working/tatoeba-translated.txt"

print("Starting batched translation of the sentences...")

# Read the source sentences, stripping whitespace and skipping blank lines.
# NOTE(review): skipping blank lines shifts every later hypothesis up by one,
# so the reference file must not contain matching blank lines - confirm.
with open(input_file, "r", encoding="utf-8") as infile:
    stripped_lines = (line.strip() for line in infile)
    sentences = [sentence for sentence in stripped_lines if sentence]

failed_batches = 0
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments.
start_time = time.perf_counter()
with open(output_file, "w", encoding="utf-8") as outfile:
    for batch_num, batch in enumerate(chunkify(sentences, batch_size), 1):
        try:
            encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model.generate(
                **encoded,
                num_beams=num_beams,
                length_penalty=length_penalty,
            )

            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            outfile.write('\n'.join(decoded) + '\n')

        except Exception as e:
            # Best effort: keep going, and write one placeholder per source
            # sentence so the output stays line-aligned with the input.
            failed_batches += 1
            print(f"Error in batch {batch_num}: {e}")
            outfile.write("Error in translation\n" * len(batch))

end_time = time.perf_counter()
times[0] = end_time - start_time
print(f"Translation completed in {(times[0]):.2f} seconds.")
if failed_batches:
    # Placeholder lines are scored like real translations, so make this visible.
    print(f"Warning: {failed_batches} batch(es) failed; placeholder lines in the output will distort the scores.")
print(f"Saved to: {output_file}")

# Release cached GPU memory. Guarded because the notebook also supports a CPU
# fallback, and ipc_collect() initialises CUDA, which fails on a CPU-only host.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

Starting batched translation of the sentences...
Translation completed in 74.71 seconds.
Saved to: /kaggle/working/tatoeba-translated.txt


In [15]:
input_file = "/kaggle/input/flores101-enar/eng.devtest"
output_file = "/kaggle/working/flores101-translated.txt"

print("Starting batched translation of the sentences...")

# Read the source sentences, stripping whitespace and skipping blank lines.
# NOTE(review): skipping blank lines shifts every later hypothesis up by one,
# so the reference file must not contain matching blank lines - confirm.
with open(input_file, "r", encoding="utf-8") as infile:
    stripped_lines = (line.strip() for line in infile)
    sentences = [sentence for sentence in stripped_lines if sentence]

failed_batches = 0
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments.
start_time = time.perf_counter()
with open(output_file, "w", encoding="utf-8") as outfile:
    for batch_num, batch in enumerate(chunkify(sentences, batch_size), 1):
        try:
            encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model.generate(
                **encoded,
                num_beams=num_beams,
                length_penalty=length_penalty,
            )

            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            outfile.write('\n'.join(decoded) + '\n')

        except Exception as e:
            # Best effort: keep going, and write one placeholder per source
            # sentence so the output stays line-aligned with the input.
            failed_batches += 1
            print(f"Error in batch {batch_num}: {e}")
            outfile.write("Error in translation\n" * len(batch))

end_time = time.perf_counter()
times[1] = end_time - start_time
print(f"Translation completed in {(times[1]):.2f} seconds.")
if failed_batches:
    # Placeholder lines are scored like real translations, so make this visible.
    print(f"Warning: {failed_batches} batch(es) failed; placeholder lines in the output will distort the scores.")
print(f"Saved to: {output_file}")

# Release cached GPU memory. Guarded because the notebook also supports a CPU
# fallback, and ipc_collect() initialises CUDA, which fails on a CPU-only host.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

Starting batched translation of the sentences...
Translation completed in 16.97 seconds.
Saved to: /kaggle/working/flores101-translated.txt


In [16]:
input_file = "/kaggle/input/tico19-enar/test.en"
output_file = "/kaggle/working/tico-translated.txt"

print("Starting batched translation of the sentences...")

# Read the source sentences, stripping whitespace and skipping blank lines.
# NOTE(review): skipping blank lines shifts every later hypothesis up by one,
# so the reference file must not contain matching blank lines - confirm.
with open(input_file, "r", encoding="utf-8") as infile:
    stripped_lines = (line.strip() for line in infile)
    sentences = [sentence for sentence in stripped_lines if sentence]

failed_batches = 0
# perf_counter() is monotonic, so the elapsed time cannot be skewed by
# wall-clock adjustments.
start_time = time.perf_counter()
with open(output_file, "w", encoding="utf-8") as outfile:
    for batch_num, batch in enumerate(chunkify(sentences, batch_size), 1):
        try:
            encoded = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
            encoded = {k: v.to(device) for k, v in encoded.items()}

            outputs = model.generate(
                **encoded,
                num_beams=num_beams,
                length_penalty=length_penalty,
            )

            decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            outfile.write('\n'.join(decoded) + '\n')

        except Exception as e:
            # Best effort: keep going, and write one placeholder per source
            # sentence so the output stays line-aligned with the input.
            failed_batches += 1
            print(f"Error in batch {batch_num}: {e}")
            outfile.write("Error in translation\n" * len(batch))

end_time = time.perf_counter()
times[2] = end_time - start_time
print(f"Translation completed in {(times[2]):.2f} seconds.")
if failed_batches:
    # Placeholder lines are scored like real translations, so make this visible.
    print(f"Warning: {failed_batches} batch(es) failed; placeholder lines in the output will distort the scores.")
print(f"Saved to: {output_file}")

# Release cached GPU memory. Guarded because the notebook also supports a CPU
# fallback, and ipc_collect() initialises CUDA, which fails on a CPU-only host.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

Starting batched translation of the sentences...
Translation completed in 62.85 seconds.
Saved to: /kaggle/working/tico-translated.txt


In [17]:
import subprocess
import json
from tabulate import tabulate

# Test sets to score: display name, reference file and the hypothesis file
# written by the corresponding translation cell. The order must match the
# order of the entries in `times`, which the results table indexes by position.
tests = [
    {"name": name, "ref": ref, "hyp": hyp}
    for name, ref, hyp in (
        (
            "tatoeba-test-v2021-08-07",
            "/kaggle/input/bleu-test/ref1.txt",
            "/kaggle/working/tatoeba-translated.txt",
        ),
        (
            "flores101-devtest",
            "/kaggle/input/flores101-enar/ara.devtest",
            "/kaggle/working/flores101-translated.txt",
        ),
        (
            "tico19-test",
            "/kaggle/input/tico19-enar/ref.ar",
            "/kaggle/working/tico-translated.txt",
        ),
    )
]

# Function to run sacrebleu and extract metrics
def get_scores_json(ref, hyp):
    """Score a hypothesis file against a reference with the sacrebleu CLI.

    Runs ``sacrebleu <ref> -i <hyp> -m bleu chrf ter`` and parses the JSON the
    command prints on stdout.

    Args:
        ref: Path to the reference file.
        hyp: Path to the hypothesis (system output) file.

    Returns:
        A dict mapping each reported metric's ``name`` to its ``score``, or
        ``None`` if sacrebleu could not be started, exited with a non-zero
        status, or printed output that could not be parsed. The reason is
        printed in each of those cases.
    """
    command = ["sacrebleu", ref, "-i", hyp, "-m", "bleu", "chrf", "ter"]
    try:
        result = subprocess.run(command, capture_output=True, text=True)
    except OSError as e:
        # Raised when the sacrebleu executable is missing or cannot be run;
        # report it like any other scoring failure instead of crashing.
        print(f"Error: {e}")
        return None
    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        return None
    try:
        data = json.loads(result.stdout)
        # Tolerate a single JSON object as well as a list of them.
        if isinstance(data, dict):
            data = [data]
        return {entry["name"]: entry["score"] for entry in data}
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError; KeyError/TypeError cover
        # well-formed JSON that does not have the expected structure.
        print(f"Failed to parse JSON: {e}")
        return None

# Build table: one row per test set with its three metrics and translation time.
full_table = []

for i, test in enumerate(tests):
    # get_scores_json returns None on failure; treat that as "no metrics".
    scores = get_scores_json(test["ref"], test["hyp"]) or {}
    # A metric that was not reported is shown as N/A rather than as a
    # real-looking 0.0 score.
    metric_cells = [
        f"{scores[key]:.1f}" if key in scores else "N/A"
        for key in ("BLEU", "chrF2", "TER")
    ]
    # times[i] is the elapsed time recorded by the i-th translation cell.
    full_table.append([test["name"], *metric_cells, f"{(times[i]):.2f}s"])

# Print result
print(f"Model:{model_name}")
headers = ["testset", "BLEU", "chr-F", "TER", "Time"]
print(tabulate(full_table, headers=headers, tablefmt="tsv"))

Model:/kaggle/input/full-train-inshallah/opus-mt-tc-big-en-ar-finetuned-en-to-ar/checkpoint-14062
testset                 	  BLEU	  chr-F	  TER	Time
tatoeba-test-v2021-08-07	  23.4	   50.7	 62.5	74.71s
flores101-devtest       	  29.4	   60.1	 54.9	16.97s
tico19-test             	  30.9	   59.9	 56.1	62.85s
