# Translation Evaluation
In this Notebook:

0. Setup
1. Translation Speed
2. BLEU Score
3. COMET
4. Terminology Adherence Evaluation

## 0. Setup

In [None]:
import os
import numpy as np
import pandas as pd

### Declaration and definition of variables

In [None]:
# for Terminology Adherence Evaluation
# path to the folder containing the reference (human) translation files
# (the folder is listed with os.listdir, so this must be a directory, not a single file)
reference_path = "/reference_files"
# path to the folder containing the llm generated translation files
# (also listed with os.listdir)
prediction_path = "/prediction_files"
# path to file containing the relevant terminology (csv), one column per file;
# the part of each column header before the first "_" is used as lookup key and
# must equal the part of the reference file name before its first "_"
terminology_file = "terminology_DE-EN.csv"

# for Translation Speed Evaluation
# path to file containing the translation times (csv); the mean of every column is reported
times_file = "DE-EN_translation_times.csv"

# for naming the files
# model name, used as prefix of the terminology adherence result file
model = "gpt-4o"
# target language, used in the "DE-<language>" part of the result file name
language = "EN"

# for concatenating all files (necessary for BLEU and COMET evaluation)
# path to folder containing all files; its last path component also names
# the concatenated output file ("<folder name>.txt")
source_path = "/source_files"

### General Functions

In [None]:
# function for creating the predictions list (llm generated translations)
def create_predictions(prediction_file):
  """Read the llm generated translations from a text file.

  Args:
    prediction_file: path to a UTF-8 encoded text file with one
      translation per line.

  Returns:
    A list with one string per line of the file, each stripped of
    leading and trailing whitespace (including the line break).
  """
  with open(prediction_file, "r", encoding="utf-8") as handle:
    # iterating the handle yields the lines one by one
    return list(map(str.strip, handle))

In [None]:
# concatenate all .txt files of source_path into one file (for BLEU and COMET)
# note: this cell is a plain script, it runs as soon as the cell is executed

# the output file is named after the last component of source_path
# and is written relative to the current working directory
full_data = source_path.rsplit("/", 1)[-1] + ".txt"

with open(full_data, 'w', encoding="utf-8") as outfile:
    # sorted() fixes the order in which the files are appended
    for entry in sorted(os.listdir(source_path)):
        # every directory entry is printed, including the ones skipped below
        print(entry)
        if not entry.endswith('.txt'):
            continue
        with open(os.path.join(source_path, entry), 'r', encoding="utf-8") as infile:
            text = infile.read()
        # make every file end with exactly one line break
        text = text.rstrip("\n") + "\n"
        # Remove UTF-8 BOM if it exists
        if text.startswith('\ufeff'):
            text = text[1:]
        outfile.write(text)

## 1. Translation Speed

In [None]:
# @title
df = pd.read_csv(times_file)

print("Average translation speed per file:")
# column-wise mean (presumably one column per translated file)
per_file_mean = df.mean()
print(per_file_mean)

print("\nAverage translation speed over all files:")
# mean of the column means, i.e. every column has the same weight
print(per_file_mean.mean())

## 2. SacreBLEU



In [None]:
!pip install sacrebleu

In [None]:
# compute SacreBLEU score and compare to baseline
# en_reference.txt is the reference file; the files after -i are the system outputs to score
# (the first one, without-tag_4o.txt, presumably acts as the baseline - confirm in the sacreBLEU docs)
# -l de-en: language pair, -m bleu: metric to compute
# --paired-bs: presumably runs the paired bootstrap significance test against the baseline - confirm
!sacrebleu en_reference.txt -l de-en -i without-tag_4o.txt markdown.txt json.txt tbx_dct.txt tbx_dca.txt yaml.txt -m bleu --paired-bs

## 3. COMET

In [None]:
!pip install unbabel-comet

In [None]:
# compute COMET score
# -s: file with the source sentences, -r: file with the reference translations,
# -t: file with the translations to score
# NOTE(review): flag meanings as understood from the comet-score CLI - confirm against the installed version
!comet-score -s en_sources.txt -r en_reference.txt -t without-tag_4o.txt

In [None]:
# compare COMET scores
# -s: file with the source sentences, -r: file with the reference translations,
# -t: the translation files of all systems that are compared with each other
# NOTE(review): flag meanings as understood from the comet-compare CLI - confirm against the installed version
!comet-compare -s en_sources.txt -t without-tag_4o.txt markdown.txt json.txt tbx_dct.txt tbx_dca.txt yaml.txt -r en_reference.txt

## 4. Terminology Adherence Evaluation
This section computes the percentage of correctly translated terminology and generates the files used for the manual inspection of the evaluation.

In [None]:
# load the terminology table; df is reused below to build the terminology dictionary
df = pd.read_csv(terminology_file)
print("Terms per file:")
# count() reports the number of non-empty cells of every column
terms_per_file = df.count()
print(terms_per_file)

In [None]:
# build the terminology dictionary from the terminology csv:
# key   = part of the column header before the first "_"
# value = one list of terms per row; empty cells become an empty list,
#         all other cells are split at "|"
terminology = {}
for column, cells in df.items():
  file_key = column.split("_")[0]
  print(file_key)
  terminology[file_key] = [
    [] if pd.isna(cell) else cell.split("|")
    for cell in cells
  ]

In [None]:
# creating a txt file containing the terminology errors
def create_incorrect_file(terminology, predictions, reference_file):
  """Score the terminology adherence of one file and log the missed terms.

  Every sentence that has at least one expected term gets a score between
  0 and 1 (share of its terms that occur as substring in the prediction).
  The result is the mean of these sentence scores. Sentences with missing
  terms are written to "<column_name>_incorrectTerminology.txt" in the
  current working directory for manual inspection.

  Args:
    terminology: dict mapping a column name to a list with one list of
      terms per sentence (an empty list means "no terms in this sentence").
    predictions: list of translated sentences, aligned by position with
      the term lists.
    reference_file: file name whose part before the first "_" selects the
      terminology column and names the report file.

  Returns:
    A tuple (score, calculation): score is the summed sentence scores
    divided by the number of sentences that have terms, calculation is the
    string "<summed scores>/<number of sentences with terms>". If no
    sentence has terms, score is 0.0 and calculation is "0/0".

  Raises:
    KeyError: if terminology has no entry for the column name.
  """
  column_name = reference_file.split("_")[0]
  incorrect_file = f"{column_name}_incorrectTerminology.txt"
  sentence_terms = terminology[column_name]
  # the denominator counts ALL sentences with terms, so sentences without a
  # prediction (predictions shorter than the term list) count as score 0
  total_terms = sum(1 for terms in sentence_terms if terms)
  final_score = 0

  # explicit encoding: predictions may contain non-ASCII characters and the
  # platform default encoding is not guaranteed to be able to store them
  with open(incorrect_file, "w", encoding="utf-8") as file:
    for idx, (terms, prediction) in enumerate(zip(sentence_terms, predictions)):
      if not terms:
        continue
      term_count = len(terms)
      missing_terms = [term for term in terms if term not in prediction]
      missed_terms = len(missing_terms)
      # divide last: 1/term_count * n is inexact (e.g. 1/49 * 49 != 1.0)
      final_score += (term_count - missed_terms) / term_count
      if missed_terms:
        file.write(f"Sentence {idx+1} ({missed_terms} out of {term_count} terms missing):\nmissing terms: {missing_terms} \nprediction: {prediction}\n\n")

  calculation = f"{final_score}/{total_terms}"
  if total_terms == 0:
    # nothing to evaluate; avoid the division by zero
    return 0.0, calculation
  return final_score/total_terms, calculation

In [None]:
# evaluate every prediction file against the terminology of its reference file;
# both folders are listed in sorted order and paired by position
for pred_file, ref_file in zip(sorted(os.listdir(prediction_path)), sorted(os.listdir(reference_path))):
  # a pair belongs together if the last "_"-separated part of the prediction
  # file name (without extension) equals the first part of the reference file name
  pred_key = pred_file.split("_")[-1].split(".")[0]
  ref_key = ref_file.split("_")[0]
  if pred_key != ref_key:
    # report the skipped pair instead of dropping it silently
    print(f"Skipping {pred_file}: does not match reference file {ref_file}")
    continue
  predictions = create_predictions(os.path.join(prediction_path, pred_file))
  score, calculation = create_incorrect_file(terminology, predictions, ref_file)
  # append mode: the results of all files end up in one summary file
  with open(f"{model}_terminology_adherence_DE-{language}.txt", "a", encoding="utf-8") as file:
    file.write(f"{pred_file}: {calculation} = {score}\n")
  print(f"{pred_file}: {calculation} = {score}")