### 1. Dependencies

In [65]:
!pip install -quiet transformers datasets evaluate
!pip install unbabel-comet

In [8]:
import torch
import transformers
import pandas as pd
from tqdm import tqdm
import datasets
from transformers.pipelines.pt_utils import KeyDataset
import evaluate

In [19]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

### 2. Zero-shot Translation

In [13]:
## data prep
# read the tab-separated test set into a pandas DataFrame
df = pd.read_csv("./data/short_test_eng_fre.tsv", sep="\t")
# convert the data into hf dataset format
# (the translation loop below reads its inputs from this Dataset via KeyDataset)
dataset = datasets.Dataset.from_pandas(df)

In [14]:
# show the dataset's columns and number of rows
print(dataset)

Dataset({
    features: ['English', 'French'],
    num_rows: 85
})


#### 2.1 Supervised Translation Model - NLLB

In [18]:
# Hub id of the translation checkpoint used as the supervised baseline.
model_name = "facebook/nllb-200-distilled-600M" # Read more about the model here: https://huggingface.co/facebook/nllb-200-distilled-600M

# model specific parameters: the pipeline task name plus the source and target
# language codes that are handed to `transformers.pipeline` when the translator is built
task="translation"
src_lang = "eng_Latn"
tgt_lang = "fra_Latn"


In [24]:
# Build the translation pipeline from the settings chosen above.
translator = transformers.pipeline(
    task,
    model=model_name,
    src_lang=src_lang,
    tgt_lang=tgt_lang,
    device=device,
)

In [27]:
# Inputs are passed to the model in batches of 4; from every result only the
# translated text of its first entry is kept.
outputs = [
    result[0]['translation_text']
    for result in translator(KeyDataset(dataset, "English"), batch_size=4, truncation="only_first")
]

In [31]:
# Store the predictions in a column named after the model (the part of the
# hub id after the last "/") and write the frame to a CSV with the same name.
run_name = model_name.split("/")[-1]
df[run_name] = outputs
df.to_csv(f"data/{run_name}.csv", index=False)

#### 2.2 Translation with Instruction Following Models - Exercise

1. Do the translation with any instruction-following model (ideally of a size comparable to NLLB's).
2. Run the evaluation.
3. Report your results and compare them against those of the NLLB baseline.

### 3. Supervised Evaluation

In [9]:
# load the dataframe with predictions: the CSV written in section 2.1, holding
# the NLLB translations next to the English sources and French references
df = pd.read_csv("data/nllb-200-distilled-600M.csv")

In [10]:
# peek at the first rows: source, reference and model translation side by side
df.head()

Unnamed: 0,English,French,nllb-200-distilled-600M
0,A guy works on a building.,Un gars travaille sur un bâtiment.,Un type travaille sur un bâtiment.
1,Three people sit in a cave.,Trois personnes sont assises dans une grotte.,Trois personnes sont dans une cave.
2,People standing outside of a building.,Des gens debout devant un bâtiment.,Des gens debout à l'extérieur d'un bâtiment.
3,A man cutting branches of trees.,Un homme coupant les branches d'un arbre.,Un homme coupe des branches d'arbres.
4,A child is splashing in the water,Un enfant éclabousse dans l'eau,Un enfant est en train de sauter dans l'eau


#### 3.1 BLEU Score
1. Bilingual Evaluation Understudy.
2. Quality is considered to be the correspondence between a machine’s output and that of a human: “the closer a machine translation is to a professional human translation, the better it is”.
3. The BLEU score is computed as follows:

![Alt text](data/assets/bleu.png)

4. In summary, the BLEU score combines precision scores for different n-gram sizes and applies a brevity penalty to compute a final score that represents the quality of the machine-translated text relative to the reference translations.



In [34]:
# load the BLEU metric from the hf evaluate package
bleu = evaluate.load("bleu")

Downloading builder script: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.94k/5.94k [00:00<00:00, 11.0MB/s]
Downloading extra modules: 4.07kB [00:00, 4.41MB/s]                                                                                                                                                                
Downloading extra modules: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.34k/3.34k [00:00<00:00, 8.10MB/s]


In [43]:
# Score the NLLB translations against the French references with BLEU.
# max_order: Maximum n-gram order to use when computing BLEU score. Defaults to 4.
bleu_results = bleu.compute(
    predictions=df['nllb-200-distilled-600M'],
    references=df['French'],
    max_order=4,
)

In [44]:
# Display the BLEU scores. The compute cell stores them in `bleu_results`;
# the bare name `results` is never defined in this notebook.
bleu_results

{'bleu': 0.4065892614949111,
 'precisions': [0.6751412429378532,
  0.47191011235955055,
  0.34386617100371747,
  0.24944812362030905],
 'brevity_penalty': 1.0,
 'length_ratio': 1.0260869565217392,
 'translation_length': 708,
 'reference_length': 690}

#### 3.2 METEOR
1. Metric for Evaluation of Translation with Explicit Ordering
2. METEOR can be formulated as:
3. ![Alt text](data/assets/meteor.png)


In [46]:
# load the METEOR metric from the hf evaluate package
meteor = evaluate.load('meteor')

Downloading builder script: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6.93k/6.93k [00:00<00:00, 11.4MB/s]
[nltk_data] Downloading package wordnet to /home/abdul/nltk_data...


In [47]:
# score the NLLB translations against the French references with METEOR
meteor_results = meteor.compute(predictions=df['nllb-200-distilled-600M'], references=df['French'])

In [48]:
# show the METEOR result dictionary
print(meteor_results)

{'meteor': 0.6448424005309302}


#### 3.2 ChrF and ChrF++
1. Character n-gram F-score.
2. ChrF and ChrF++ are two MT evaluation metrics that use the F-score statistic for character n-gram matches. ChrF++ additionally includes word n-grams, which correlate more strongly with direct assessment.
3. ChrF++ is ChrF with a word order of 2 (or 3). 

In [49]:
# load the ChrF metric; the same object is reused below for ChrF++
chrf = evaluate.load("chrf")

In [51]:
# plain ChrF: no word_order is passed, and the printed result reports word_order 0
chrf_results = chrf.compute(predictions=df['nllb-200-distilled-600M'], references=df['French'])

In [52]:
# show the ChrF score together with the settings it was computed with
print(chrf_results)

{'score': 64.03435195758375, 'char_order': 6, 'word_order': 0, 'beta': 2}


In [53]:
# ChrF++: the same metric with word n-grams switched on via word_order=2
chrfp_results = chrf.compute(predictions=df['nllb-200-distilled-600M'], references=df['French'], word_order=2)

In [54]:
# show the ChrF++ score together with the settings it was computed with
print(chrfp_results)

{'score': 62.683079599172984, 'char_order': 6, 'word_order': 2, 'beta': 2}


#### 3.3 TER 
1. Translation Edit Rate
2. TER Score is computed as:
 
  ![Alt text](data/assets/ter.png)

In [55]:
# load the TER metric from the hf evaluate package
ter = evaluate.load("ter")

Downloading builder script: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9.99k/9.99k [00:00<00:00, 15.1MB/s]


In [59]:
# score the NLLB translations against the French references with TER
ter_results = ter.compute(predictions=df['nllb-200-distilled-600M'], references=df['French'])

In [60]:
# show the TER score with the edit count and reference length behind it
print(ter_results)

{'score': 43.08943089430895, 'num_edits': 265, 'ref_length': 615.0}


#### 3.4 COMET
1. COMET - Crosslingual Optimized Metric for Evaluation of Translation
2. A score close to 1 indicates a high-quality translation, while a score close to 0 indicates a translation that is no better than random chance.

In [3]:
# load the COMET metric (requires the unbabel-comet package installed in section 1)
comet = evaluate.load('comet')

Fetching 5 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 30481.86it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.2.1. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../../../.cache/huggingface/hub/models--Unbabel--wmt22-comet-da/snapshots/371e9839ca4e213dde891b066cf3080f75ec7e72/checkpoints/model.ckpt`
Encoder model frozen.
/home/abdul/venv/lib/python3.10/site-packages/pytorch_lightning/core/saving.py:188: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [11]:
# COMET is given the English sources in addition to the predictions and the
# French references.
comet_score = comet.compute(
    predictions=df['nllb-200-distilled-600M'],
    references=df['French'],
    sources=df['English'],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# print only the 'mean_score' entry of the COMET result
print(comet_score['mean_score'])

0.841721477578668


### 4. Unsupervised Evaluations
- Also known as reference-free evaluation.

#### 4.1 Human Evaluation
1. Prepare a set of guidelines that define the rating criteria.
2. Provide pairs of source- and target-language sentences and ask humans to evaluate each translation against the rating criteria.
3. Repeat step 2 for multiple sentences and multiple human evaluators.

#### 4.2 LLM Judge
- Uses powerful multilingual large language models such as ChatGPT, Gemini, and Claude as judges to assess the quality of a translation.
- Correlates with human evaluation.
- Prone to rating outputs produced by the judge model itself more highly.
- Not uniformly good for all languages.
- Essentially the same procedure as 4.1, but with an LLM in place of the human evaluators.

In [52]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# peek at the sentence pairs that will be shown to the LLM judge
df.head()

In [59]:
# Hub id of the model used as the LLM judge.
# Note: this rebinds `model_name`, which named the NLLB checkpoint in section 2.1.
model_name = "openchat/openchat-3.5-0106"

In [61]:
# Build a text-generation pipeline around the judge model on the selected device.
chatbot = transformers.pipeline(
    "text-generation",
    model=model_name,
    device=device,
)

In [None]:
# Read the judge prompt template and the rating guideline from disk.
# The context managers close both file handles as soon as the text is read,
# and the explicit encoding keeps the result independent of the platform default.
with open("data/prompts/prompt.txt", "r", encoding="utf-8") as prompt_file:
    prompt_template = prompt_file.read()
with open("data/prompts/guideline.txt", "r", encoding="utf-8") as guideline_file:
    guideline = guideline_file.read()

Unnamed: 0,English,French,nllb-200-distilled-600M
0,A guy works on a building.,Un gars travaille sur un bâtiment.,Un type travaille sur un bâtiment.
1,Three people sit in a cave.,Trois personnes sont assises dans une grotte.,Trois personnes sont dans une cave.
2,People standing outside of a building.,Des gens debout devant un bâtiment.,Des gens debout à l'extérieur d'un bâtiment.
3,A man cutting branches of trees.,Un homme coupant les branches d'un arbre.,Un homme coupe des branches d'arbres.
4,A child is splashing in the water,Un enfant éclabousse dans l'eau,Un enfant est en train de sauter dans l'eau


In [48]:
# Fill the template for the first row: the English source sentence and the
# NLLB translation that the judge is asked to rate.
prompt = prompt_template.format(
    src_lang="English",
    tgt_lang="French",
    guideline=guideline,
    source=df['English'].iloc[0],
    target=df['nllb-200-distilled-600M'].iloc[0],
)

In [49]:
# inspect the fully rendered judge prompt
prompt

'Evaluate the provided translation from English to French according to the following criteria:\n\nGuideline:\nCompleteness:\n1. Includes all essential information from the source text without omission.\n2. Captures main ideas, key details, and nuances accurately.\n\nAccuracy:\n1. Reflects the meaning and intent of the source text faithfully without distortion.\n2. Translates vocabulary, idiomatic expressions, and technical terms accurately.\n\nConsistency:\n1. Maintains terminology, style, and tone consistency throughout the translation.\n2. Ensures coherence and clarity across sentences and paragraphs.\n\nFormat Compliance:\n1. Adheres to the required format or structure specified for the target text, if applicable.\n2. Follows any formatting guidelines provided, such as headings, bullet points, or paragraphs.\n\nOriginality:\n1. Demonstrates originality and independent generation.\n2. Avoids verbatim repetition of source text and introduces variation in language usage.\n\nRatings:\n1

In [55]:
# Ask the judge model to rate the translation described by `prompt`.
sequences = chatbot(
    prompt,
    # sampling is disabled, so repeated runs should produce the same rating
    do_sample=False,
    # return only the newly generated text, without echoing the prompt
    return_full_text=False
)

In [62]:
# The text-generation pipeline returns a list with one dict per generated
# sequence, so take the first entry before reading its 'generated_text'.
print(sequences[0]['generated_text'])

#### 4.3 UScore, XMoverScore, SentSim
1. https://github.com/potamides/unsupervised-metrics

### 5. References (Recommended Reading)
  1. https://aclanthology.org/2023.eacl-main.27.pdf
  2. https://aclanthology.org/2023.acl-short.55/
