# Eval_full Notebook

This notebook is used for **full evaluation** of models before and after fine-tuning.

## What it does
- Loads both pretrained and fine-tuned versions.
- Loads the datasets:
    - ASSET (test) HF
    - MEDEASI (test) HF
    - OneStopEnglish (adv→ele) GitHub
- Defines the full set of prompts for evaluation.
- Computes all evaluation metrics:
    - id_ratio (both case-sensitive and case-insensitive) → share of outputs identical to the source ('copy-paste')
    - SARI → Simplification
    - FKGL → Readability
    - BERTScore → Semantic similarity
    - LENS → Holistic score for simplification quality
- Saves **CSV results** into the `results/<dataset>_<subset>/` folders.
- Saves **sample outputs** (3 per prompt per model) into `results/<dataset>_<subset>/samples/` for human evaluation.

## Notes
- Model and dataset names are normalized (`replace("/", "_")`) when saving results.
- This notebook is intended for **final evaluation runs**, not for debugging or prompt prototyping.

In [None]:
# Install dependencies
!pip install -q transformers sentencepiece
!pip install -q --upgrade datasets fsspec

!pip install -q bert-score sacrebleu sacremoses
import nltk
nltk.download("punkt", quiet=True); nltk.download("punkt_tab", quiet=True)
!pip install -q --no-deps https://github.com/feralvam/easse/archive/refs/heads/master.zip

!pip install -q pytorch-lightning torchmetrics jsonargparse
!pip install -q --no-deps lens-metric


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/503.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.3/199.3 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m58.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pylibcudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
cudf-cu12 25.6.0 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you have pyarrow 21.0.0 which is incompatible.
gcsfs 2025.3.0 requires fsspec==2025.3.0, but you have fsspec 2025.

In [None]:
import torch, os

# Choose the compute device plus DataLoader worker count and batch size for this runtime.
if not torch.cuda.is_available():
    print("GPU not available — using CPU instead")
    DEVICE = "cpu"
    NUM_WORKERS, BATCH_SIZE = 1, 16
else:
    print("GPU is available :)")
    DEVICE = "cuda"
    torch.set_float32_matmul_precision("high")
    name = torch.cuda.get_device_name(0).lower()
    # (substring of the device name, (NUM_WORKERS, BATCH_SIZE)); the first match wins,
    # any other GPU falls through to the conservative default in the `else` branch.
    for gpu_tag, gpu_settings in (("l4", (8, 128)), ("t4", (2, 32))):
        if gpu_tag in name:
            NUM_WORKERS, BATCH_SIZE = gpu_settings
            break
    else:
        NUM_WORKERS, BATCH_SIZE = 2, 16

GPU is available :)


In [None]:
# Module-level registries; the cells below fill them and the evaluation loop iterates over them.
_datasets: list[dict[str, object]] = []   # one dict per dataset (name, ds_obj, subset, src_key, trg_key)
_models:   list[dict[str, object]] = []   # one dict per model (name, model, tokenizer, origin)
_prompts:  list[tuple[str, str]]   = []   # (text placed before the source, text placed after it)
_metrics                           = []   # callables invoked as metric(preds, refs, srcs), each returning a dict

# Both paths are resolved against the current working directory at the time this cell runs.
MODELS_TRAINED_PATH = os.path.join(os.getcwd(), "models/trained")
RESULTS_PATH = os.path.join(os.getcwd(), "results")

DATASETS

In [None]:
# Load the datasets
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
from datasets.utils.logging import disable_progress_bar
disable_progress_bar()

from datasets import load_dataset
from datasets import Dataset

# Start from an empty registry: without this, re-running the cell registers every
# dataset a second time and the evaluation loop would process each one twice.
_datasets.clear()

#### MEDEASI ####
# Starting with Med-EASi - may OOM, catch this soon and lower BATCH_SIZE
ds_name = 'cbasu/Med-EASi'
ds_split = 'test'
ds = load_dataset(ds_name, split=ds_split)

_datasets.append({"name":    'medeasi',
                  "ds_obj":  ds,
                  "subset":  ds_split,
                  "src_key": "Expert",      # Source: expert medical text
                  "trg_key": "Simple",     # Target: simplified medical text (single string)
                  })
print(f"* Loaded '{ds_split}' subset of 'medeasi' dataset")

####  ASSET  ####
ds_name = 'asset'
ds_split = 'test'
ds = load_dataset(ds_name, split=ds_split)
_datasets.append({"name":    ds_name,
                  "ds_obj":  ds,
                  "subset":  ds_split,
                  "src_key": "original",
                  "trg_key": "simplifications",
                  })
print(f"* Loaded '{ds_split}' subset of '{ds_name}' dataset")


#### ONESTOP ENGLISH ####
# OneStopEnglish Advanced → Elementary from GitHub
import requests

# Best effort: any failure (network, HTTP status, malformed file) is reported and
# this dataset is skipped, the other datasets stay registered.
try:
    ds_name = 'onestopenglish'
    ds_split = 'adv-ele'

    URL = f"https://raw.githubusercontent.com/nishkalavallabhi/OneStopEnglishCorpus/master/Sentence-Aligned/{ds_split.upper()}.txt"
    r = requests.get(URL, timeout=30)
    r.raise_for_status()
    lines = [ln.strip() for ln in r.text.splitlines()]
    # Drop blank lines and the "*******" separator lines; the remaining lines are
    # read as alternating advanced / elementary sentences.
    clean = [ln for ln in lines if ln and ln != "*******"]
    advanced = clean[0::2]
    elementary   = clean[1::2]
    # An odd number of lines means the alternation is broken; fail with a clear
    # message instead of a column-length error from Dataset.from_dict.
    if len(advanced) != len(elementary):
        raise ValueError(
            f"misaligned sentence pairs: {len(advanced)} advanced vs {len(elementary)} elementary"
        )
    ds = Dataset.from_dict({"advanced": advanced, "elementary": elementary})

    _datasets.append({"name": ds_name,
                      "ds_obj": ds,
                      "subset": ds_split,
                      "src_key": "advanced",
                      "trg_key": "elementary"})
    print(f"* Loaded '{ds_split}' subset of '{ds_name}' dataset")

except Exception as e:
    print(f"\n* Skipping OneStopEnglish: {e}\n")

print(f"\n{len(_datasets)} datasets loaded")

* Loaded 'test' subset of 'medeasi' dataset
* Loaded 'test' subset of 'asset' dataset
* Loaded 'adv-ele' subset of 'onestopenglish' dataset

3 datasets loaded


MODELS

In [None]:
# Load the original and fine-tuned models
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers.utils import logging as tlogging
tlogging.disable_progress_bar()

# Start from an empty registry so re-running this cell does not register each model twice.
_models.clear()

#### original ####
m_name = 'facebook/bart-base'

tok = AutoTokenizer.from_pretrained(m_name)
m = AutoModelForSeq2SeqLM.from_pretrained(m_name).to(DEVICE)
m.eval()

# Just for prophetnet-large-uncased-cnndm, not finetuned #
if m_name == 'microsoft/prophetnet-large-uncased-cnndm':
    tok.add_special_tokens({"additional_special_tokens": ["[X_SEP]"]})
    # Keep the embedding matrix in sync with the enlarged vocabulary.
    m.resize_token_embeddings(len(tok))

_models.append({"name":      m_name,
                "model":     m,
                "tokenizer": tok,
                "origin":    True     # True = pretrained checkpoint, evaluated with every prompt
                })

print(f"* Loaded '{m_name}'            model")



#### finetune ####
# Prefer a locally trained checkpoint; fall back to the published Hub model otherwise.
m_path = os.path.join(MODELS_TRAINED_PATH, m_name.replace("/", "_"))
if not os.path.exists(m_path):
    print("Error: no fine-tuned model found.")
    print(" * New fallback for git: using HF paper models")
    # Hub id is "eilamc14/<model>-text-simplification", where <model> is the segment
    # after the first "/" of m_name (or m_name itself when it contains no "/").
    # Built step by step: reusing the same quote character inside a nested f-string
    # is a SyntaxError on Python versions before 3.12.
    name_parts = m_name.split("/")
    hub_model = (name_parts[1:] or name_parts)[0]
    m_path = f"eilamc14/{hub_model}-text-simplification"

tok = AutoTokenizer.from_pretrained(m_path)
m = AutoModelForSeq2SeqLM.from_pretrained(m_path).to(DEVICE)
m.eval()

# Registered under the same name as the base model; "origin" tells the two apart.
_models.append({"name":      m_name,
                "model":     m,
                "tokenizer": tok,
                "origin":    False
                })

print(f"* Loaded '{m_name}' fine-tuned model")

print(f"\n{len(_models)} models loaded")

* Loaded 'facebook/bart-base'            model
* Loaded 'facebook/bart-base' fine-tuned model

2 models loaded


PROMPTS

In [None]:
# Load the prompts
# Each prompt is a (before, after) pair wrapped around the source sentence.

# Start from an empty list so re-running this cell does not append the prompts a
# second time (which would shift nothing but double the evaluation work and rows).
_prompts.clear()

# === P0: CONTROL PROMPT ===
_prompts.append((
    "Simplify: ",
    ""
))

# === P1: Zero-shot direct instruction ===
_prompts.append((
    "Simplify the following sentence so it is easy to understand while keeping the original meaning: ",
    ""
))

# === P2: Multi-shot ICL ===
multi_shot_prefix = """Simplify the sentence. Use common words; keep the meaning. Output only the simplified sentence.
Complex: The committee reached a unanimous decision after extensive deliberations. Simple: The group agreed after talking for a long time.
Complex: The ancient manuscript was preserved in a climate-controlled archive to prevent deterioration. Simple: The old book was kept in a special room to stop it from getting damaged.
Complex: The economic downturn had a profound effect on small businesses across the region. Simple: The bad economy hurt many small businesses in the area.
Complex: """
_prompts.append((multi_shot_prefix, " Simple: "))

# === P3: Chain-of-thought (diagnose → rewrite) ===
_prompts.append((
    "First list the words/phrases that make this sentence hard to read. Then rewrite the sentence in simpler language without changing its meaning: ",
    ""
))

# === P4: Lexical simplification for non-native readers ===
_prompts.append((
    "Rewrite the sentence using common, high-frequency words suitable for a B1 (intermediate) non-native reader. Keep all original information: ",
    ""
))

# === P5: Psycholinguistic constraints (familiarity & AoA) ===
_prompts.append((
    "Rewrite the sentence using words with high familiarity and early age-of-acquisition (avoid abstract or rare terms). Preserve the original meaning: ",
    ""
))

# === P6: Sentence splitting ===
_prompts.append((
    "Rewrite the sentence in simpler words and split long or embedded clauses into shorter sentences. Keep the same meaning: ",
    ""
))

# === P7: Data-driven transformation cues ===
_prompts.append((
    "Apply common simplification transformations (e.g., replace complex words, reorder for clarity, split long clauses) while keeping grammar and meaning: ",
    ""
))

# === P8: Readability target (Flesch) ===
_prompts.append((
    "Rewrite the sentence so that it reaches a Flesch Reading Ease score ≥ 80 (≈ grade 6), without losing information: ",
    ""
))

# === P9: ESL comprehension ===
_prompts.append((
    "Rewrite the sentence for ESL learners - use high-frequency words, avoid idioms, and add brief clarifications if needed. Keep the meaning the same: ",
    ""
))

# === P10: Content preservation constraint ===
_prompts.append((
    "Simplify the sentence for readability, but preserve ALL factual details (entities, quantities, relations) exactly: ",
    ""
))

print(f"{len(_prompts)} prompts loaded")


11 prompts loaded


METRICS

In [None]:
import unicodedata
import numpy as np
import re

# Load the metrics
from easse.sari import corpus_sari
from easse.fkgl import corpus_fkgl
from easse.bertscore import corpus_bertscore

from lens import download_model, LENS
# disable the bar only inside LENS
import lens.models.predict_pbar as pp
# Monkeypatch: replace LENS's `PredictProgressBar.init_predict_tqdm` with a factory
# that builds the same tqdm bar but with disable=True, so the bar is never drawn.
pp.PredictProgressBar.init_predict_tqdm = (
    lambda self: pp.tqdm(
        desc="Evaluating",
        leave=True,
        dynamic_ncols=True,
        file=pp.sys.stderr,
        smoothing=0,
        disable=True,   # force tqdm off
    )
)
# Fetch the "davidheineman/lens" checkpoint and build the shared scorer used by metric_lens.
# NOTE(review): rescale=True presumably maps raw scores onto a 0-100 scale — confirm in the LENS docs.
_lens = LENS(download_model("davidheineman/lens"), rescale=True)

_ws_re = re.compile(r"\s+")
def _norm(s: str, lower: bool) -> str:
    """
    Canonicalize a sentence before an equality check.

    Steps, in order: strip the ends, apply Unicode NFKC, collapse every run of
    whitespace to one space, and lowercase the result when `lower` is true.
    """
    normalized = unicodedata.normalize("NFKC", s.strip())
    collapsed = _ws_re.sub(" ", normalized)
    if lower:
        return collapsed.lower()
    return collapsed

def _identical_flags(preds, srcs, lower):
    """Return one 0/1 flag per (prediction, source) pair: 1 when they match after `_norm`."""
    flags = []
    for pred, src in zip(preds, srcs):
        flags.append(int(_norm(pred, lower) == _norm(src, lower)))
    return flags

def metric_identical_ratio(preds, refs, srcs):
    """Fraction of predictions equal to their source, case sensitive (A /= a). `refs` is unused."""
    ratio = np.mean(_identical_flags(preds, srcs, False))
    return {"Identical ratio": float(ratio)}

def metric_identical_ratio_ci(preds, refs, srcs):
    """Fraction of predictions equal to their source, case insensitive (A == a). `refs` is unused."""
    ratio = np.mean(_identical_flags(preds, srcs, True))
    return {"Identical ratio (ci)": float(ratio)}

#### easse needs transpose to refs
def to_ref_major(refs_sample_major):
    """
    Convert sample-major references (N x R) into ref-major layout (R x N).

    Args:
        refs_sample_major: one list of R reference strings per sample.

    Returns:
        A list of R lists, each holding the i-th reference of every sample.
        An empty input yields an empty list.

    Raises:
        ValueError: if the samples do not all have the same number of references.
    """
    if not refs_sample_major:
        return []

    n_refs = len(refs_sample_major[0])
    # Checked explicitly rather than with `assert`: asserts are stripped under
    # `python -O`, and zip() below would then silently truncate to the shortest sample.
    if any(len(sample_refs) != n_refs for sample_refs in refs_sample_major):
        raise ValueError("non-uniform #refs per sample")

    # Transpose: (N x R) -> (R x N)
    return [list(col) for col in zip(*refs_sample_major)]

def metric_sari(preds, refs, srcs):
    """Corpus-level SARI of `preds` against `srcs` and the (transposed) references."""
    refs_by_position = to_ref_major(refs)
    score = corpus_sari(orig_sents=srcs, sys_sents=preds, refs_sents=refs_by_position)
    return {"SARI": float(score)}

def metric_fkgl(preds, refs, srcs):
    """Corpus-level FKGL computed on the predictions only; `refs` and `srcs` are unused."""
    grade_level = float(corpus_fkgl(sentences=preds))
    return {"FKGL": grade_level}

def metric_bertscore(preds, refs, srcs):
    """BERTScore F1 of `preds` against the (transposed) references, multiplied by 100. `srcs` is unused."""
    bert_scores = corpus_bertscore(sys_sents=preds, refs_sents=to_ref_major(refs))
    # The call yields exactly three values; only the third (F1) is reported.
    _precision, _recall, f1 = bert_scores
    return {"BERTScore": float(f1 * 100)}

def metric_lens(preds, refs, srcs):
    """Mean LENS score over all samples; references are passed sample-major, as received."""
    # NOTE(review): devices=[0] is hard-coded — presumably GPU index 0; confirm how this behaves on a CPU-only runtime.
    per_sample = _lens.score(srcs, preds, refs, batch_size=BATCH_SIZE*4, devices=[0])
    total = sum(per_sample)
    return {"LENS": float(total / len(per_sample))}

# You can add/remove metrics here without touching the eval loop:
# The registry is cleared first so that editing this list and re-running the cell
# replaces the metrics instead of appending duplicates (each duplicate would be
# recomputed on every evaluation run).
_metrics.clear()
_metrics.extend([
    metric_identical_ratio,
    metric_identical_ratio_ci,
    metric_sari,
    metric_fkgl,
    metric_bertscore,
    metric_lens,
])


Preprocess, collate, clean_pred, evaluate functions

In [None]:
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

def make_preprocess(tokenizer, before: str, after: str, src_key: str,
                    max_source_len=256):
    """
    Returns a function(examples) suitable for datasets.map(..., batched=True).

    - tokenizer: callable applied to the list of prompt-wrapped source strings
    - before/after: strings concatenated around the source text
    - src_key: the column name containing the raw source text in your dataset (e.g., "original")
    - max_source_len: inputs longer than this many tokens are truncated

    The returned function tokenizes WITHOUT padding: rows keep their own length and
    are padded per DataLoader batch by the collate function. Padding here would pad
    every row to the longest row of the whole map batch, inflating the stored
    dataset and the length of every generation batch.
    """
    def _fn(examples):
        inputs = [f"{before}{ex}{after}" for ex in examples[src_key]]

        model_inputs = tokenizer(
            inputs,
            max_length=max_source_len,
            truncation=True,
            padding=False,   # dynamic padding happens at collate time
        )

        return model_inputs
    return _fn

def make_collate_fn(pad_id: int, src_key: str, trg_key: str):
    """
    Build a DataLoader collate function for tokenized evaluation rows.

    The returned function pads `input_ids` with `pad_id` and `attention_mask` with 0
    (batch first), and carries the raw source text and the references along for the
    metrics. A reference stored as a single string is wrapped into a one-item list;
    anything else is passed through unchanged.
    """
    def _as_long(values):
        return torch.tensor(values, dtype=torch.long)

    def _as_ref_list(target):
        return [target] if isinstance(target, str) else target

    def collate_fn(batch):
        id_rows, mask_rows = [], []
        sources, references = [], []
        for row in batch:
            id_rows.append(_as_long(row["input_ids"]))
            mask_rows.append(_as_long(row["attention_mask"]))
            # meta for metrics
            sources.append(row[src_key])
            references.append(_as_ref_list(row[trg_key]))

        return {
            "input_ids": pad_sequence(id_rows, batch_first=True, padding_value=pad_id),
            "attention_mask": pad_sequence(mask_rows, batch_first=True, padding_value=0),
            "sources": sources,
            "references": references,
        }
    return collate_fn

def clean_pred(before: str, after: str, pred: str) -> str:
    """
    Strip an echoed prompt from a prediction.

    Deletes the first occurrence of `before`, then the first occurrence of `after`
    in what remains. The match can be anywhere in the text, not only at its ends;
    a piece that does not occur leaves the text untouched.
    """
    without_before = pred.replace(before, "", 1)
    without_after = without_before.replace(after, "", 1)
    return without_after


@torch.inference_mode()
def evaluate_model(model, tokenizer, dataloader, gen_kwargs=None, msg="   Evaluating", prompt_before="", prompt_after=""):
    """
    Generate a prediction for every batch of `dataloader` and score the whole run.

    - gen_kwargs: keyword arguments for `model.generate`; when None, greedy decoding
      with up to 128 new tokens is used
    - msg: label shown on the progress bar
    - prompt_before/prompt_after: prompt pieces removed from each decoded prediction

    Returns (results, predictions, sources, references), where `results` merges the
    dicts returned by every callable in `_metrics`.
    """
    model.eval()
    if gen_kwargs is not None:
        generation_args = gen_kwargs
    else:
        generation_args = dict(max_new_tokens=128, num_beams=1, early_stopping=True)

    predictions, sources, references = [], [], []

    for batch in tqdm(dataloader, desc=msg):
        generated = model.generate(
            input_ids=batch["input_ids"].to(DEVICE),
            attention_mask=batch["attention_mask"].to(DEVICE),
            **generation_args,
        )
        texts = tokenizer.batch_decode(generated, skip_special_tokens=True)

        predictions.extend(clean_pred(prompt_before, prompt_after, text) for text in texts)
        references.extend(batch["references"])
        sources.extend(batch["sources"])

    # Every metric sees the complete run; later metrics overwrite equal keys.
    results = {}
    for metric_fn in _metrics:
        results.update(metric_fn(predictions, references, sources))

    return results, predictions, sources, references

Arguments for generation

In [None]:
# Decoding settings passed to `evaluate_model` by the evaluation loop:
# deterministic beam search (no sampling) with 4 beams and at most 64 new tokens.
gen_kwargs = {
    "max_new_tokens": 64,
    "num_beams": 4,
    "length_penalty": 1.0,
    "no_repeat_ngram_size": 3,
    "early_stopping": True,
    "do_sample": False,
}

Suppress some warnings so they are not printed

In [None]:
import warnings, logging

# Logging levels: let only ERROR and above through for the chatty libraries.
# The loop variable is deliberately not called `name`, which already holds the
# GPU device name set in the device-configuration cell.
for logger_name in [
    "transformers", "huggingface_hub",
    "pytorch_lightning", "lightning.pytorch",
    "easse", "lens", "torch"
]:
    logging.getLogger(logger_name).setLevel(logging.ERROR)

# Hide common warnings (e.g., Roberta pooler init).
# `message` is a regular expression matched against the start of the warning text,
# so a plain prefix is enough; a trailing glob-style "*" would only make the last
# character optional.
warnings.filterwarnings("ignore", message=r"Some weights of RobertaModel were not initialized")
warnings.filterwarnings("ignore", message=r".*Empty candidate sentence detected")

Main evaluation loop (datasets × models × prompts)

In [None]:
from torch.utils.data import DataLoader

# One flat record per (dataset, model, prompt) run; turned into a DataFrame later.
_all_rows = []

# { "<DATASET>_<subset>": { "<model_name>": { "<prompt_id or 'ft'>": [ {"src":..., "pred":...}, ... ] } } }
_samples: dict[str, dict[str, dict[str, list[dict[str, str]]]]] = {}

# Window of examples kept per run for human evaluation / sanity checks.
# Change SAMPLE_START (0, 3, 6, ...) to look at a different window.
SAMPLE_START, SAMPLE_COUNT = 24, 3

for dataset_dict in _datasets:
    dataset_name = dataset_dict["name"]
    dataset      = dataset_dict["ds_obj"]
    subset       = dataset_dict["subset"]
    src_key      = dataset_dict["src_key"]
    trg_key      = dataset_dict["trg_key"]
    print(f" === '{dataset_name.upper()}' dataset: '{subset}' subset ===\n")

    for model_dict in _models:
        model_name = model_dict["name"]
        model      = model_dict["model"]
        tokenizer  = model_dict["tokenizer"]
        origin     = model_dict["origin"]
        print(f" --- {model_name.upper()}{'' if origin else ' finetuned'} ---")

        collate_fn = make_collate_fn(tokenizer.pad_token_id, src_key, trg_key)

        # The pretrained model is tried with every prompt; the fine-tuned model
        # only with the fixed "Simplify: " prompt.
        prompts_to_use = _prompts if origin else [("Simplify: ","")]
        for idx, (prompt_before, prompt_after) in enumerate(prompts_to_use, start=0):
            preprocess_function = make_preprocess(tokenizer,
                                                  before=prompt_before,
                                                  after=prompt_after,
                                                  src_key=src_key,
                                                  max_source_len=256
                                                  )

            tokenized_datasets = dataset.map(preprocess_function, batched=True)

            dataloader = DataLoader(tokenized_datasets,
                                    batch_size=BATCH_SIZE,
                                    collate_fn=collate_fn,
                                    # Pinned memory only helps host-to-GPU copies.
                                    pin_memory=(DEVICE == "cuda"),
                                    num_workers=NUM_WORKERS,
                                    # DataLoader rejects persistent workers when there are no workers.
                                    persistent_workers=NUM_WORKERS > 0,
                                    )

            msg = f" * Prompt #{idx}"
            metrics, preds, srcs, refs = evaluate_model(model, tokenizer, dataloader, gen_kwargs, msg, prompt_before, prompt_after)
            print()

            row = {
                  "Dataset": dataset_name,
                  "Subset": subset,
                  "Model": model_name,
                  "Mode": "Prompt-based" if origin else "Fine-tuned",
                  "Prompt#": int(idx),
                  }

            # Column width for aligned printing; default=0 covers an empty metrics dict.
            w = max((len(k) for k in metrics), default=0)
            for k, v in metrics.items():
                row[k] = float(v)
                print(f"      {k:<{w}} : {v:.4f}")
            _all_rows.append(row)


            # Samples for human eval and sanity check
            ds_key  = f"{dataset_name.upper()}_{subset}"
            mdl_key = model_name
            pr_key  = "ft" if not origin else str(idx)
            # Slicing (instead of indexing) keeps short datasets from raising
            # IndexError: a window past the end simply yields fewer examples.
            sample_stop = SAMPLE_START + SAMPLE_COUNT
            examples = [{"src": s, "pred": p}
                        for s, p in zip(srcs[SAMPLE_START:sample_stop], preds[SAMPLE_START:sample_stop])]

            _samples.setdefault(ds_key, {}).setdefault(mdl_key, {})[pr_key] = examples

            print()
    print("\n")

 === 'MEDEASI' dataset: 'test' subset ===

 --- FACEBOOK/BART-BASE ---


 * Prompt #0: 100%|██████████| 3/3 [00:23<00:00,  7.85s/it]



      Identical ratio      : 0.8567
      Identical ratio (ci) : 0.8667
      SARI                 : 24.6796
      FKGL                 : 11.1952
      BERTScore            : 47.6286
      LENS                 : 49.4364



 * Prompt #1: 100%|██████████| 3/3 [00:23<00:00,  7.73s/it]



      Identical ratio      : 0.7933
      Identical ratio (ci) : 0.7967
      SARI                 : 28.3655
      FKGL                 : 10.8985
      BERTScore            : 47.0612
      LENS                 : 47.4221



 * Prompt #2: 100%|██████████| 3/3 [00:24<00:00,  8.22s/it]



      Identical ratio      : 0.0000
      Identical ratio (ci) : 0.0000
      SARI                 : 29.1317
      FKGL                 : 4.6118
      BERTScore            : -18.2327
      LENS                 : 4.0497



 * Prompt #3: 100%|██████████| 3/3 [00:23<00:00,  7.94s/it]



      Identical ratio      : 0.6600
      Identical ratio (ci) : 0.6600
      SARI                 : 32.0363
      FKGL                 : 10.4823
      BERTScore            : 46.0307
      LENS                 : 44.4573



 * Prompt #4: 100%|██████████| 3/3 [00:23<00:00,  7.99s/it]



      Identical ratio      : 0.6267
      Identical ratio (ci) : 0.6267
      SARI                 : 32.7579
      FKGL                 : 10.3227
      BERTScore            : 45.4818
      LENS                 : 42.7828



 * Prompt #5: 100%|██████████| 3/3 [00:23<00:00,  7.88s/it]



      Identical ratio      : 0.6267
      Identical ratio (ci) : 0.6267
      SARI                 : 32.7821
      FKGL                 : 10.3207
      BERTScore            : 45.4710
      LENS                 : 42.7334



 * Prompt #6: 100%|██████████| 3/3 [00:23<00:00,  7.94s/it]



      Identical ratio      : 0.7600
      Identical ratio (ci) : 0.7600
      SARI                 : 29.7454
      FKGL                 : 10.7410
      BERTScore            : 46.8284
      LENS                 : 46.4131



 * Prompt #7: 100%|██████████| 3/3 [00:23<00:00,  7.91s/it]



      Identical ratio      : 0.6533
      Identical ratio (ci) : 0.6533
      SARI                 : 32.3842
      FKGL                 : 10.4017
      BERTScore            : 45.7553
      LENS                 : 43.1686



 * Prompt #8: 100%|██████████| 3/3 [00:23<00:00,  7.89s/it]



      Identical ratio      : 0.6600
      Identical ratio (ci) : 0.6600
      SARI                 : 32.0135
      FKGL                 : 10.4893
      BERTScore            : 45.9917
      LENS                 : 44.3381



 * Prompt #9: 100%|██████████| 3/3 [00:23<00:00,  7.97s/it]



      Identical ratio      : 0.5900
      Identical ratio (ci) : 0.5900
      SARI                 : 33.4357
      FKGL                 : 10.1404
      BERTScore            : 44.9593
      LENS                 : 41.1422



 * Prompt #10: 100%|██████████| 3/3 [00:23<00:00,  7.90s/it]



      Identical ratio      : 0.7267
      Identical ratio (ci) : 0.7267
      SARI                 : 30.6073
      FKGL                 : 10.7176
      BERTScore            : 46.6185
      LENS                 : 45.3623

 --- FACEBOOK/BART-BASE finetuned ---


 * Prompt #0: 100%|██████████| 3/3 [00:22<00:00,  7.59s/it]



      Identical ratio      : 0.0233
      Identical ratio (ci) : 0.0233
      SARI                 : 33.4695
      FKGL                 : 10.4872
      BERTScore            : 44.1646
      LENS                 : 35.3270



 === 'ASSET' dataset: 'test' subset ===

 --- FACEBOOK/BART-BASE ---


 * Prompt #0: 100%|██████████| 3/3 [00:21<00:00,  7.20s/it]



      Identical ratio      : 0.9331
      Identical ratio (ci) : 0.9443
      SARI                 : 21.4425
      FKGL                 : 10.0226
      BERTScore            : 90.7998
      LENS                 : 60.0461



 * Prompt #1: 100%|██████████| 3/3 [00:21<00:00,  7.31s/it]



      Identical ratio      : 0.8969
      Identical ratio (ci) : 0.8997
      SARI                 : 22.6130
      FKGL                 : 9.9410
      BERTScore            : 90.1941
      LENS                 : 58.4443



 * Prompt #2: 100%|██████████| 3/3 [00:26<00:00,  8.78s/it]



      Identical ratio      : 0.0000
      Identical ratio (ci) : 0.0000
      SARI                 : 26.7025
      FKGL                 : 5.2017
      BERTScore            : -6.9198
      LENS                 : 4.7598



 * Prompt #3: 100%|██████████| 3/3 [00:22<00:00,  7.35s/it]



      Identical ratio      : 0.7744
      Identical ratio (ci) : 0.7744
      SARI                 : 25.3010
      FKGL                 : 9.6323
      BERTScore            : 88.1643
      LENS                 : 53.7898



 * Prompt #4: 100%|██████████| 3/3 [00:22<00:00,  7.43s/it]



      Identical ratio      : 0.7270
      Identical ratio (ci) : 0.7298
      SARI                 : 26.1516
      FKGL                 : 9.4873
      BERTScore            : 87.5318
      LENS                 : 53.0714



 * Prompt #5: 100%|██████████| 3/3 [00:22<00:00,  7.52s/it]



      Identical ratio      : 0.7298
      Identical ratio (ci) : 0.7326
      SARI                 : 26.1505
      FKGL                 : 9.4982
      BERTScore            : 87.7510
      LENS                 : 53.1678



 * Prompt #6: 100%|██████████| 3/3 [00:22<00:00,  7.47s/it]



      Identical ratio      : 0.8524
      Identical ratio (ci) : 0.8552
      SARI                 : 23.4813
      FKGL                 : 9.8466
      BERTScore            : 89.7813
      LENS                 : 57.2629



 * Prompt #7: 100%|██████████| 3/3 [00:22<00:00,  7.54s/it]



      Identical ratio      : 0.7549
      Identical ratio (ci) : 0.7577
      SARI                 : 25.7742
      FKGL                 : 9.5612
      BERTScore            : 88.0126
      LENS                 : 53.4425



 * Prompt #8: 100%|██████████| 3/3 [00:22<00:00,  7.61s/it]



      Identical ratio      : 0.7716
      Identical ratio (ci) : 0.7716
      SARI                 : 25.4104
      FKGL                 : 9.6350
      BERTScore            : 88.2243
      LENS                 : 53.7901



 * Prompt #9: 100%|██████████| 3/3 [00:22<00:00,  7.57s/it]



      Identical ratio      : 0.6992
      Identical ratio (ci) : 0.7019
      SARI                 : 26.8829
      FKGL                 : 9.3777
      BERTScore            : 86.5909
      LENS                 : 51.5791



 * Prompt #10: 100%|██████████| 3/3 [00:22<00:00,  7.37s/it]



      Identical ratio      : 0.8384
      Identical ratio (ci) : 0.8412
      SARI                 : 24.0870
      FKGL                 : 9.7688
      BERTScore            : 89.4796
      LENS                 : 56.2057

 --- FACEBOOK/BART-BASE finetuned ---


 * Prompt #0: 100%|██████████| 3/3 [00:19<00:00,  6.60s/it]



      Identical ratio      : 0.0000
      Identical ratio (ci) : 0.0000
      SARI                 : 36.1283
      FKGL                 : 8.5014
      BERTScore            : 85.5653
      LENS                 : 43.8831



 === 'ONESTOPENGLISH' dataset: 'adv-ele' subset ===

 --- FACEBOOK/BART-BASE ---


 * Prompt #0: 100%|██████████| 18/18 [02:09<00:00,  7.21s/it]



      Identical ratio      : 0.9316
      Identical ratio (ci) : 0.9353
      SARI                 : 27.3634
      FKGL                 : 9.2581
      BERTScore            : 81.0992
      LENS                 : 59.8026



 * Prompt #1: 100%|██████████| 18/18 [02:16<00:00,  7.57s/it]



      Identical ratio      : 0.8719
      Identical ratio (ci) : 0.8737
      SARI                 : 28.5872
      FKGL                 : 9.0682
      BERTScore            : 80.2733
      LENS                 : 57.8335



 * Prompt #2: 100%|██████████| 18/18 [02:43<00:00,  9.06s/it]



      Identical ratio      : 0.0000
      Identical ratio (ci) : 0.0000
      SARI                 : 18.7114
      FKGL                 : 5.6757
      BERTScore            : -10.9095
      LENS                 : 4.0358



 * Prompt #3: 100%|██████████| 18/18 [02:18<00:00,  7.67s/it]



      Identical ratio      : 0.7167
      Identical ratio (ci) : 0.7167
      SARI                 : 31.1692
      FKGL                 : 8.5768
      BERTScore            : 77.9758
      LENS                 : 52.9294



 * Prompt #4: 100%|██████████| 18/18 [02:18<00:00,  7.71s/it]



      Identical ratio      : 0.6694
      Identical ratio (ci) : 0.6694
      SARI                 : 31.7928
      FKGL                 : 8.4112
      BERTScore            : 77.1824
      LENS                 : 51.4338



 * Prompt #5: 100%|██████████| 18/18 [02:18<00:00,  7.72s/it]



      Identical ratio      : 0.6694
      Identical ratio (ci) : 0.6694
      SARI                 : 31.8081
      FKGL                 : 8.4044
      BERTScore            : 77.2586
      LENS                 : 51.5433



 * Prompt #6: 100%|██████████| 18/18 [02:17<00:00,  7.61s/it]



      Identical ratio      : 0.8219
      Identical ratio (ci) : 0.8242
      SARI                 : 29.6419
      FKGL                 : 8.8974
      BERTScore            : 79.7312
      LENS                 : 56.3222



 * Prompt #7: 100%|██████████| 18/18 [02:18<00:00,  7.71s/it]



      Identical ratio      : 0.6974
      Identical ratio (ci) : 0.6974
      SARI                 : 31.4456
      FKGL                 : 8.4755
      BERTScore            : 77.5282
      LENS                 : 52.3068



 * Prompt #8: 100%|██████████| 18/18 [02:19<00:00,  7.73s/it]



      Identical ratio      : 0.7176
      Identical ratio (ci) : 0.7181
      SARI                 : 31.2023
      FKGL                 : 8.5518
      BERTScore            : 77.9275
      LENS                 : 52.9396



 * Prompt #9: 100%|██████████| 18/18 [02:20<00:00,  7.81s/it]



      Identical ratio      : 0.6212
      Identical ratio (ci) : 0.6212
      SARI                 : 32.2062
      FKGL                 : 8.2369
      BERTScore            : 76.0237
      LENS                 : 49.7177



 * Prompt #10: 100%|██████████| 18/18 [02:19<00:00,  7.76s/it]



      Identical ratio      : 0.7961
      Identical ratio (ci) : 0.7966
      SARI                 : 30.1282
      FKGL                 : 8.8220
      BERTScore            : 79.2945
      LENS                 : 55.6610

 --- FACEBOOK/BART-BASE finetuned ---


 * Prompt #0: 100%|██████████| 18/18 [02:00<00:00,  6.72s/it]



      Identical ratio      : 0.0000
      Identical ratio (ci) : 0.0000
      SARI                 : 37.4539
      FKGL                 : 8.0757
      BERTScore            : 75.4558
      LENS                 : 41.1780





Sanity check ⬇

In [None]:
# Print the stored source/prediction pairs, grouped by dataset, model and prompt.
for ds_key, models in _samples.items():
    for model_name, prompts in models.items():
        for pr_key, examples in prompts.items():
            # "ft" marks the fine-tuned run; any other key is a prompt index.
            if pr_key == "ft":
                suffix = " finetuned"
            else:
                suffix = f" (Prompt #{pr_key})"
            print(f"{model_name.upper()} on {ds_key}{suffix}")
            for ex in examples:
                print(
                    f"   SRC  : {ex.get('src','')}",
                    f"   PRED : {ex.get('pred','')}",
                    "   ---",
                    sep="\n",
                )
            print()

FACEBOOK/BART-BASE on MEDEASI_test (Prompt #0)
   SRC  : In this notation, the DA reaction and the dipolar reaction both become a [ 4+2 ] cycloaddition.
   PRED : In this notation, the DA reaction and the dipolar reaction both become a [ 4+2 ] cycloaddition.
   ---
   SRC  : Intubation and CPAP are indicated for respiratory distress, followed by mechanical ventilation and admission to the neonatal ICU as needed.
   PRED : Intubation and CPAP are indicated for respiratory distress, followed by mechanical ventilation and admission to the neonatal ICU as needed.
   ---
   SRC  : Lady Burdett-Coutts died of acute bronchitis at her home on Stratton Street, Piccadilly.
   PRED : Lady Burdett-Coutts died of acute bronchitis at her home on Stratton Street, Piccadilly.
   ---

FACEBOOK/BART-BASE on MEDEASI_test (Prompt #1)
   SRC  : In this notation, the DA reaction and the dipolar reaction both become a [ 4+2 ] cycloaddition.
   PRED : In this notation, the DA reaction and the dipolar reaction

Saving samples

In [None]:
# Write one text file per model under <RESULTS_PATH>/<dataset key>/samples/,
# listing the stored source/prediction pairs of every prompt.
for ds_key, models in _samples.items():
    out_dir = os.path.join(RESULTS_PATH, ds_key, "samples")
    os.makedirs(out_dir, exist_ok=True)

    for model_name, prompts in models.items():
        lines = []
        for pr_key, examples in prompts.items():
            # Section heading: the fine-tuned run or the prompt index.
            lines.append("Fine-Tune" if pr_key == "ft" else f"Prompt {pr_key}")
            for ex in examples:
                lines.extend((
                    f"SRC  : {ex.get('src', '')}",
                    f"PRED : {ex.get('pred', '')}",
                    "------",
                ))
            lines.append("")

        # "/" in the model name is replaced so the name is a single path component.
        file_path = os.path.join(out_dir, f"{model_name.replace('/', '_')}.txt")
        # Trailing whitespace is trimmed and the file ends with exactly one newline.
        content = "\n".join(lines).rstrip() + "\n"
        with open(file_path, "w", encoding="utf-8", newline="\n") as f:
            f.write(content)

PANDAS + SHOW

In [None]:
import pandas as pd
# Gather all result rows from `_all_rows` (defined earlier in the notebook) into one frame;
# the code below groups it by its "Dataset", "Subset" and "Model" columns.
df_all = pd.DataFrame(_all_rows)

def build_tables_by_dataset(
    df: pd.DataFrame,
    *,
    group_col="Dataset",
    sort_keys=("Subset", "Model", "Mode", "Prompt#"),
    drop_cols=("Dataset", "Subset", "Prompt#"),
):
    """Split a results frame into one cleaned table per value of ``group_col``.

    Each group is stably sorted by ``sort_keys``, stripped of ``drop_cols`` and
    re-indexed from 0. The remaining columns keep their original order.

    Args:
        df: Results frame to split.
        group_col: Column whose distinct values define the tables.
        sort_keys: Columns to sort each group by; keys that are not columns of
            ``df`` are skipped.
        drop_cols: Columns removed from every table; missing ones are ignored.

    Returns:
        dict mapping each ``group_col`` value (in order of first appearance) to
        its table. Empty when ``df`` has no rows.

    Raises:
        KeyError: if ``df`` has rows but no ``group_col`` column.
    """
    tables = {}

    # A frame built from an empty row list has no columns at all, so grouping on
    # ``group_col`` would raise KeyError; with no rows there is nothing to tabulate.
    if df.empty:
        return tables

    # Sort only by keys that exist, matching the lenient errors="ignore" used for dropping.
    present_keys = [key for key in sort_keys if key in df.columns]
    unwanted = list(drop_cols)

    for ds_name, g in df.groupby(group_col, sort=False):
        # Skip sorting entirely when no sort key is available.
        ordered = g.sort_values(present_keys, kind="stable") if present_keys else g
        cleaned = ordered.drop(columns=unwanted, errors="ignore").reset_index(drop=True)
        tables[ds_name] = cleaned  # keep original column order
    return tables

# Build the per-dataset tables once, then render each one under a banner line.
_results_tables_by_ds = build_tables_by_dataset(df_all)

from IPython.display import display

for ds_label in _results_tables_by_ds:
    print(f"=== {ds_label} ===")
    display(_results_tables_by_ds[ds_label])
    print()

=== medeasi ===


Unnamed: 0,Model,Mode,Identical ratio,Identical ratio (ci),SARI,FKGL,BERTScore,LENS
0,facebook/bart-base,Fine-tuned,0.023333,0.023333,33.469485,10.487174,44.164571,35.327042
1,facebook/bart-base,Prompt-based,0.856667,0.866667,24.679596,11.195239,47.628555,49.436351
2,facebook/bart-base,Prompt-based,0.793333,0.796667,28.365469,10.898536,47.061166,47.422125
3,facebook/bart-base,Prompt-based,0.0,0.0,29.131678,4.611812,-18.232717,4.049666
4,facebook/bart-base,Prompt-based,0.66,0.66,32.036317,10.48233,46.030691,44.457281
5,facebook/bart-base,Prompt-based,0.626667,0.626667,32.757896,10.322677,45.481816,42.782797
6,facebook/bart-base,Prompt-based,0.626667,0.626667,32.78215,10.320724,45.471025,42.733437
7,facebook/bart-base,Prompt-based,0.76,0.76,29.745397,10.741017,46.828419,46.413094
8,facebook/bart-base,Prompt-based,0.653333,0.653333,32.384185,10.401736,45.7553,43.16856
9,facebook/bart-base,Prompt-based,0.66,0.66,32.013465,10.489288,45.991716,44.338097



=== asset ===


Unnamed: 0,Model,Mode,Identical ratio,Identical ratio (ci),SARI,FKGL,BERTScore,LENS
0,facebook/bart-base,Fine-tuned,0.0,0.0,36.128289,8.50141,85.565281,43.88309
1,facebook/bart-base,Prompt-based,0.933148,0.94429,21.442491,10.022555,90.799791,60.046117
2,facebook/bart-base,Prompt-based,0.896936,0.899721,22.612971,9.940955,90.194118,58.444314
3,facebook/bart-base,Prompt-based,0.0,0.0,26.702468,5.20169,-6.919838,4.759755
4,facebook/bart-base,Prompt-based,0.774373,0.774373,25.300994,9.632275,88.164282,53.789806
5,facebook/bart-base,Prompt-based,0.727019,0.729805,26.151607,9.487261,87.531829,53.071377
6,facebook/bart-base,Prompt-based,0.729805,0.732591,26.150539,9.498155,87.750983,53.167828
7,facebook/bart-base,Prompt-based,0.852368,0.855153,23.481347,9.846576,89.781255,57.262887
8,facebook/bart-base,Prompt-based,0.754875,0.75766,25.774215,9.561164,88.012642,53.442462
9,facebook/bart-base,Prompt-based,0.771588,0.771588,25.410396,9.635006,88.224328,53.790105



=== onestopenglish ===


Unnamed: 0,Model,Mode,Identical ratio,Identical ratio (ci),SARI,FKGL,BERTScore,LENS
0,facebook/bart-base,Fine-tuned,0.0,0.0,37.453942,8.075692,75.455803,41.177953
1,facebook/bart-base,Prompt-based,0.931589,0.935262,27.363396,9.258097,81.099236,59.802587
2,facebook/bart-base,Prompt-based,0.871901,0.873737,28.587196,9.068226,80.273271,57.833533
3,facebook/bart-base,Prompt-based,0.0,0.0,18.711396,5.675737,-10.909525,4.035839
4,facebook/bart-base,Prompt-based,0.716713,0.716713,31.169211,8.576765,77.975786,52.929429
5,facebook/bart-base,Prompt-based,0.669421,0.669421,31.792839,8.411221,77.182448,51.433766
6,facebook/bart-base,Prompt-based,0.669421,0.669421,31.808109,8.404442,77.258623,51.543258
7,facebook/bart-base,Prompt-based,0.821855,0.824151,29.641911,8.897367,79.731196,56.322236
8,facebook/bart-base,Prompt-based,0.697429,0.697429,31.445573,8.475469,77.52822,52.306807
9,facebook/bart-base,Prompt-based,0.717631,0.71809,31.202256,8.551841,77.927548,52.939574





Save results to CSV (one file per dataset, subset and model)

In [None]:
# Persist one CSV per (dataset, subset, model) combination.
for group_key, rows in df_all.groupby(["Dataset", "Subset", "Model"], sort=False):
    ds_name, subset, model_name = group_key

    # Folder is <RESULTS_PATH>/<DATASET>_<subset>; the file is named after the model,
    # with "/" replaced so it is not read as a path separator.
    folder = f"{str(ds_name).upper()}_{str(subset)}"
    out_dir = os.path.join(RESULTS_PATH, folder)
    os.makedirs(out_dir, exist_ok=True)
    csv_name = str(model_name).replace("/", "_") + ".csv"
    csv_path = os.path.join(out_dir, csv_name)

    # The grouping columns are already encoded in the path, so they are dropped from the rows.
    trimmed = rows.drop(columns=["Dataset", "Subset", "Model"], errors="ignore")
    ordered = trimmed.sort_values(["Mode", "Prompt#"], kind="stable")
    to_save = ordered.reset_index(drop=True)

    to_save.to_csv(csv_path, index=False, encoding="utf-8", lineterminator="\n")

    print(f" * Saved: {csv_path}")

 * Saved: /content/drive/MyDrive/NLP_Project/results/MEDEASI_test/facebook_bart-base.csv
 * Saved: /content/drive/MyDrive/NLP_Project/results/ASSET_test/facebook_bart-base.csv
 * Saved: /content/drive/MyDrive/NLP_Project/results/ONESTOPENGLISH_adv-ele/facebook_bart-base.csv
