# evaluation

In [6]:
#––– 1. BERTScore –––
# pip install bert-score
from bert_score import score

def compute_bertscore(source: str, summary: str, lang: str = "en"):
    """
    Score `summary` against `source` with BERTScore.

    The summary is passed as the single candidate and the source text as
    its single reference; `lang` is forwarded to `bert_score.score`, and
    baseline rescaling is requested.

    Returns a dict with the keys "precision", "recall" and "f1", each a
    plain Python float (the mean of the corresponding score tensor).
    """
    candidates = [summary]
    references = [source]      # the original text plays the reference role
    metrics = score(
        candidates,
        references,
        lang=lang,
        rescale_with_baseline=True,
    )
    # `score` yields the three tensors in (P, R, F1) order.
    labels = ("precision", "recall", "f1")
    return {label: tensor.mean().item() for label, tensor in zip(labels, metrics)}


## Perplexity

In [7]:

#––– 2. Perplexity –––
# pip install transformers torch
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

# Tokenizer/model pairs keyed by model name, so repeated calls (e.g. one per
# file) do not reload the weights every time.
_GPT2_CACHE = {}


def _load_gpt2(model_name):
    """Return a cached (tokenizer, model) pair for `model_name`, loading on first use."""
    if model_name not in _GPT2_CACHE:
        tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
        model = GPT2LMHeadModel.from_pretrained(model_name)
        model.eval()
        _GPT2_CACHE[model_name] = (tokenizer, model)
    return _GPT2_CACHE[model_name]


def compute_perplexity(text: str, model_name: str = "gpt2", stride: int = 512):
    """
    Returns the perplexity of `text` under the specified GPT-2 model.

    Texts longer than the model's context window are scored with a sliding
    window: each window is at most `model.config.n_positions` tokens long
    and starts `stride` tokens after the previous one, and only the tokens
    not already scored by an earlier window contribute to the total. A text
    that fits in one window is scored in a single forward pass.

    Args:
        text: the text to score.
        model_name: name or path of the GPT-2 checkpoint to load.
        stride: number of tokens the window advances per step; values
            larger than the context window are clamped to it.

    Returns:
        The perplexity as a float, i.e. exp of the mean negative
        log-likelihood per scored token. `nan` if the text tokenizes to a
        single token (nothing can be predicted).

    Raises:
        ValueError: if `text` produces no tokens or `stride` is not positive.
    """
    if stride <= 0:
        raise ValueError("stride must be a positive integer")

    tokenizer, model = _load_gpt2(model_name)

    input_ids = tokenizer(text, return_tensors="pt").input_ids
    seq_len = input_ids.size(1)
    if seq_len == 0:
        raise ValueError("cannot compute perplexity of empty text")

    max_length = model.config.n_positions
    # A step larger than the window would leave tokens between windows unscored.
    step = min(stride, max_length)

    total_nll = 0.0
    scored_tokens = 0
    prev_end = 0
    with torch.no_grad():
        for begin in range(0, seq_len, step):
            end = min(begin + max_length, seq_len)
            new_tokens = end - prev_end          # tokens not scored by an earlier window
            window = input_ids[:, begin:end]
            labels = window.clone()
            labels[:, :-new_tokens] = -100       # context-only tokens are ignored by the loss

            # The model shifts labels by one internally, so the first position
            # of a window is never a prediction target.
            n_scored = int((labels[:, 1:] != -100).sum())
            if n_scored:
                # outputs.loss is the mean negative log-likelihood over scored tokens
                loss = model(window, labels=labels).loss
                total_nll += loss.item() * n_scored
                scored_tokens += n_scored

            prev_end = end
            if end == seq_len:
                break

    if scored_tokens == 0:
        return float("nan")
    return torch.exp(torch.tensor(total_nll / scored_tokens)).item()

## Readability (Flesch–Kincaid)

In [8]:

#––– 3. Flesch–Kincaid –––
# pip install textstat
import textstat

def compute_readability(text: str):
    """
    Score `text` with two textstat readability formulas.

    Returns a dict with:
      - flesch_kincaid_grade: result of textstat.flesch_kincaid_grade
                              (US grade level)
      - flesch_reading_ease:  result of textstat.flesch_reading_ease
                              (higher = easier)
    """
    grade_level = textstat.flesch_kincaid_grade(text)
    reading_ease = textstat.flesch_reading_ease(text)
    return {
        "flesch_kincaid_grade": grade_level,
        "flesch_reading_ease": reading_ease,
    }


## Run the metrics on the source files

In [16]:
def process_files(file_list):
    """
    Print perplexity and readability for every file in `file_list`.

    Each file is read as UTF-8 text. A file that does not exist is reported
    on stdout and skipped; the remaining files are still processed.
    Nothing is returned.
    """
    separator = "-" * 40
    for path in file_list:
        try:
            with open(path, "r", encoding="utf-8") as handle:
                text = handle.read()
        except FileNotFoundError:
            print(f" File non trovato: {path}")
        else:
            # Only reached when the file was read successfully.
            perplexity = compute_perplexity(text)
            readability = compute_readability(text)

            print(f"\n{path}")
            print(f"\nPerplexity → {perplexity}")
            print(f"\nReadability → {readability}")
            print(separator)

if __name__ == "__main__":
    # Evaluate the four source documents, looked up in the working directory.
    process_files(["DL.txt", "retrieval.txt", "Reinforcement.txt", "ML.txt"])



DL.txt

Perplexity → 106.84957885742188

Readability → {'flesch_kincaid_grade': 25.1, 'flesch_reading_ease': -12.15}
----------------------------------------

retrieval.txt

Perplexity → 213.31503295898438

Readability → {'flesch_kincaid_grade': 14.6, 'flesch_reading_ease': 20.38}
----------------------------------------

Reinforcement.txt

Perplexity → 78.48356628417969

Readability → {'flesch_kincaid_grade': 16.5, 'flesch_reading_ease': 10.19}
----------------------------------------

ML.txt

Perplexity → 129.42745971679688

Readability → {'flesch_kincaid_grade': 13.1, 'flesch_reading_ease': 29.86}
----------------------------------------
