# evaluation

In [16]:
from bert_score import score

def compute_bert_score(reference, candidate):
    """Returns precision, recall, and F1 based on BERT embeddings"""
    P, R, F1 = score([candidate], [reference], lang="en")
    return {"bert_precision": P.mean().item(),
            "bert_recall": R.mean().item(),
            "bert_f1": F1.mean().item()}

In [17]:
from rouge import Rouge 

def compute_rouge(reference, candidate):
    rouge = Rouge()
    scores = rouge.get_scores(candidate, reference)[0]
    return {f"rouge_{k}": v for k,v in scores.items()}

In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

def compute_entailment_score(premise, hypothesis):
    tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
    model = AutoModelForSequenceClassification.from_pretrained("roberta-large-mnli")
    
    inputs = tokenizer(premise, hypothesis, return_tensors="pt", truncation=True)
    outputs = model(**inputs)
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    
    # Return probability of entailment (label 2 for MNLI)
    return probabilities[0][2].item()

def detect_hallucinations(source, summary):
    """Check each summary sentence against source"""
    from nltk import sent_tokenize
    source_sents = sent_tokenize(source)
    summary_sents = sent_tokenize(summary)
    
    scores = []
    for sent in summary_sents:
        max_score = max(compute_entailment_score(source_sent, sent) 
                       for source_sent in source_sents)
        scores.append(max_score)
    
    return {
        "hallucination_score": 1 - (sum(scores)/len(scores)),
        "sentence_entailment": scores
    }

## Gather summaries

In [23]:

#––– 2. Perplexity –––
# pip install transformers torch
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

def compute_perplexity(text: str, model_name: str = "gpt2"):
    """
    Returns the perplexity of `text` under the specified GPT-2 model.
    """
    # load tokenizer & model
    tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
    model     = GPT2LMHeadModel.from_pretrained(model_name)
    model.eval()

    # tokenize & run
    encodings = tokenizer(text, return_tensors="pt")
    input_ids = encodings.input_ids
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        # outputs.loss is the average negative log‐likelihood per token
        neg_log_likelihood = outputs.loss * input_ids.size(1)

    ppl = torch.exp(outputs.loss)
    return ppl.item()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/miniconda3/envs/textsummary/lib/python3.9/site-packages/transformers/utils/import_utils.py", line 1967, in _get_module
  File "/opt/miniconda3/envs/textsummary/lib/python3.9/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
  File "<frozen importlib._bootstrap>", line 1030, in _gcd_import
  File "<frozen importlib._bootstrap>", line 1007, in _find_and_load
  File "<frozen importlib._bootstrap>", line 986, in _find_and_load_unlocked
  File "<frozen importlib._bootstrap>", line 680, in _load_unlocked
  File "<frozen importlib._bootstrap_external>", line 850, in exec_module
  File "<frozen importlib._bootstrap>", line 228, in _call_with_frames_removed
  File "/opt/miniconda3/envs/textsummary/lib/python3.9/site-packages/transformers/models/gpt2/modeling_gpt2.py", line 38, in <module>
    from ...modeling_utils import PreTrainedModel, SequenceSummary
  File "/opt/miniconda3/envs/

## Surprise – quick latent-factor view

In [24]:

#––– 3. Flesch–Kincaid –––
# pip install textstat
import textstat

def compute_readability(text: str):
    """
    Returns a dict with:
      - flesch_kincaid_grade: US grade level
      - flesch_reading_ease:  higher = easier
    """
    return {
        "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
        "flesch_reading_ease": textstat.flesch_reading_ease(text)
    }


## Evaluation

In [25]:
import os
from pathlib import Path  # More modern path handling

def process_all_files():
    # Get current directory (where evaluation.ipynb is)
    root_dir = Path.cwd()
    
    # Define paths
    originals_dir = root_dir / "originals"
    summaries_dir = root_dir / "summaries"
    
    # Create mapping between summaries and originals
    file_mapping = {
        "DL.txt": "Deep Learning.txt",
        "retrieval.txt": "Information Retrieval.txt",
        "Reinforcement.txt": "Reinforcement Learning.txt",
        "ML.txt": "Machine Learning.txt"
    }
    
    results = {}
    
    for summary_name, original_name in file_mapping.items():
        # Build full paths
        original_path = originals_dir / original_name
        summary_path = summaries_dir / summary_name
        
        # Check if files exist
        if not original_path.exists():
            print(f"⚠️ Missing original: {original_path}")
            continue
        if not summary_path.exists():
            print(f"⚠️ Missing summary: {summary_path}")
            continue
            
        # Process the pair
        print(f"\n🔍 Processing {summary_name} -> {original_name}")
        metrics = process_files(original_path, summary_path)
        results[summary_name] = metrics
        
    return results

# Modified process_files to use pathlib
def process_files(original_path, summary_path):
    try:
        with open(original_path, "r", encoding="utf-8") as f:
            original = f.read()
    except Exception as e:
        print(f"Error reading {original_path}: {str(e)}")
        return None

    try:
        with open(summary_path, "r", encoding="utf-8") as f:
            summary = f.read()
    except Exception as e:
        print(f"Error reading {summary_path}: {str(e)}")
        return None

    metrics = {}
    # ... your existing metric calculations ...
    return metrics

# Run the analysis
if __name__ == "__main__":
    all_results = process_all_files()
    print("\nFinal Results:")
    for name, metrics in all_results.items():
        print(f"\n{name}:")
        for k, v in metrics.items():
            print(f"{k:>25}: {v:.4f}" if isinstance(v, float) else f"{k:>25}: {v}")


🔍 Processing DL.txt -> Deep Learning.txt

🔍 Processing retrieval.txt -> Information Retrieval.txt

🔍 Processing Reinforcement.txt -> Reinforcement Learning.txt

🔍 Processing ML.txt -> Machine Learning.txt

Final Results:

DL.txt:

retrieval.txt:

Reinforcement.txt:

ML.txt:


In [None]:
import re, math, pathlib
from collections import Counter
import nltk
import pandas as pd
from nltk.util import ngrams

# optional grammar/spell checker
try:
    import enchant
    IT_DICT = enchant.Dict("it_IT")          # Italian dictionary
except Exception:
    IT_DICT = None   # spell-error rate will be skipped if dict not present


# ----------  sentence & word tokenizers ----------
# 1) try native Italian Punkt; 2) fall back to English Punkt; 3) regex split
def _build_sentence_splitter():
    try:                                    # 1️⃣ Italian model (rarely available now)
        return nltk.data.load('tokenizers/punkt/italian.pickle').tokenize
    except (LookupError, OSError):
        try:                                # 2️⃣ Generic English Punkt
            nltk.download('punkt', quiet=True)
            return nltk.data.load('tokenizers/punkt/english.pickle').tokenize
        except Exception:                   # 3️⃣ Dumb regex fallback
            return lambda txt: re.split(r'(?<=[.!?])\s+', txt.strip())

sent_split = _build_sentence_splitter()
word_tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")

def sentences(text):
    return [s for s in sent_split(text) if s.strip()]

def words(text):
    return word_tokenizer.tokenize(text)

# ----------  syllables (for Gulpease / F-V) ----------
vowel_re = re.compile(r'[aeiouyàèéìòóù]+', re.I)
def syllables(word:str):
    return len(vowel_re.findall(word))

# ----------  readability ----------
def gulpease(text):
    w = words(text); s = sentences(text)
    if not w or not s: return None
    letters = sum(len(wd) for wd in w)
    return 89 + (300*len(s) - 10*letters) / len(w)

def flesch_vacca(text):
    w = words(text); s = sentences(text); syll = sum(syllables(wd) for wd in w)
    if not w or not s: return None
    return 206 - 1.3*(len(w)/len(s)) - 60*(syll/len(w))

# ----------  writing quality ----------
def spell_error_rate(text):
    if IT_DICT is None: return None
    tokens = [t for t in words(text) if t.isalpha()]
    if not tokens: return None
    errors = sum(not IT_DICT.check(t) for t in tokens)
    return errors / len(tokens)

def avg_sentence_length(text):
    s = sentences(text); w = words(text)
    return len(w)/len(s) if s else None

# ----------  hallucination / term faithfulness ----------
tech_term_re = re.compile(r'[A-Z]{2,}[A-Za-z]*')   # crude heuristic


In [None]:
orig_dir = pathlib.Path("originals")
sum_dir  = pathlib.Path("summaries")

records = []

for name in ["RL.txt", "DL.txt", "IR.txt", "ML.txt"]:
    en = (orig_dir / name).read_text(encoding='utf-8')
    it = (sum_dir  / name).read_text(encoding='utf-8')

    rec = {
        "file":          name,
        # readability
        "gulpease":      round(gulpease(it), 1),
        "flesch_vacca":  round(flesch_vacca(it), 1),
        # writing quality
        "spell_err":     round(spell_error_rate(it), 3) if spell_error_rate(it) is not None else "—",
        "avg_sent_len":  round(avg_sentence_length(it), 1),
    }
    records.append(rec)

df = pd.DataFrame(records)
print(df.to_string(index=False))


  file  gulpease  flesch_vacca spell_err  avg_sent_len  term_ratio  jaccard_3gram
RL.txt      43.9          32.0         —          19.8       0.000          0.000
DL.txt      29.7          -7.8         —          42.3       0.032          0.001
IR.txt      48.7          40.6         —          17.8       0.100          0.000
ML.txt      45.7          31.1         —          16.1       0.048          0.000
