In [None]:
%pip install --upgrade huggingface_hub
%pip install evaluate
%pip install bert_score

In [None]:
from huggingface_hub import login
from sklearn.metrics import f1_score,root_mean_squared_error
from transformers import pipeline,AutoTokenizer
import evaluate
import numpy as np
import pandas as pd
import json

from pls_evaluator import get_pls_evaluation, create_readability_table, create_summary_table

### - Loading Results

In [None]:
import pandas as pd

# Load the integrated evaluation results
df = pd.read_csv('outputs/integrated_evaluation_results.csv')

# Remove standard deviation (the value after the ±)


# Display the table
print(f"Loaded table with {len(df)} models and {len(df.columns)} columns")
df

Loaded table with 5 models and 17 columns


Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Coleman-Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,Excellence Rate (%)
0,human (reference),0.8482,0.6825,0.7551,-,-,-,11.38 ± 1.67,11.46 ± 1.89,11.21 ± 1.52,49.23 ± 9.09,16.21 ± 2.06,50.35 ± 4.87,14.23 ± 1.56,6.07 ± 1.23,7.37 ± 0.62,37.83
1,gemini-2.5-pro (baseline),0.834,0.5828,0.7436,0.8696,0.7161,0.6900,8.72 ± 0.88,8.68 ± 0.90,8.94 ± 0.82,64.48 ± 5.17,12.55 ± 1.06,41.89 ± 2.76,12.31 ± 1.01,4.29 ± 0.52,6.28 ± 0.40,61.67
2,gemini-2.5-pro,0.8419,0.582,0.7893,0.8712,0.6850,0.7189,8.37 ± 1.14,8.31 ± 1.22,9.33 ± 1.06,64.06 ± 6.46,12.25 ± 1.36,41.34 ± 4.03,12.00 ± 0.91,4.08 ± 0.74,6.35 ± 0.50,63.78
3,gpt-5,0.8283,0.5632,0.7647,0.8519,0.6414,0.6296,9.04 ± 1.40,9.51 ± 1.50,10.23 ± 1.30,61.31 ± 7.94,12.93 ± 1.63,42.24 ± 4.62,13.64 ± 1.44,4.34 ± 0.92,7.15 ± 0.58,56.61
4,gpt-oss-120b,0.8311,0.6408,0.7925,0.8556,0.6713,0.6713,10.52 ± 1.39,10.67 ± 1.63,10.86 ± 1.30,53.37 ± 7.52,15.84 ± 1.54,48.06 ± 4.29,14.73 ± 1.17,5.53 ± 0.99,7.82 ± 0.48,43.44


### - AlignScore

In [2]:
# Import the AlignScore evaluator
from alignscore_evaluator import AlignScoreEvaluator, get_alignscore

# Initialize the evaluator once (to avoid reloading the model)
alignscore_evaluator = AlignScoreEvaluator()

AlignScore using device: mps


### - MeaningBERT

In [3]:
meaning_bert = evaluate.load("davebulaval/meaningbert")

In [4]:
def get_meaningbert_score(predictions, references, model):
  assert len(predictions) == len(references), "The number of references is different of the number of predictions."
  result=[]
  for pred, ref in zip(predictions, references):
        score = model.compute(predictions=[pred], references=[ref])
        result.append(score["scores"][0])
  return round(np.mean(result)/100, 4)

### - BERTScore

In [5]:
bertscore = evaluate.load("bertscore")

In [6]:
def get_bertscore(predictions, references, scorename=bertscore, scoretype="f1",  modelname=None):
  result = scorename.compute(references=references, predictions=predictions, lang="en") # , model_type=modelname)
  return round(np.mean(result[scoretype]), 4)

### - Readability

## Loading texts

### Original Abstracts

In [None]:
import os
import glob

# Load original results
original = {}
original_path = "outputs/original"

# Get all JSON files in the original directory
json_files = glob.glob(os.path.join(original_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_review_id and pls result
        if 'cochrane_review_id' in data and 'pls' in data:
            cochrane_review_id = data['cochrane_review_id']
            abstract = data['title'] + '\n' + data['abstract']
            original[cochrane_review_id] = abstract

# Print the first 3 key-value pairs
list(original.items())[:3]

[('CD015749.PUB2',
  "Multimodal interventions for cachexia management\nBackground\nCachexia (disease‐related wasting) is a complex metabolic syndrome which occurs in people with chronic illnesses, including cancer, HIV/AIDS, kidney disease, heart disease, and chronic obstructive pulmonary disease (COPD). People with cachexia experience unintentional weight loss, muscle loss, fatigue, loss of appetite, and reduced quality of life. Multimodal interventions which work synergistically to treat the syndrome could lead to benefits.\n\nObjectives\nTo assess the benefits and harms of multimodal interventions aimed at alleviating or stabilising cachexia in people with a chronic illness.\n\nSearch methods\nWe searched CENTRAL, MEDLINE, Embase, PsycINFO, and two trials registers in July 2024, together with reference checking, citation searching, and contact with study authors to identify studies.\n\nSelection criteria\nWe included randomised controlled trials (RCTs) in adults with or at risk of 

### References or gold standard

In [18]:
import os
import glob

# Load references results
references = {}
references_path = "outputs/original"

# Get all JSON files in the references directory
json_files = glob.glob(os.path.join(references_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_review_id and pls result
        if 'cochrane_review_id' in data and 'pls' in data:
            cochrane_review_id = data['cochrane_review_id']
            pls_result = data['pls']
            references[cochrane_review_id] = pls_result

# Print the first 3 key-value pairs
list(references.items())[:3]

[('CD015749.PUB2',
  "What are the benefits and risks of using a combination of treatments to manage cachexia (disease‐related wasting)?\n\nKey messages:\n• We do not know if combinations of two or more treatments (of medication, diet, and exercise) benefit people who have or are at risk of developing cachexia (disease‐related wasting). This is because there are currently not enough robust studies in this area.\n• We need future research to increase our confidence in the evidence by conducting better designed and larger studies.\nWhat is cachexia?\n\nCachexia is a complex metabolic syndrome that occurs in people with long‐term illnesses (known as chronic illnesses), such as cancer, HIV/AIDS, chronic kidney disease, heart disease, and chronic obstructive pulmonary disease (COPD).\nPeople who have cachexia may:\n• lose weight unintentionally;• lose muscle;• feel tired, weak, or both;• lose their appetite.\nCachexia affects well‐being and can be life‐threatening.\nHow is cachexia treated?

### Baseline (simple LLM with a single one prompt)

In [19]:
# Load baseline results
baseline = {}
baseline_path = "outputs/baseline"

# Get all JSON files in the baseline directory
json_files = glob.glob(os.path.join(baseline_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            baseline[cochrane_id] = response

# Print the first 3 key-value pairs
list(baseline.items())[:3]

[('CD015749.PUB2',
  "Plain Language Summary Title\nCan combining different treatments help people with cachexia?\n\nKey Messages\n– Cachexia is a condition that causes serious weight loss, muscle loss, and tiredness. It can happen to people with long-term illnesses like cancer or heart disease.\n– We looked at studies that combined different types of treatments for cachexia, such as medicines, diet changes, and exercise programs.\n– We do not know if using a combination of treatments helps or harms people with cachexia. The studies we found were too small and had problems in how they were done, so we are very uncertain about the results.\n\nWhat is cachexia?\nCachexia is a serious condition that is sometimes called 'wasting syndrome'. It causes people to lose a lot of weight and muscle, even if they are still eating. It is not the same as normal weight loss.\n\nCachexia happens to people who have long-term (chronic) illnesses, such as cancer, heart failure, kidney disease, or HIV/AIDS

### Gemini 2.5 Pro

In [20]:
# Load Gemini results
gemini = {}
gemini_path = "outputs/gemini"

# Get all JSON files in the gemini directory
json_files = glob.glob(os.path.join(gemini_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            gemini[cochrane_id] = response

# Print the first 3 key-value pairs
list(gemini.items())[:3]

[('CD015749.PUB2',
  "Can combining treatments like medicine, diet, and exercise help people with cachexia (severe weight and muscle loss from long-term illness)?\n\nKey messages\n- We do not know if combining treatments like medicine, diet, and exercise helps people with cachexia (severe weight and muscle loss from long-term illness). There was not enough good-quality evidence to be certain about the benefits or unwanted effects of these combined treatments.\n- Better-designed studies that include more people and last long enough are needed to find out if combining treatments can help manage cachexia.\n\nWhat is cachexia?\nPeople with long-term illnesses such as cancer, HIV/AIDS, kidney disease, and heart disease can develop a condition called cachexia. People with cachexia have severe, unintentional weight loss, including the loss of muscle. They may also have tiredness, loss of appetite, and a lower quality of life (how they feel and function day to day). Because cachexia has many c

### GPT-5

In [21]:
# Load gpt_5 results
gpt_5 = {}
gpt_5_path = "outputs/gpt-5"

# Get all JSON files in the gpt_5 directory
json_files = glob.glob(os.path.join(gpt_5_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            gpt_5[cochrane_id] = response

# Print the first 3 key-value pairs
list(gpt_5.items())[:3]

[('CD015749.PUB2',
  'Title\nDo combined treatments (medicine, nutrition, and exercise) work better than usual care or a single treatment for people with cachexia (a condition with severe, unintentional weight loss, including loss of muscle and fat, often due to long-term illness)?\n\nKey messages\n- We found too little good evidence to say if programs that combine medicine, nutrition support, and exercise work better than usual care (the standard care people would normally get) or a single treatment. Short-term effects on daily function, muscle strength, weight, symptoms, and overall wellbeing (quality of life: how someone feels and works day to day) were unclear. Side effects or harms (adverse events: an unwanted event that causes harm) were tracked but did not show clear differences.\n- Most trials were small and short (treatment lasted 6 weeks to about 3–7 months). They used different mixes and doses of medicine, nutrition (how the body takes in and uses food and nutrients), and ex

### GPT-OSS-120B

In [22]:
# Load gpt_oss_120b results
gpt_oss_120b = {}
gpt_oss_120b_path = "outputs/gpt-oss-120b"

# Get all JSON files in the gpt_oss_120b directory
json_files = glob.glob(os.path.join(gpt_oss_120b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            gpt_oss_120b[cochrane_id] = response

# Print the first 3 key-value pairs
list(gpt_oss_120b.items())[:3]

[('CD015749.PUB2',
  'Do combined treatments (drugs, nutrition, and exercise) improve muscle loss and weight loss (cachexia) in people with chronic illnesses?\n\nKey messages  \n- Current research does not show a clear benefit of combined treatments for muscle loss, weight loss, or related outcomes in people with chronic illnesses, and data on possible harms are very limited.  \n- The nine studies we found were few, mainly involved people with cancer, and were small or had methodological problems, so the results are uncertain for other conditions such as chronic obstructive pulmonary disease (a type of lung disease marked by permanent damage to lung tissue that makes breathing hard), chronic kidney disease (a condition in which the kidneys are damaged slowly over time and cannot remove waste and extra water from the blood), or HIV/AIDS.  \n- Larger, well‑designed trials that follow participants for longer periods are needed to determine whether combined treatments can safely improve mu

## Test execution

### Reference

In [25]:
# Extract references and original text with matching keys
ground_truth_reference = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        ground_truth_reference.append(references[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between original and references")

Found 100 matching entries between original and references


In [None]:
# Calculate BERTScore using the function for baseline (gold standard and original text)
ground_truth_bertscore = get_bertscore(ground_truth_reference, ground_truth_original, bertscore)
print(f"Ground Truth BERTScore F1: {ground_truth_bertscore:.4f}")

# Calculate MeaningBERT using the function for ground truth (gold standard and original text)
ground_truth_meaningbert = get_meaningbert_score(ground_truth_reference, ground_truth_original, meaning_bert)
print(f"Ground Truth MeaningBERT score: {ground_truth_meaningbert:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline BERTScore F1: 0.8482
Baseline MeaningBERT score: 0.6825


In [None]:
ground_truth_alignscore = get_alignscore(ground_truth_reference, ground_truth_original, alignscore_evaluator)
print(f"Baseline AlignScore: {ground_truth_alignscore:.4f}")

Computing AlignScore:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1687 > 512). Running this sequence through the model will result in indexing errors
Computing AlignScore: 100%|██████████| 100/100 [43:17<00:00, 25.98s/it]

Baseline AlignScore: 0.7551





### Baseline (simple agent)

In [26]:
# Extract predictions and original with matching keys
baseline_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        baseline_predictions.append(baseline[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between baseline and original")

Found 100 matching entries between baseline and original


In [27]:
# Calculate BERTScore using the function
baseline_bertscore = get_bertscore(baseline_predictions, ground_truth_original, bertscore)
print(f"Baseline BERTScore F1: {baseline_bertscore:.4f}")

baseline_meaningbert = get_meaningbert_score(baseline_predictions, ground_truth_original, meaning_bert)
print(f"Baseline MeaningBERT score: {baseline_meaningbert:.4f}")

baseline_bertscore_reference = get_bertscore(baseline_predictions, ground_truth_reference, bertscore)
print(f"Baseline BERTScore F1 (reference): {baseline_bertscore_reference:.4f}")

baseline_meaningbert_reference = get_meaningbert_score(baseline_predictions, ground_truth_reference, meaning_bert)
print(f"Baseline MeaningBERT score (reference): {baseline_meaningbert_reference:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline BERTScore F1: 0.8340
Baseline MeaningBERT score: 0.5828
Baseline BERTScore F1 (reference): 0.8696
Baseline MeaningBERT score (reference): 0.7161


In [None]:
baseline_pls = get_pls_evaluation(baseline_predictions)

Evaluating with PLS API: 100%|██████████| 100/100 [00:19<00:00,  5.15it/s]


{'readability': {'flesch_kincaid_grade': {'mean': np.float64(8.72),
   'std': np.float64(0.88),
   'formatted': '8.72 ± 0.88'},
  'automated_readability_index': {'mean': np.float64(8.68),
   'std': np.float64(0.9),
   'formatted': '8.68 ± 0.90'},
  'coleman_liau_index': {'mean': np.float64(8.94),
   'std': np.float64(0.82),
   'formatted': '8.94 ± 0.82'},
  'flesch_reading_ease': {'mean': np.float64(64.48),
   'std': np.float64(5.17),
   'formatted': '64.48 ± 5.17'},
  'gunning_fog_index': {'mean': np.float64(12.55),
   'std': np.float64(1.06),
   'formatted': '12.55 ± 1.06'},
  'lix': {'mean': np.float64(41.89),
   'std': np.float64(2.76),
   'formatted': '41.89 ± 2.76'},
  'smog_index': {'mean': np.float64(12.31),
   'std': np.float64(1.01),
   'formatted': '12.31 ± 1.01'},
  'rix': {'mean': np.float64(4.29),
   'std': np.float64(0.52),
   'formatted': '4.29 ± 0.52'},
  'dale_chall_readability': {'mean': np.float64(6.28),
   'std': np.float64(0.4),
   'formatted': '6.28 ± 0.40'}},
 '

In [31]:
readability_table = create_readability_table(
    model_results=[baseline_pls],
    model_names=['baseline']
)

readability_table

Unnamed: 0,model,Kincaid,ARI,Coleman-Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex
0,baseline,8.72 ± 0.88,8.68 ± 0.90,8.94 ± 0.82,64.48 ± 5.17,12.55 ± 1.06,41.89 ± 2.76,12.31 ± 1.01,4.29 ± 0.52,6.28 ± 0.40


In [32]:
# For Baseline predictions vs original
baseline_alignscore = get_alignscore(baseline_predictions, ground_truth_original, alignscore_evaluator)
print(f"Baseline AlignScore: {baseline_alignscore:.4f}")

# For Baseline predictions vs reference
baseline_alignscore_reference = get_alignscore(baseline_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Baseline AlignScore (reference): {baseline_alignscore_reference:.4f}")

Computing AlignScore:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1687 > 512). Running this sequence through the model will result in indexing errors
Computing AlignScore: 100%|██████████| 100/100 [45:17<00:00, 27.17s/it]


Baseline AlignScore: 0.7436


Computing AlignScore: 100%|██████████| 100/100 [27:29<00:00, 16.49s/it]

Baseline AlignScore (reference): 0.6900





### Gemini 2.5 Pro

In [None]:
# Extract predictions and original with matching keys
gemini_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        gemini_predictions.append(gemini[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between gemini and original")

Found 100 matching entries between gemini and original


In [None]:
# Calculate BERTScore using the function
gemini_bertscore = get_bertscore(gemini_predictions, ground_truth_original, bertscore)
print(f"Gemini BERTScore F1: {gemini_bertscore:.4f}")

gemini_meaningbert = get_meaningbert_score(gemini_predictions, ground_truth_original, meaning_bert)
print(f"Gemini MeaningBERT score: {gemini_meaningbert:.4f}")

gemini_bertscore_reference = get_bertscore(gemini_predictions, ground_truth_reference, bertscore)
print(f"Gemini BERTScore F1 (reference): {gemini_bertscore_reference:.4f}")

gemini_meaningbert_reference = get_meaningbert_score(gemini_predictions, ground_truth_reference, meaning_bert)
print(f"Gemini MeaningBERT score (reference): {gemini_meaningbert_reference:.4f}")

Gemini BERTScore F1: 0.8419
Gemini MeaningBERT score: 0.5820
Gemini BERTScore F1 (reference): 0.8712
Gemini MeaningBERT score (reference): 0.6850


In [None]:
# For Gemini predictions vs original
gemini_alignscore = get_alignscore(gemini_predictions, ground_truth_original, alignscore_evaluator)
print(f"Gemini AlignScore: {gemini_alignscore:.4f}")

# For Gemini predictions vs reference
gemini_alignscore_reference = get_alignscore(gemini_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Gemini AlignScore (reference): {gemini_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [42:34<00:00, 25.54s/it]


Gemini AlignScore: 0.7893


Computing AlignScore: 100%|██████████| 100/100 [28:24<00:00, 17.05s/it]

Gemini AlignScore (reference): 0.7189





### GPT-OSS 120B

In [None]:
# Extract predictions and original with matching keys
gpt_oss_120b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        gpt_oss_120b_predictions.append(gpt_oss_120b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between gpt-oss-120b and original")

Found 100 matching entries between gpt-oss-120b and original


In [None]:
# Calculate BERTScore using the function
gpt_oss_120b_bertscore = get_bertscore(gpt_oss_120b_predictions, ground_truth_original, bertscore)
print(f"GPT-OSS-120B BERTScore F1: {gpt_oss_120b_bertscore:.4f}")

gpt_oss_120b_meaningbert = get_meaningbert_score(gpt_oss_120b_predictions, ground_truth_original, meaning_bert)
print(f"GPT-OSS-120B MeaningBERT score: {gpt_oss_120b_meaningbert:.4f}")

gpt_oss_120b_bertscore_reference = get_bertscore(gpt_oss_120b_predictions, ground_truth_reference, bertscore)
print(f"GPT-OSS-120B BERTScore F1 (reference): {gpt_oss_120b_bertscore_reference:.4f}")

gpt_oss_120b_meaningbert_reference = get_meaningbert_score(gpt_oss_120b_predictions, ground_truth_reference, meaning_bert)
print(f"GPT-OSS-120B MeaningBERT score (reference): {gpt_oss_120b_meaningbert_reference:.4f}")

GPT-OSS-120B BERTScore F1: 0.8311
GPT-OSS-120B MeaningBERT score: 0.6408
GPT-OSS-120B BERTScore F1 (reference): 0.8556
GPT-OSS-120B MeaningBERT score (reference): 0.7197


In [None]:
gpt_oss_120b_alignscore = get_alignscore(gpt_oss_120b_predictions, ground_truth_original, alignscore_evaluator)
print(f"GPT-OSS-120B AlignScore: {gpt_oss_120b_alignscore:.4f}")

gpt_oss_120b_alignscore_reference = get_alignscore(gpt_oss_120b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"GPT-OSS-120B AlignScore (reference): {gpt_oss_120b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [42:49<00:00, 25.70s/it]


GPT-OSS-120B AlignScore: 0.7925


Computing AlignScore: 100%|██████████| 100/100 [37:13<00:00, 22.33s/it]

GPT-OSS-120B AlignScore (reference): 0.6713





### GPT-5

In [None]:
# Extract predictions and references with matching keys
gpt_5_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        gpt_5_predictions.append(gpt_5[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between gpt-5 and original")

Found 100 matching entries between gpt-5 and original


In [None]:
# Calculate BERTScore using the function
gpt_5_bertscore = get_bertscore(gpt_5_predictions, ground_truth_original, bertscore)
print(f"GPT-5 BERTScore F1: {gpt_5_bertscore:.4f}")

# Calculate MeaningBERT score for GPT-5 and original
gpt_5_meaningbert = get_meaningbert_score(gpt_5_predictions, ground_truth_original, meaning_bert)
print(f"GPT-5 MeaningBERT score: {gpt_5_meaningbert:.4f}")

# Calculate BERTScore for GPT-5 and reference
gpt_5_bertscore_reference = get_bertscore(gpt_5_predictions, ground_truth_reference, bertscore)
print(f"GPT-5 BERTScore F1 (reference): {gpt_5_bertscore_reference:.4f}")

# Calculate MeaningBERT score for GPT-5 and reference
gpt_5_meaningbert_reference = get_meaningbert_score(gpt_5_predictions, ground_truth_reference, meaning_bert)
print(f"GPT-5 MeaningBERT score (reference): {gpt_5_meaningbert_reference:.4f}")

GPT-5 BERTScore F1: 0.8283
GPT-5 MeaningBERT score: 0.5632
GPT-5 BERTScore F1 (reference): 0.8519
GPT-5 MeaningBERT score (reference): 0.6414


In [None]:
gpt_5_alignscore = get_alignscore(gpt_5_predictions, ground_truth_original, alignscore_evaluator)
print(f"GPT-5 AlignScore: {gpt_5_alignscore:.4f}")

gpt_5_alignscore_reference = get_alignscore(gpt_5_predictions, ground_truth_reference, alignscore_evaluator)
print(f"GPT-5 AlignScore (reference): {gpt_5_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [1:00:42<00:00, 36.43s/it]


GPT-5 AlignScore: 0.7647


Computing AlignScore: 100%|██████████| 100/100 [40:48<00:00, 24.49s/it]

GPT-5 AlignScore (reference): 0.6296





### Readability

In [None]:
# Evaluate each model's predictions
print("Evaluating Reference (Gold Standard)...")
reference_pls = get_pls_evaluation(ground_truth_reference)

print("Evaluating GPT-5...")
gpt5_pls = get_pls_evaluation(gpt_5_predictions)

print("Evaluating Gemini...")
gemini_pls = get_pls_evaluation(gemini_predictions)

print("Evaluating GPT-OSS-120B...")
gpt_oss_120b_pls = get_pls_evaluation(gpt_oss_120b_predictions)

# Create readability metrics table
readability_table = create_readability_table(
    model_results=[reference_pls, gpt5_pls, gemini_pls, gpt_oss_120b_pls],
    model_names=['Reference', 'GPT-5', 'Gemini', 'GPT-OSS-120B']
)

print("\nReadability Metrics:")
print(readability_table)

# Create summary percentages table
summary_table = create_summary_table(
    model_results=[reference_pls, gpt5_pls, gemini_pls, gpt_oss_120b_pls],
    model_names=['Reference', 'GPT-5', 'Gemini', 'GPT-OSS-120B']
)

print("\nSummary Percentages:")
print(summary_table)

# Access individual metrics if needed
print(f"\nGPT-5 Flesch-Kincaid Grade: {gpt5_pls['readability']['flesch_kincaid_grade']['mean']:.2f}")
print(f"GPT-5 Excellence Rate: {gpt5_pls['summary']['excellence_rate']:.2f}%")

Evaluating Reference (Gold Standard)...


Evaluating with PLS API: 100%|██████████| 100/100 [00:30<00:00,  3.26it/s]


Evaluating GPT-5...


Evaluating with PLS API: 100%|██████████| 100/100 [00:36<00:00,  2.78it/s]


Evaluating Gemini...


Evaluating with PLS API: 100%|██████████| 100/100 [00:31<00:00,  3.14it/s]


Evaluating GPT-OSS-120B...


Evaluating with PLS API: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Readability Metrics:
          model       Kincaid           ARI  Coleman-Liau FleschReadingEase  \
0     Reference  11.38 ± 1.67  11.46 ± 1.89  11.21 ± 1.52      49.23 ± 9.09   
1         GPT-5   9.04 ± 1.40   9.51 ± 1.50  10.23 ± 1.30      61.31 ± 7.94   
2        Gemini   8.37 ± 1.14   8.31 ± 1.22   9.33 ± 1.06      64.06 ± 6.46   
3  GPT-OSS-120B  10.52 ± 1.39  10.67 ± 1.63  10.86 ± 1.30      53.37 ± 7.52   

  GunningFogIndex           LIX     SMOGIndex          RIX DaleChallIndex  
0    16.21 ± 2.06  50.35 ± 4.87  14.23 ± 1.56  6.07 ± 1.23    7.37 ± 0.62  
1    12.93 ± 1.63  42.24 ± 4.62  13.64 ± 1.44  4.34 ± 0.92    7.15 ± 0.58  
2    12.25 ± 1.36  41.34 ± 4.03  12.00 ± 0.91  4.08 ± 0.74    6.35 ± 0.50  
3    15.84 ± 1.54  48.06 ± 4.29  14.73 ± 1.17  5.53 ± 0.99    7.82 ± 0.48  

Summary Percentages:
          model  Excellent %  Good %  Acceptable %  Poor %  Critical %  \
0     Reference        37.83   17.89         14.67   15.00       14.61   
1         GPT-5        56.61    




In [2]:
df

Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Coleman-Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,Excellence Rate (%)
0,Reference,0.8482,0.6825,0.7551,-,-,-,11.38 ± 1.67,11.46 ± 1.89,11.21 ± 1.52,49.23 ± 9.09,16.21 ± 2.06,50.35 ± 4.87,14.23 ± 1.56,6.07 ± 1.23,7.37 ± 0.62,37.83
1,GPT-5,0.8283,0.5632,0.7647,0.8519,0.6414,0.6296,9.04 ± 1.40,9.51 ± 1.50,10.23 ± 1.30,61.31 ± 7.94,12.93 ± 1.63,42.24 ± 4.62,13.64 ± 1.44,4.34 ± 0.92,7.15 ± 0.58,56.61
2,Gemini,0.8419,0.582,0.7893,0.8712,0.6850,0.7189,8.37 ± 1.14,8.31 ± 1.22,9.33 ± 1.06,64.06 ± 6.46,12.25 ± 1.36,41.34 ± 4.03,12.00 ± 0.91,4.08 ± 0.74,6.35 ± 0.50,63.78
3,GPT-OSS-120B,0.8311,0.6408,0.7925,0.8556,0.6713,0.6713,10.52 ± 1.39,10.67 ± 1.63,10.86 ± 1.30,53.37 ± 7.52,15.84 ± 1.54,48.06 ± 4.29,14.73 ± 1.17,5.53 ± 0.99,7.82 ± 0.48,43.44


In [66]:
table = pd.DataFrame({
    'Model': ['Reference', 'GPT-5', 'Gemini', 'GPT-OSS-120B'],
    'BERTScore F1 (original)': [baseline_bertscore, gpt_5_bertscore, gemini_bertscore, gpt_oss_120b_bertscore],
    'MeaningBERT (original)': [baseline_meaningbert, gpt_5_meaningbert, gemini_meaningbert, gpt_oss_120b_meaningbert],
    'AlignScore (original)': [baseline_alignscore, gpt_5_alignscore, gemini_alignscore, gpt_oss_120b_alignscore],
    'BERTScore F1 (reference)': ["-", gpt_5_bertscore_reference, gemini_bertscore_reference, gpt_oss_120b_bertscore_reference],
    'MeaningBERT (reference)': ["-", gpt_5_meaningbert_reference, gemini_meaningbert_reference, gpt_oss_120b_alignscore_reference],
    'AlignScore (reference)': ["-", gpt_5_alignscore_reference, gemini_alignscore_reference, gpt_oss_120b_alignscore_reference],
    'Flesch-Kincaid': [reference_pls['readability']['flesch_kincaid_grade']['mean'],
                        gpt5_pls['readability']['flesch_kincaid_grade']['mean'],
                        gemini_pls['readability']['flesch_kincaid_grade']['mean'],
                        gpt_oss_120b_pls['readability']['flesch_kincaid_grade']['mean']],
    'ARI': [reference_pls['readability']['automated_readability_index']['mean'],
            gpt5_pls['readability']['automated_readability_index']['mean'],
            gemini_pls['readability']['automated_readability_index']['mean'],
            gpt_oss_120b_pls['readability']['automated_readability_index']['mean']],
    'Excellence Rate (%)': [reference_pls['summary']['excellence_rate'],
                            gpt5_pls['summary']['excellence_rate'],
                            gemini_pls['summary']['excellence_rate'],
                            gpt_oss_120b_pls['summary']['excellence_rate']]
})

table

Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Excellence Rate (%)
0,Reference,0.8482,0.6825,0.7551,-,-,-,11.38,11.46,37.83
1,GPT-5,0.8283,0.5632,0.7647,0.8519,0.6414,0.6296,9.04,9.51,56.61
2,Gemini,0.8419,0.582,0.7893,0.8712,0.685,0.7189,8.37,8.31,63.78
3,GPT-OSS-120B,0.8311,0.6408,0.7925,0.8556,0.6713,0.6713,10.52,10.67,43.44


In [65]:
%pip install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m[31mERROR: Could not find a version that satisfies the requirement openpyxl (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for openpyxl[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import datetime

# Create timestamp for file naming
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Save as CSV
table.to_csv(f'outputs/evaluation_results_{timestamp}.csv', index=False)
print(f"Results saved to outputs/evaluation_results_{timestamp}.csv")

# Save as Excel with formatting
with pd.ExcelWriter(f'outputs/evaluation_results_{timestamp}.xlsx', engine='openpyxl') as writer:
    table.to_excel(writer, sheet_name='Evaluation Metrics', index=False)

    # Also save the readability table in a separate sheet
    readability_table.to_excel(writer, sheet_name='Readability Details', index=False)

    # Get the workbook and worksheet to apply formatting
    workbook = writer.book
    worksheet = writer.sheets['Evaluation Metrics']

    # Auto-adjust column widths
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 30)
        worksheet.column_dimensions[column_letter].width = adjusted_width

print(f"Results saved to outputs/evaluation_results_{timestamp}.xlsx")

# Save as JSON for programmatic access
results_dict = table.to_dict('records')
with open(f'outputs/evaluation_results_{timestamp}.json', 'w') as f:
    json.dump(results_dict, f, indent=2)
print(f"Results saved to outputs/evaluation_results_{timestamp}.json")

# Save as Markdown for documentation
markdown_table = table.to_markdown(index=False)
with open(f'outputs/evaluation_results_{timestamp}.md', 'w') as f:
    f.write("# Evaluation Results\n\n")
    f.write(f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    f.write(markdown_table)
print(f"Results saved to outputs/evaluation_results_{timestamp}.md")

# Save as LaTeX for papers
latex_table = table.to_latex(index=False, caption="Model Evaluation Results", label="tab:eval_results")
with open(f'outputs/evaluation_results_{timestamp}.tex', 'w') as f:
    f.write(latex_table)
print(f"Results saved to outputs/evaluation_results_{timestamp}.tex")

# Display the table
print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80)
print(table.to_string(index=False))

Results saved to outputs/evaluation_results_20250819_114333.csv
Results saved to outputs/evaluation_results_20250819_114333.json


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.