In [6]:
%pip install --upgrade huggingface_hub
%pip install evaluate
%pip install bert_score

Collecting huggingface_hub
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.35.3-py3-none-any.whl (564 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.3/564.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.34.4
    Uninstalling huggingface-hub-0.34.4:
      Successfully uninstalled huggingface-hub-0.34.4
Successfully installed huggingface_hub-0.35.3
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [1]:
from huggingface_hub import login
from sklearn.metrics import f1_score,root_mean_squared_error
from transformers import pipeline,AutoTokenizer
import evaluate
import numpy as np
import pandas as pd
import json

from pls_evaluator import get_pls_evaluation, create_readability_table, create_summary_table

  from .autonotebook import tqdm as notebook_tqdm


### - Loading Results

In [None]:
import pandas as pd

# Load the integrated evaluation results
df = pd.read_csv('outputs/integrated_evaluation_results.csv')

# Remove standard deviation (the value after the ±)


# Display the table
print(f"Loaded table with {len(df)} models and {len(df.columns)} columns")
df

Loaded table with 5 models and 17 columns


Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Coleman-Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,Excellence Rate (%)
0,human (reference),0.8482,0.6825,0.7551,-,-,-,11.38 ± 1.67,11.46 ± 1.89,11.21 ± 1.52,49.23 ± 9.09,16.21 ± 2.06,50.35 ± 4.87,14.23 ± 1.56,6.07 ± 1.23,7.37 ± 0.62,37.83
1,gemini-2.5-pro (baseline),0.834,0.5828,0.7436,0.8696,0.7161,0.6900,8.72 ± 0.88,8.68 ± 0.90,8.94 ± 0.82,64.48 ± 5.17,12.55 ± 1.06,41.89 ± 2.76,12.31 ± 1.01,4.29 ± 0.52,6.28 ± 0.40,61.67
2,gemini-2.5-pro,0.8419,0.582,0.7893,0.8712,0.6850,0.7189,8.37 ± 1.14,8.31 ± 1.22,9.33 ± 1.06,64.06 ± 6.46,12.25 ± 1.36,41.34 ± 4.03,12.00 ± 0.91,4.08 ± 0.74,6.35 ± 0.50,63.78
3,gpt-5,0.8283,0.5632,0.7647,0.8519,0.6414,0.6296,9.04 ± 1.40,9.51 ± 1.50,10.23 ± 1.30,61.31 ± 7.94,12.93 ± 1.63,42.24 ± 4.62,13.64 ± 1.44,4.34 ± 0.92,7.15 ± 0.58,56.61
4,gpt-oss-120b,0.8311,0.6408,0.7925,0.8556,0.6713,0.6713,10.52 ± 1.39,10.67 ± 1.63,10.86 ± 1.30,53.37 ± 7.52,15.84 ± 1.54,48.06 ± 4.29,14.73 ± 1.17,5.53 ± 0.99,7.82 ± 0.48,43.44


### - AlignScore

In [2]:
import importlib
import alignscore_evaluator

# Reload the module to get the updated code
importlib.reload(alignscore_evaluator)

# Re-import the updated classes/functions
from alignscore_evaluator import AlignScoreEvaluator, get_alignscore

# Re-initialize the evaluator with the new optimized code
alignscore_evaluator = AlignScoreEvaluator()

AlignScore using device: mps


### - MeaningBERT

In [3]:
meaning_bert = evaluate.load("davebulaval/meaningbert")

In [4]:
def get_meaningbert_score(predictions, references, model):
  assert len(predictions) == len(references), "The number of references is different of the number of predictions."
  result=[]
  for pred, ref in zip(predictions, references):
        score = model.compute(predictions=[pred], references=[ref])
        result.append(score["scores"][0])
  return round(np.mean(result)/100, 4)

### - BERTScore

In [5]:
bertscore = evaluate.load("bertscore")

In [6]:
def get_bertscore(predictions, references, scorename=bertscore, scoretype="f1",  modelname=None):
  result = scorename.compute(references=references, predictions=predictions, lang="en") # , model_type=modelname)
  return round(np.mean(result[scoretype]), 4)

### - Readability

## Loading texts

### Original Abstracts

In [7]:
import os
import glob

# Load original results
original = {}
original_path = "outputs/original"

# Get all JSON files in the original directory
json_files = glob.glob(os.path.join(original_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_review_id and pls result
        if 'cochrane_review_id' in data and 'pls' in data:
            cochrane_review_id = data['cochrane_review_id']
            abstract = data['title'] + '\n' + data['abstract']
            original[cochrane_review_id] = abstract

# Print the first 3 key-value pairs
list(original.items())[:3]

[('CD015749.PUB2',
  "Multimodal interventions for cachexia management\nBackground\nCachexia (disease‐related wasting) is a complex metabolic syndrome which occurs in people with chronic illnesses, including cancer, HIV/AIDS, kidney disease, heart disease, and chronic obstructive pulmonary disease (COPD). People with cachexia experience unintentional weight loss, muscle loss, fatigue, loss of appetite, and reduced quality of life. Multimodal interventions which work synergistically to treat the syndrome could lead to benefits.\n\nObjectives\nTo assess the benefits and harms of multimodal interventions aimed at alleviating or stabilising cachexia in people with a chronic illness.\n\nSearch methods\nWe searched CENTRAL, MEDLINE, Embase, PsycINFO, and two trials registers in July 2024, together with reference checking, citation searching, and contact with study authors to identify studies.\n\nSelection criteria\nWe included randomised controlled trials (RCTs) in adults with or at risk of 

### References or gold standard

In [8]:
import os
import glob

# Load references results
references = {}
references_path = "outputs/original"

# Get all JSON files in the references directory
json_files = glob.glob(os.path.join(references_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_review_id and pls result
        if 'cochrane_review_id' in data and 'pls' in data:
            cochrane_review_id = data['cochrane_review_id']
            pls_result = data['pls']
            references[cochrane_review_id] = pls_result

# Print the first 3 key-value pairs
list(references.items())[:3]

[('CD015749.PUB2',
  "What are the benefits and risks of using a combination of treatments to manage cachexia (disease‐related wasting)?\n\nKey messages:\n• We do not know if combinations of two or more treatments (of medication, diet, and exercise) benefit people who have or are at risk of developing cachexia (disease‐related wasting). This is because there are currently not enough robust studies in this area.\n• We need future research to increase our confidence in the evidence by conducting better designed and larger studies.\nWhat is cachexia?\n\nCachexia is a complex metabolic syndrome that occurs in people with long‐term illnesses (known as chronic illnesses), such as cancer, HIV/AIDS, chronic kidney disease, heart disease, and chronic obstructive pulmonary disease (COPD).\nPeople who have cachexia may:\n• lose weight unintentionally;• lose muscle;• feel tired, weak, or both;• lose their appetite.\nCachexia affects well‐being and can be life‐threatening.\nHow is cachexia treated?

### Baselines (simple LLM with a single one prompt)

#### Gemini 2.5 Pro

In [9]:
# Load baseline_gemini_2_5_pro results
baseline_gemini_2_5_pro = {}
baseline_gemini_2_5_pro_path = "outputs/baseline_gemini_2_5_pro"

# Get all JSON files in the baseline_gemini_2_5_pro directory
json_files = glob.glob(os.path.join(baseline_gemini_2_5_pro_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            baseline_gemini_2_5_pro[cochrane_id] = response

# Print the first 3 key-value pairs
list(baseline_gemini_2_5_pro.items())[:3]

[('CD015749.PUB2',
  "Can combining different treatments help people with cachexia (disease-related wasting)?\n\n**Key messages**\n\n– We do not know if combining treatments, such as medicines, nutritional support, and exercise, helps people with cachexia. The available evidence is very uncertain, so we cannot say if these combined treatments improve physical function, strength, appetite, or quality of life. It is also unclear if they cause any unwanted effects.\n\n– The studies we found were small and had weaknesses in how they were carried out.\n\n– We need larger, better-designed studies to find out if combined treatments are helpful and safe for people with cachexia.\n\n**What is cachexia?**\n\nCachexia is a condition also known as disease-related wasting. It can happen to people with long-term (chronic) illnesses like cancer, heart disease, kidney disease, or HIV/AIDS.\n\nPeople with cachexia lose weight and muscle without trying to. They often feel very tired, lose their appetite

In [10]:
# Load baseline_gpt_5 results
baseline_gpt_5 = {}
baseline_gpt_5_path = "outputs/baseline_gpt_5"

# Get all JSON files in the baseline_gpt_5 directory
json_files = glob.glob(os.path.join(baseline_gpt_5_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            baseline_gpt_5[cochrane_id] = response

# Print the first 3 key-value pairs
list(baseline_gpt_5.items())[:3]

[('CD015749.PUB2',
  'Plain Language Summary Title\nWhat are the benefits and risks of combined treatments (multimodal interventions) for cachexia (disease-related wasting) in people with chronic illnesses?\n\nKey Messages\n– We did not find enough good-quality evidence to show whether combined treatments (for example, medicines plus nutrition support and/or exercise) help people with cachexia. It is unclear if they improve physical function, strength, weight, appetite, fatigue, blood test markers, or quality of life. It is also unclear if they change the chance of unwanted effects (harms).\n– The studies were small and short, used different mixes of treatments, and included people with cancer, chronic obstructive pulmonary disease (COPD), chronic kidney disease, and HIV/AIDS. Some important results were not reported, and some study data could not be used.\n– Future research should include larger, longer, well-designed studies that fairly compare treatments by randomly assigning people

In [11]:
# Load baseline_gpt_oss_120b results
baseline_gpt_oss_120b = {}
baseline_gpt_oss_120b_path = "outputs/baseline_gpt_oss_120b"

# Get all JSON files in the baseline_gpt_oss_120b directory
json_files = glob.glob(os.path.join(baseline_gpt_oss_120b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            baseline_gpt_oss_120b[cochrane_id] = response

# Print the first 3 key-value pairs
list(baseline_gpt_oss_120b.items())[:3]

[('CD015749.PUB2',
  'Plain Language Summary Title  \nIs a combined program of medicines, nutrition advice, and exercise helpful for people with disease‑related wasting (cachexia)?\n\nKey Messages  \n- We found no clear evidence that a combined program of medicines, nutrition, and exercise improves strength, physical ability, weight, appetite, fatigue, quality of life, or other health measures in people with cachexia, and the studies did not report any consistent harmful effects.  \n- The evidence comes from nine small studies that included about 900 adults with cancer, COPD, kidney disease, or HIV/AIDS; all of these studies had important limitations, so we are not confident in the results.  \n- More large, well‑designed studies are needed to determine whether such combined programs can really help people with cachexia.\n\nWhat is cachexia and why does it matter?  \nCachexia (disease‑related wasting) is a condition where people with long‑term illnesses lose weight and muscle even when 

In [34]:
# Load baseline_gpt_oss_20b results
baseline_gpt_oss_20b = {}
baseline_gpt_oss_20b_path = "outputs/baseline_gpt_oss_20b"

# Get all JSON files in the baseline_gpt_oss_20b directory
json_files = glob.glob(os.path.join(baseline_gpt_oss_20b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            baseline_gpt_oss_20b[cochrane_id] = response

# Print the first 3 key-value pairs
list(baseline_gpt_oss_20b.items())[:3]

[('CD015749.PUB2',
  'Can multimodal treatments help people with cachexia from chronic illnesses?\n\nKey Messages:\n- We found no clear evidence that combining medicines, nutrition, and exercise improves muscle strength, weight, appetite, or quality of life for people with cachexia. The studies were small and had design problems, so we are not sure.\n- The evidence is very uncertain, so we cannot say whether these treatments are safe or harmful.\n- More well‑designed studies with enough participants and longer follow‑up are needed to know if multimodal treatments work.\n\nIntroduction\nWhat is cachexia and why is it a problem?\nCachexia is a condition where people lose muscle and often weight because of a long‑term illness such as cancer, heart failure, kidney disease or chronic lung disease. It makes everyday activities harder, reduces quality of life and can shorten survival.\n\nWhat did we want to find out?\nWe wanted to know whether a combination of medicines, special nutrition and

In [97]:
# Load baseline_llama_3_3_70b results
baseline_llama_3_3_70b = {}
baseline_llama_3_3_70b_path = "outputs/baseline_llama_3_3_70b"

# Get all JSON files in the baseline_llama_3_3_70b directory
json_files = glob.glob(os.path.join(baseline_llama_3_3_70b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            baseline_llama_3_3_70b[cochrane_id] = response

# Print the first 3 key-value pairs
list(baseline_llama_3_3_70b.items())[:3]

[('CD015749.PUB2',
  'What are the benefits and harms of multimodal interventions for managing cachexia in people with chronic illnesses?\n\n### Key Messages\n* Due to a lack of robust evidence, the benefits and harms of multimodal interventions for managing cachexia in people with chronic illnesses are unclear. The evidence was very uncertain, and we could not find any clear evidence for an effect of multimodal interventions on physical function, strength, adverse events, body composition, weight, quality of life, appetite, fatigue, or biochemical markers.\n* We have little confidence in the evidence because it is possible that people in the studies were aware of which treatment they were getting, and because the studies were very small.\n* Future research in this area should focus on methodologically rigorous, well-powered randomized controlled trials with adequate interaction times to assess the effectiveness of multimodal interventions in managing cachexia across chronic illnesses.

In [41]:
# Load baseline_llama_3_2_3b results
baseline_llama_3_2_3b = {}
baseline_llama_3_2_3b_path = "outputs/baseline_llama_3_2_3b"

# Get all JSON files in the baseline_llama_3_2_3b directory
json_files = glob.glob(os.path.join(baseline_llama_3_2_3b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            baseline_llama_3_2_3b[cochrane_id] = response

# Print the first 3 key-value pairs
list(baseline_llama_3_2_3b.items())[:3]

[('CD015749.PUB2',
  'I can help you transform the Cochrane abstract into a Plain Language Summary (PLS) following the guidelines provided.\n\nHere is the transformed summary:\n\n**What are the benefits and risks of multimodal interventions for managing cachexia?**\n\nCachexia is a complex condition that occurs in people with chronic illnesses, such as cancer, HIV/AIDS, kidney disease, heart disease, and chronic obstructive pulmonary disease (COPD). It can cause unintentional weight loss, muscle loss, fatigue, loss of appetite, and reduced quality of life.\n\n**What did we want to find out?**\n\nWe wanted to find out if multimodal interventions, which combine different treatments such as medication, nutrition, and exercise, can help alleviate or stabilize cachexia in people with chronic illnesses.\n\n**What did we find?**\n\nWe found nine studies that involved 926 adults with cachexia. The studies were conducted in different countries and had different sample sizes. However, we were un

### Agentics

### Gemini 2.5 Pro

In [125]:
# Load Gemini results
gemini = {}
gemini_path = "outputs/agentic_old_gemini_2_5_pro"

# Get all JSON files in the gemini directory
json_files = glob.glob(os.path.join(gemini_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            gemini[cochrane_id] = response

# Print the first 3 key-value pairs
list(gemini.items())[:3]

[('CD015749.PUB2',
  "Can combining treatments like medicine, diet, and exercise help people with cachexia (severe weight and muscle loss from long-term illness)?\n\nKey messages\n\nIt is unclear if combining treatments like medicine, diet, and exercise helps people with cachexia (severe weight and muscle loss from long-term illness). This is because there is not enough good-quality evidence. We found not enough evidence to know if these combined treatments have benefits (like improving strength or weight) or cause unwanted effects.\n\nWe need larger and better-designed studies to find out if combining treatments is helpful and safe for people with cachexia. These studies should also follow people for longer to better understand the treatments' effects.\n\nWhat is disease-related wasting (cachexia)?\n\nPeople with long-term illnesses such as cancer, HIV/AIDS, kidney disease, and heart disease can develop a condition called cachexia. Cachexia is a complex problem of disease-related wast

### GPT-5

In [127]:
# Load gpt_5 results
gpt_5 = {}
gpt_5_path = "outputs/agentic_old_gpt_5"

# Get all JSON files in the gpt_5 directory
json_files = glob.glob(os.path.join(gpt_5_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            gpt_5[cochrane_id] = response

# Print the first 3 key-value pairs
list(gpt_5.items())[:3]

[('CD015749.PUB2',
  'Title: Do combined treatments (medicine, diet, and exercise) help manage disease-related weight and muscle loss (cachexia)?\n\nKey messages\n- We found too little reliable evidence to say that combined treatments (multimodal interventions: a treatment plan that combines 2 or more types of therapy, such as medicines, nutrition, and exercise) work better than usual care (the standard care a patient would normally get) or a single approach (one type of therapy). Unwanted effects (adverse events: an unwanted event that causes harm) were reported in some studies but the evidence is very uncertain.\n- We saw no clear benefits for physical function (how well someone can do everyday activities like walking or climbing stairs), strength (the force muscles can produce), body weight, body composition (the amounts of muscle and fat), quality of life (how someone feels and functions day to day), appetite (desire to eat), fatigue (extreme tiredness), or biochemical markers (mea

### New Agentic Gemini 2.5 Pro

In [128]:
# Load agentic_gemini_2_5_pro results
agentic_gemini_2_5_pro = {}
agentic_gemini_2_5_pro_path = "outputs/agentic_gemini_2_5_pro"

# Get all JSON files in the agentic_gemini_2_5_pro directory
json_files = glob.glob(os.path.join(agentic_gemini_2_5_pro_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            agentic_gemini_2_5_pro[cochrane_id] = response

# Print the first 3 key-value pairs
list(agentic_gemini_2_5_pro.items())[:3]

[('CD015749.PUB2',
  "Can combined medicine, diet, and exercise improve outcomes for people with cachexia?\n\n- Due to a lack of robust evidence, it is unclear whether combining treatments such as medicine, diet, and exercise helps people with cachexia (severe weight and muscle loss caused by long-term illness). The evidence was not strong enough to determine the benefits or unwanted effects of these combined treatments.\n- Larger, well-designed studies are needed to find out if combining different types of treatments can help manage cachexia in people with various long-term illnesses.\n\nWhat is disease-related wasting (cachexia)?\nCachexia is a condition of severe weight and muscle loss, also known as 'disease-related wasting'. It can happen to people with long-term illnesses such as cancer, HIV/AIDS, kidney disease, heart disease, and chronic obstructive pulmonary disease (COPD). People with cachexia lose weight and muscle without meaning to, feel very tired, lose their appetite, an

### GPT-OSS-120B

In [129]:
# Load agentic_gpt_oss_120b results
agentic_gpt_oss_120b = {}
agentic_gpt_oss_120b_path = "outputs/agentic_gpt_oss_120b"

# Get all JSON files in the agentic_gpt_oss_120b directory
json_files = glob.glob(os.path.join(agentic_gpt_oss_120b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            agentic_gpt_oss_120b[cochrane_id] = response

# Print the first 3 key-value pairs
list(agentic_gpt_oss_120b.items())[:3]

[('CD015749.PUB2',
  'Are combined drug, nutrition, and exercise treatments effective for managing weight loss and muscle wasting (cachexia) in people with chronic illnesses?\n\nWhat are the main findings?\n- The combined programmes (multimodal interventions – treatments that combine medicines, special nutrition, and exercise) did not clearly improve weight, muscle loss, physical function, strength, quality of life, or the chance of side‑effects.\n- Comparing these programmes with a single treatment or with a different version of the programme also showed no clear benefits.\n- Most evidence comes from a small number of trials, mainly in people with cancer, and the studies were short and had design weaknesses. Larger, well‑designed trials that follow people for longer and include more chronic conditions are needed.\n\nWhat is cachexia and why does it matter?\nCachexia, also called disease‑related wasting, is a serious condition that can develop in people with long‑term illnesses such as

### GPT-OSS 20b

In [37]:
# Load agentic_gpt_oss_20b results
agentic_gpt_oss_20b = {}
agentic_gpt_oss_20b_path = "outputs/agentic_gpt_oss_20b"

# Get all JSON files in the agentic_gpt_oss_20b directory
json_files = glob.glob(os.path.join(agentic_gpt_oss_20b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            agentic_gpt_oss_20b[cochrane_id] = response

# Print the first 3 key-value pairs
list(agentic_gpt_oss_20b.items())[:3]

[('CD015749.PUB2',
  'Are combined treatments (drugs, nutrition, and exercise) effective for managing cachexia (severe weight and muscle loss) in people with chronic illnesses?\n\nKey messages?\n– Current research does not show clear benefits of combined drug, nutrition and exercise programmes for people with cachexia, and information on possible side‑effects is limited.  \n– Most studies involved people with cancer, lasted only a few weeks to months, and may not apply to other long‑term illnesses or to longer treatment periods.  \n– Larger, well‑designed randomised trials that follow participants for longer are needed to determine whether combined treatments are safe and effective.\n\nWhat is cachexia and why does it matter?\nCachexia, also called disease‑related wasting, is a serious condition that often develops in people with long‑term illnesses such as cancer, HIV/AIDS, kidney disease, heart disease, and chronic obstructive pulmonary disease (COPD). It causes unintentional loss of

### Llama 3.3 70B

In [130]:
# Load agentic_llama_3_3_70b results
agentic_llama_3_3_70b = {}
agentic_llama_3_3_70b_path = "outputs/agentic_llama_3_3_70b"

# Get all JSON files in the agentic_llama_3_3_70b directory
json_files = glob.glob(os.path.join(agentic_llama_3_3_70b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            agentic_llama_3_3_70b[cochrane_id] = response

# Print the first 3 key-value pairs
list(agentic_llama_3_3_70b.items())[:3]

[('CD015749.PUB2',
  'Can combined treatments help manage cachexia better than a single treatment or standard care?\n\nKey messages:\n- We do not know if combined treatments are better than single treatments or standard care for managing cachexia, because the evidence is not strong enough.\n- Multimodal interventions may have some effects on symptoms like fatigue, but we are very uncertain about this.\n- More research is needed to find out if multimodal interventions are effective and safe for people with different types of chronic illnesses.\n\nWhat is cachexia and why is it a problem?\nCachexia is a condition where the body wastes away due to chronic illness, such as cancer or HIV/AIDS. It causes unintentional weight loss, muscle loss, fatigue, loss of appetite, and reduced quality of life. Cachexia is a significant problem because it can lead to a decline in physical function, strength, and overall health.\n\nWhat are multimodal interventions and how might they help with cachexia?\n

### Llama 3.2 3B

In [44]:
# Load agentic_llama_3_2_3b results
agentic_llama_3_2_3b = {}
agentic_llama_3_2_3b_path = "outputs/agentic_llama_3_2_3b"

# Get all JSON files in the agentic_llama_3_2_3b directory
json_files = glob.glob(os.path.join(agentic_llama_3_2_3b_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            agentic_llama_3_2_3b[cochrane_id] = response

# Print the first 3 key-value pairs
list(agentic_llama_3_2_3b.items())[:3]

[('CD015749.PUB2',
  '**Agent stopped due to max iterations.**\n\n- The review found insufficient evidence to support or refute the use of multimodal interventions in managing cachexia. The certainty of the evidence was very low. Methodologically rigorous, well-powered RCTs with adequate interaction times are needed to assess the effectiveness of multimodal interventions in managing cachexia across chronic illnesses.\n- We did not find enough good-quality evidence about the benefits and harms of multimodal interventions for cachexia management in people with chronic illnesses.\n- Future research in this area should focus on options and effects that are important to decision-makers, such as: multimodal interventions that can be used in people with chronic illnesses, and unwanted effects and costs associated with these interventions.\n\n**What is cachexia?**\n\nCachexia is a complex metabolic syndrome that affects people with chronic illnesses, such as cancer, HIV/AIDS, kidney disease, h

### Llama 3.3 70B with Gemini 2.5 Flash as Evaluator

In [131]:
# Load agentic_llama_3_3_70b_evaluator_gemini_2_5_flash results
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash = {}
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_path = "outputs/agentic_llama_3_3_70b_evaluator_gemini_2_5_flash"

# Get all JSON files in the agentic_llama_3_3_70b_evaluator_gemini_2_5_flash directory
json_files = glob.glob(os.path.join(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_path, "*.json"))

for json_file in json_files:
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        # Extract cochrane_id and response
        if 'cochrane_id' in data and 'response' in data and 'output' in data['response']:
            cochrane_id = data['cochrane_id']
            response = data['response']['output']
            agentic_llama_3_3_70b_evaluator_gemini_2_5_flash[cochrane_id] = response

# Print the first 3 key-value pairs
list(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash.items())[:3]

[('CD015749.PUB2',
  'Can combined treatments like medication, nutrition, and exercise help manage cachexia, a condition where the body wastes away due to chronic illness, better than usual care? \n\nWe looked at studies that combined two or more of these approaches to see if they could improve physical function, strength, and quality of life for people with cachexia. \n- Cachexia is a condition where the body wastes away due to chronic illness. Treatments that combine different approaches, like medication, nutrition, and exercise, might help manage it. However, we lack strong evidence to say if these combined treatments are better than usual care for people with chronic illnesses such as cancer, HIV/AIDS, or heart disease.\n- The current evidence does not show clear benefits or harms of multimodal interventions for cachexia management. We are very uncertain about the benefits and harms of these combined treatments. The studies did not show clear effects, either helpful or harmful.\n- 

# Test execution

### Reference

In [24]:
# Extract references and original text with matching keys
ground_truth_reference = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        ground_truth_reference.append(references[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between original and references")

Found 100 matching entries between original and references


In [None]:
# Calculate BERTScore using the function for baseline (gold standard and original text)
ground_truth_bertscore = get_bertscore(ground_truth_reference, ground_truth_original, bertscore)
print(f"Ground Truth BERTScore F1: {ground_truth_bertscore:.4f}")

# Calculate MeaningBERT using the function for ground truth (gold standard and original text)
ground_truth_meaningbert = get_meaningbert_score(ground_truth_reference, ground_truth_original, meaning_bert)
print(f"Ground Truth MeaningBERT score: {ground_truth_meaningbert:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline BERTScore F1: 0.8482
Baseline MeaningBERT score: 0.6825


In [None]:
ground_truth_alignscore = get_alignscore(ground_truth_reference, ground_truth_original, alignscore_evaluator)
print(f"Baseline AlignScore: {ground_truth_alignscore:.4f}")

Computing AlignScore:   0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1687 > 512). Running this sequence through the model will result in indexing errors
Computing AlignScore: 100%|██████████| 100/100 [43:17<00:00, 25.98s/it]

Baseline AlignScore: 0.7551





### Baselines (simple agent)

#### Gemini 2.5 Pro

In [19]:
# Extract predictions and original with matching keys
baseline_gemini_2_5_pro_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        baseline_gemini_2_5_pro_predictions.append(baseline_gemini_2_5_pro[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between baseline_gemini_2_5_pro and original")

Found 100 matching entries between baseline_gemini_2_5_pro and original


In [20]:
# Calculate BERTScore using the function
baseline_gemini_2_5_pro_bertscore = get_bertscore(baseline_gemini_2_5_pro_predictions, ground_truth_original, bertscore)
print(f"Baseline_gemini_2_5_pro BERTScore F1: {baseline_gemini_2_5_pro_bertscore:.4f}")

baseline_gemini_2_5_pro_meaningbert = get_meaningbert_score(baseline_gemini_2_5_pro_predictions, ground_truth_original, meaning_bert)
print(f"Baseline_gemini_2_5_pro MeaningBERT score: {baseline_gemini_2_5_pro_meaningbert:.4f}")

baseline_gemini_2_5_pro_bertscore_reference = get_bertscore(baseline_gemini_2_5_pro_predictions, ground_truth_reference, bertscore)
print(f"Baseline_gemini_2_5_pro BERTScore F1 (reference): {baseline_gemini_2_5_pro_bertscore_reference:.4f}")

baseline_gemini_2_5_pro_meaningbert_reference = get_meaningbert_score(baseline_gemini_2_5_pro_predictions, ground_truth_reference, meaning_bert)
print(f"Baseline_gemini_2_5_pro MeaningBERT score (reference): {baseline_gemini_2_5_pro_meaningbert_reference:.4f}")

baseline_gemini_2_5_pro_pls = get_pls_evaluation(baseline_gemini_2_5_pro_predictions)

Baseline_gemini_2_5_pro BERTScore F1: 0.8352
Baseline_gemini_2_5_pro MeaningBERT score: 0.5957
Baseline_gemini_2_5_pro BERTScore F1 (reference): 0.8701
Baseline_gemini_2_5_pro MeaningBERT score (reference): 0.7162


Evaluating with PLS API: 100%|██████████| 100/100 [00:20<00:00,  4.86it/s]


In [37]:
# For Baseline_gemini_2_5_pro predictions vs original
baseline_gemini_2_5_pro_alignscore = get_alignscore(baseline_gemini_2_5_pro_predictions, ground_truth_original, alignscore_evaluator)
print(f"Baseline_gemini_2_5_pro AlignScore: {baseline_gemini_2_5_pro_alignscore:.4f}")

# For Baseline_gemini_2_5_pro predictions vs reference
baseline_gemini_2_5_pro_alignscore_reference = get_alignscore(baseline_gemini_2_5_pro_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Baseline_gemini_2_5_pro AlignScore (reference): {baseline_gemini_2_5_pro_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [1:05:56<00:00, 39.57s/it]


Baseline_gemini_2_5_pro AlignScore: 0.7820


Computing AlignScore: 100%|██████████| 100/100 [35:18<00:00, 21.18s/it]

Baseline_gemini_2_5_pro AlignScore (reference): 0.7002





#### GPT 5

In [38]:
# Extract predictions and original with matching keys
baseline_gpt_5_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        baseline_gpt_5_predictions.append(baseline_gpt_5[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between baseline_gpt_5 and original")

Found 100 matching entries between baseline_gpt_5 and original


In [40]:
# Calculate BERTScore using the function
baseline_gpt_5_bertscore = get_bertscore(baseline_gpt_5_predictions, ground_truth_original, bertscore)
print(f"Baseline_gpt_5 BERTScore F1: {baseline_gpt_5_bertscore:.4f}")

baseline_gpt_5_meaningbert = get_meaningbert_score(baseline_gpt_5_predictions, ground_truth_original, meaning_bert)
print(f"Baseline_gpt_5 MeaningBERT score: {baseline_gpt_5_meaningbert:.4f}")

baseline_gpt_5_bertscore_reference = get_bertscore(baseline_gpt_5_predictions, ground_truth_reference, bertscore)
print(f"Baseline_gpt_5 BERTScore F1 (reference): {baseline_gpt_5_bertscore_reference:.4f}")

baseline_gpt_5_meaningbert_reference = get_meaningbert_score(baseline_gpt_5_predictions, ground_truth_reference, meaning_bert)
print(f"Baseline_gpt_5 MeaningBERT score (reference): {baseline_gpt_5_meaningbert_reference:.4f}")

baseline_gpt_5_pls = get_pls_evaluation(baseline_gpt_5_predictions)

Baseline_gpt_5 BERTScore F1: 0.8342
Baseline_gpt_5 MeaningBERT score: 0.6075
Baseline_gpt_5 BERTScore F1 (reference): 0.8619
Baseline_gpt_5 MeaningBERT score (reference): 0.6873


Evaluating with PLS API: 100%|██████████| 100/100 [00:26<00:00,  3.71it/s]


In [41]:
# For Baseline_gpt_5 predictions vs original
baseline_gpt_5_alignscore = get_alignscore(baseline_gpt_5_predictions, ground_truth_original, alignscore_evaluator)
print(f"Baseline_gpt_5 AlignScore: {baseline_gpt_5_alignscore:.4f}")

# For Baseline_gpt_5 predictions vs reference
baseline_gpt_5_alignscore_reference = get_alignscore(baseline_gpt_5_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Baseline_gpt_5 AlignScore (reference): {baseline_gpt_5_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [1:01:37<00:00, 36.97s/it]


Baseline_gpt_5 AlignScore: 0.7692


Computing AlignScore: 100%|██████████| 100/100 [44:37<00:00, 26.77s/it]

Baseline_gpt_5 AlignScore (reference): 0.6598





#### GPT OSS 120B

In [105]:
# Extract predictions and original with matching keys
baseline_gpt_oss_120b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        baseline_gpt_oss_120b_predictions.append(baseline_gpt_oss_120b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between baseline_gpt_oss_120b and original")

Found 100 matching entries between baseline_gpt_oss_120b and original


In [None]:
# Calculate BERTScore using the function
baseline_gpt_oss_120b_bertscore = get_bertscore(baseline_gpt_oss_120b_predictions, ground_truth_original, bertscore)
print(f"Baseline_gpt_oss_120b BERTScore F1: {baseline_gpt_oss_120b_bertscore:.4f}")

baseline_gpt_oss_120b_meaningbert = get_meaningbert_score(baseline_gpt_oss_120b_predictions, ground_truth_original, meaning_bert)
print(f"Baseline_gpt_oss_120b MeaningBERT score: {baseline_gpt_oss_120b_meaningbert:.4f}")

baseline_gpt_oss_120b_bertscore_reference = get_bertscore(baseline_gpt_oss_120b_predictions, ground_truth_reference, bertscore)
print(f"Baseline_gpt_oss_120b BERTScore F1 (reference): {baseline_gpt_oss_120b_bertscore_reference:.4f}")

baseline_gpt_oss_120b_meaningbert_reference = get_meaningbert_score(baseline_gpt_oss_120b_predictions, ground_truth_reference, meaning_bert)
print(f"Baseline_gpt_oss_120b MeaningBERT score (reference): {baseline_gpt_oss_120b_meaningbert_reference:.4f}")

baseline_gpt_oss_120b_pls = get_pls_evaluation(baseline_gpt_oss_120b_predictions)

In [None]:
# For Baseline_gpt_oss_120b predictions vs original
baseline_gpt_oss_120b_alignscore = get_alignscore(baseline_gpt_oss_120b_predictions, ground_truth_original, alignscore_evaluator)
print(f"Baseline_gpt_oss_120b AlignScore: {baseline_gpt_oss_120b_alignscore:.4f}")

# For Baseline_gpt_oss_120b predictions vs reference
baseline_gpt_oss_120b_alignscore_reference = get_alignscore(baseline_gpt_oss_120b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Baseline_gpt_oss_120b AlignScore (reference): {baseline_gpt_oss_120b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [45:55<00:00, 27.55s/it]


Baseline_gpt_oss_120b AlignScore: 0.7696


Computing AlignScore: 100%|██████████| 100/100 [29:04<00:00, 17.45s/it]

Baseline_gpt_oss_120b AlignScore (reference): 0.6878





### GPT-OSS 20B

In [35]:
# Extract predictions and original with matching keys
baseline_gpt_oss_20b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        baseline_gpt_oss_20b_predictions.append(baseline_gpt_oss_20b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between baseline_gpt_oss_20b and original")

Found 100 matching entries between baseline_gpt_oss_20b and original


In [48]:
# Calculate BERTScore using the function
baseline_gpt_oss_20b_bertscore = get_bertscore(baseline_gpt_oss_20b_predictions, ground_truth_original, bertscore)
print(f"Baseline_gpt_oss_20b BERTScore F1: {baseline_gpt_oss_20b_bertscore:.4f}")

baseline_gpt_oss_20b_meaningbert = get_meaningbert_score(baseline_gpt_oss_20b_predictions, ground_truth_original, meaning_bert)
print(f"Baseline_gpt_oss_20b MeaningBERT score: {baseline_gpt_oss_20b_meaningbert:.4f}")

baseline_gpt_oss_20b_bertscore_reference = get_bertscore(baseline_gpt_oss_20b_predictions, ground_truth_reference, bertscore)
print(f"Baseline_gpt_oss_20b BERTScore F1 (reference): {baseline_gpt_oss_20b_bertscore_reference:.4f}")

baseline_gpt_oss_20b_meaningbert_reference = get_meaningbert_score(baseline_gpt_oss_20b_predictions, ground_truth_reference, meaning_bert)
print(f"Baseline_gpt_oss_20b MeaningBERT score (reference): {baseline_gpt_oss_20b_meaningbert_reference:.4f}")
baseline_gpt_oss_20b_pls = get_pls_evaluation(baseline_gpt_oss_20b_predictions)

Baseline_gpt_oss_20b BERTScore F1: 0.8327
Baseline_gpt_oss_20b MeaningBERT score: 0.5857
Baseline_gpt_oss_20b BERTScore F1 (reference): 0.8565
Baseline_gpt_oss_20b MeaningBERT score (reference): 0.6697


Evaluating with PLS API: 100%|██████████| 100/100 [00:15<00:00,  6.51it/s]


In [49]:
# For Baseline_gpt_oss_20b predictions vs original
baseline_gpt_oss_20b_alignscore = get_alignscore(baseline_gpt_oss_20b_predictions, ground_truth_original, alignscore_evaluator)
print(f"Baseline_gpt_oss_20b AlignScore: {baseline_gpt_oss_20b_alignscore:.4f}")

# For Baseline_gpt_oss_20b predictions vs reference
baseline_gpt_oss_20b_alignscore_reference = get_alignscore(baseline_gpt_oss_20b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Baseline_gpt_oss_20b AlignScore (reference): {baseline_gpt_oss_20b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [43:55<00:00, 26.36s/it]


Baseline_gpt_oss_20b AlignScore: 0.7396


Computing AlignScore: 100%|██████████| 100/100 [39:50<00:00, 23.91s/it]

Baseline_gpt_oss_20b AlignScore (reference): 0.6480





### Llama 3.3 70B

In [None]:
# Extract predictions and original with matching keys
baseline_llama_3_3_70b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        baseline_llama_3_3_70b_predictions.append(baseline_llama_3_3_70b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between baseline_llama_3_3_70b and original")

Found 100 matching entries between baseline_llama_3_3_70b and original


In [None]:
# Calculate BERTScore using the function
baseline_llama_3_3_70b_bertscore = get_bertscore(baseline_llama_3_3_70b_predictions, ground_truth_original, bertscore)
print(f"Baseline_llama_3_3_70b BERTScore F1: {baseline_llama_3_3_70b_bertscore:.4f}")

baseline_llama_3_3_70b_meaningbert = get_meaningbert_score(baseline_llama_3_3_70b_predictions, ground_truth_original, meaning_bert)
print(f"Baseline_llama_3_3_70b MeaningBERT score: {baseline_llama_3_3_70b_meaningbert:.4f}")

baseline_llama_3_3_70b_bertscore_reference = get_bertscore(baseline_llama_3_3_70b_predictions, ground_truth_reference, bertscore)
print(f"Baseline_llama_3_3_70b BERTScore F1 (reference): {baseline_llama_3_3_70b_bertscore_reference:.4f}")

baseline_llama_3_3_70b_meaningbert_reference = get_meaningbert_score(baseline_llama_3_3_70b_predictions, ground_truth_reference, meaning_bert)
print(f"Baseline_llama_3_3_70b MeaningBERT score (reference): {baseline_llama_3_3_70b_meaningbert_reference:.4f}")

baseline_llama_3_3_70b_pls = get_pls_evaluation(baseline_llama_3_3_70b_predictions)

Baseline_llama_3_3_70b BERTScore F1: 0.8514
Baseline_llama_3_3_70b MeaningBERT score: 0.6985
Baseline_llama_3_3_70b BERTScore F1 (reference): 0.8679
Baseline_llama_3_3_70b MeaningBERT score (reference): 0.7158


Evaluating with PLS API: 100%|██████████| 100/100 [00:14<00:00,  6.71it/s]


In [None]:
# For Baseline_llama_3_3_70b predictions vs original
baseline_llama_3_3_70b_alignscore = get_alignscore(baseline_llama_3_3_70b_predictions, ground_truth_original, alignscore_evaluator)
print(f"Baseline_llama_3_3_70b AlignScore: {baseline_llama_3_3_70b_alignscore:.4f}")

# For Baseline_llama_3_3_70b predictions vs reference
baseline_llama_3_3_70b_alignscore_reference = get_alignscore(baseline_llama_3_3_70b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Baseline_llama_3_3_70b AlignScore (reference): {baseline_llama_3_3_70b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [45:51<00:00, 27.51s/it]


Baseline_llama_3_3_70b AlignScore: 0.7536


Computing AlignScore: 100%|██████████| 100/100 [29:15<00:00, 17.55s/it]

Baseline_llama_3_3_70b AlignScore (reference): 0.7076





### Llama 3.2 3B

In [42]:
# Extract predictions and original with matching keys
baseline_llama_3_2_3b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        baseline_llama_3_2_3b_predictions.append(baseline_llama_3_2_3b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between baseline_llama_3_2_3b and original")

Found 100 matching entries between baseline_llama_3_2_3b and original


In [43]:
# Calculate BERTScore using the function
baseline_llama_3_2_3b_bertscore = get_bertscore(baseline_llama_3_2_3b_predictions, ground_truth_original, bertscore)
print(f"Baseline_llama_3_2_3b BERTScore F1: {baseline_llama_3_2_3b_bertscore:.4f}")

baseline_llama_3_2_3b_meaningbert = get_meaningbert_score(baseline_llama_3_2_3b_predictions, ground_truth_original, meaning_bert)
print(f"Baseline_llama_3_2_3b MeaningBERT score: {baseline_llama_3_2_3b_meaningbert:.4f}")

baseline_llama_3_2_3b_bertscore_reference = get_bertscore(baseline_llama_3_2_3b_predictions, ground_truth_reference, bertscore)
print(f"Baseline_llama_3_2_3b BERTScore F1 (reference): {baseline_llama_3_2_3b_bertscore_reference:.4f}")

baseline_llama_3_2_3b_meaningbert_reference = get_meaningbert_score(baseline_llama_3_2_3b_predictions, ground_truth_reference, meaning_bert)
print(f"Baseline_llama_3_2_3b MeaningBERT score (reference): {baseline_llama_3_2_3b_meaningbert_reference:.4f}")

baseline_llama_3_2_3b_pls = get_pls_evaluation(baseline_llama_3_2_3b_predictions)

Baseline_llama_3_2_3b BERTScore F1: 0.8477
Baseline_llama_3_2_3b MeaningBERT score: 0.6566
Baseline_llama_3_2_3b BERTScore F1 (reference): 0.8467
Baseline_llama_3_2_3b MeaningBERT score (reference): 0.6302


Evaluating with PLS API: 100%|██████████| 100/100 [00:14<00:00,  6.91it/s]


In [50]:
# For Baseline_llama_3_2_3b predictions vs original
baseline_llama_3_2_3b_alignscore = get_alignscore(baseline_llama_3_2_3b_predictions, ground_truth_original, alignscore_evaluator)
print(f"Baseline_llama_3_2_3b AlignScore: {baseline_llama_3_2_3b_alignscore:.4f}")

# For Baseline_llama_3_2_3b predictions vs reference
baseline_llama_3_2_3b_alignscore_reference = get_alignscore(baseline_llama_3_2_3b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"Baseline_llama_3_2_3b AlignScore (reference): {baseline_llama_3_2_3b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [53:14<00:00, 31.94s/it]


Baseline_llama_3_2_3b AlignScore: 0.8499


Computing AlignScore: 100%|██████████| 100/100 [35:52<00:00, 21.52s/it]

Baseline_llama_3_2_3b AlignScore (reference): 0.6982





In [50]:
# Evaluate each model's predictions
print("Evaluating Reference (Gold Standard)...")
reference_pls = get_pls_evaluation(ground_truth_reference)

# Create readability metrics table
readability_table = create_readability_table(
    model_results=[reference_pls, baseline_gemini_2_5_pro_pls, baseline_gpt_5_pls, baseline_gpt_oss_120b_pls],
    model_names=['Reference', 'Baseline_gemini_2_5_pro', 'Baseline_gpt_5', 'Baseline_gpt_oss_120b']
)

print("\nReadability Metrics:")
print(readability_table)

# Create summary percentages table
summary_table = create_summary_table(
    model_results=[reference_pls, baseline_gemini_2_5_pro_pls, baseline_gpt_5_pls, baseline_gpt_oss_120b_pls],
    model_names=['Reference', 'Baseline_gemini_2_5_pro', 'Baseline_gpt_5', 'Baseline_gpt_oss_120b']
)

print("\nSummary Percentages:")
print(summary_table)

Evaluating Reference (Gold Standard)...


Evaluating with PLS API: 100%|██████████| 100/100 [00:18<00:00,  5.52it/s]


Readability Metrics:
                     model       Kincaid           ARI  Coleman-Liau  \
0                Reference  11.38 ± 1.67  11.46 ± 1.89  11.21 ± 1.52   
1  Baseline_gemini_2_5_pro   8.26 ± 0.92   8.01 ± 1.07   9.07 ± 0.81   
2           Baseline_gpt_5   9.62 ± 1.08   9.89 ± 1.18  10.46 ± 1.06   
3    Baseline_gpt_oss_120b   9.82 ± 1.08   9.94 ± 1.25   9.74 ± 0.99   

  FleschReadingEase GunningFogIndex           LIX     SMOGIndex          RIX  \
0      49.23 ± 9.09    16.21 ± 2.06  50.35 ± 4.87  14.23 ± 1.56  6.07 ± 1.23   
1      64.45 ± 4.77    12.38 ± 1.12  41.21 ± 3.04  12.74 ± 1.01  4.02 ± 0.58   
2      57.97 ± 6.27    13.49 ± 1.27  45.08 ± 3.69  14.62 ± 1.10  4.87 ± 0.76   
3      59.10 ± 5.85    14.76 ± 1.30  45.98 ± 3.37  12.96 ± 0.95  5.15 ± 0.75   

  DaleChallIndex  
0    7.37 ± 0.62  
1    6.53 ± 0.37  
2    6.74 ± 0.44  
3    6.77 ± 0.47  

Summary Percentages:
                     model  Best Quartile %  P25 %  P50 %  P75 %  P90 %  \
0                Referen




In [51]:
readability_table

Unnamed: 0,model,Kincaid,ARI,Coleman-Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex
0,Reference,11.38 ± 1.67,11.46 ± 1.89,11.21 ± 1.52,49.23 ± 9.09,16.21 ± 2.06,50.35 ± 4.87,14.23 ± 1.56,6.07 ± 1.23,7.37 ± 0.62
1,Baseline_gemini_2_5_pro,8.26 ± 0.92,8.01 ± 1.07,9.07 ± 0.81,64.45 ± 4.77,12.38 ± 1.12,41.21 ± 3.04,12.74 ± 1.01,4.02 ± 0.58,6.53 ± 0.37
2,Baseline_gpt_5,9.62 ± 1.08,9.89 ± 1.18,10.46 ± 1.06,57.97 ± 6.27,13.49 ± 1.27,45.08 ± 3.69,14.62 ± 1.10,4.87 ± 0.76,6.74 ± 0.44
3,Baseline_gpt_oss_120b,9.82 ± 1.08,9.94 ± 1.25,9.74 ± 0.99,59.10 ± 5.85,14.76 ± 1.30,45.98 ± 3.37,12.96 ± 0.95,5.15 ± 0.75,6.77 ± 0.47


In [52]:
summary_table

Unnamed: 0,model,Best Quartile %,P25 %,P50 %,P75 %,P90 %,P10 %,Beyond P90 %,Below P10 %
0,Reference,52.5,25.89,17.89,26.61,14.78,0.22,14.5,0.11
1,Baseline_gemini_2_5_pro,78.44,45.33,5.28,33.11,12.89,0.0,3.39,0.0
2,Baseline_gpt_5,63.83,41.33,5.56,22.5,9.61,0.0,21.0,0.0
3,Baseline_gpt_oss_120b,73.44,39.39,10.56,34.06,13.22,0.0,2.78,0.0


### Gemini 2.5 Pro

In [132]:
# Extract predictions and original with matching keys
agentic_gemini_2_5_pro_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        agentic_gemini_2_5_pro_predictions.append(agentic_gemini_2_5_pro[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between agentic_gemini_2_5_pro and original")

Found 100 matching entries between agentic_gemini_2_5_pro and original


In [None]:
# Calculate BERTScore using the function
agentic_gemini_2_5_pro_bertscore = get_bertscore(agentic_gemini_2_5_pro_predictions, ground_truth_original, bertscore)
print(f"agentic_gemini_2_5_pro BERTScore F1: {agentic_gemini_2_5_pro_bertscore:.4f}")

agentic_gemini_2_5_pro_meaningbert = get_meaningbert_score(agentic_gemini_2_5_pro_predictions, ground_truth_original, meaning_bert)
print(f"agentic_gemini_2_5_pro MeaningBERT score: {agentic_gemini_2_5_pro_meaningbert:.4f}")

agentic_gemini_2_5_pro_bertscore_reference = get_bertscore(agentic_gemini_2_5_pro_predictions, ground_truth_reference, bertscore)
print(f"agentic_gemini_2_5_pro BERTScore F1 (reference): {agentic_gemini_2_5_pro_bertscore_reference:.4f}")

agentic_gemini_2_5_pro_meaningbert_reference = get_meaningbert_score(agentic_gemini_2_5_pro_predictions, ground_truth_reference, meaning_bert)
print(f"agentic_gemini_2_5_pro MeaningBERT score (reference): {agentic_gemini_2_5_pro_meaningbert_reference:.4f}")

agentic_gemini_2_5_pro_pls = get_pls_evaluation(agentic_gemini_2_5_pro_predictions)

agentic_gpt_oss_120b BERTScore F1: 0.8469
agentic_gpt_oss_120b MeaningBERT score: 0.6136
agentic_gpt_oss_120b BERTScore F1 (reference): 0.8736
agentic_gpt_oss_120b MeaningBERT score (reference): 0.7153


Evaluating with PLS API: 100%|██████████| 100/100 [00:16<00:00,  6.11it/s]


In [134]:
# For agentic_gemini_2_5_pro predictions vs original
agentic_gemini_2_5_pro_alignscore = get_alignscore(agentic_gemini_2_5_pro_predictions, ground_truth_original, alignscore_evaluator)
print(f"agentic_gemini_2_5_pro AlignScore: {agentic_gemini_2_5_pro_alignscore:.4f}")

# For agentic_gemini_2_5_pro predictions vs reference
agentic_gemini_2_5_pro_alignscore_reference = get_alignscore(agentic_gemini_2_5_pro_predictions, ground_truth_reference, alignscore_evaluator)
print(f"agentic_gemini_2_5_pro AlignScore (reference): {agentic_gemini_2_5_pro_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [38:45<00:00, 23.25s/it]


agentic_gemini_2_5_pro AlignScore: 0.7992


Computing AlignScore: 100%|██████████| 100/100 [29:30<00:00, 17.71s/it]

agentic_gemini_2_5_pro AlignScore (reference): 0.7219





### Agentic GPT-OSS 120B 

In [135]:
# Extract predictions and original with matching keys
agentic_gpt_oss_120b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        agentic_gpt_oss_120b_predictions.append(agentic_gpt_oss_120b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between agentic_gpt_oss_120b and original")

Found 100 matching entries between agentic_gpt_oss_120b and original


In [136]:
# Calculate BERTScore using the function
agentic_gpt_oss_120b_bertscore = get_bertscore(agentic_gpt_oss_120b_predictions, ground_truth_original, bertscore)
print(f"agentic_gpt_oss_120b BERTScore F1: {agentic_gpt_oss_120b_bertscore:.4f}")

agentic_gpt_oss_120b_meaningbert = get_meaningbert_score(agentic_gpt_oss_120b_predictions, ground_truth_original, meaning_bert)
print(f"agentic_gpt_oss_120b MeaningBERT score: {agentic_gpt_oss_120b_meaningbert:.4f}")

agentic_gpt_oss_120b_bertscore_reference = get_bertscore(agentic_gpt_oss_120b_predictions, ground_truth_reference, bertscore)
print(f"agentic_gpt_oss_120b BERTScore F1 (reference): {agentic_gpt_oss_120b_bertscore_reference:.4f}")

agentic_gpt_oss_120b_meaningbert_reference = get_meaningbert_score(agentic_gpt_oss_120b_predictions, ground_truth_reference, meaning_bert)
print(f"agentic_gpt_oss_120b MeaningBERT score (reference): {agentic_gpt_oss_120b_meaningbert_reference:.4f}")

agentic_gpt_oss_120b_pls = get_pls_evaluation(agentic_gpt_oss_120b_predictions)

agentic_gpt_oss_120b BERTScore F1: 0.8464
agentic_gpt_oss_120b MeaningBERT score: 0.6557
agentic_gpt_oss_120b BERTScore F1 (reference): 0.8651
agentic_gpt_oss_120b MeaningBERT score (reference): 0.7393


Evaluating with PLS API: 100%|██████████| 100/100 [00:19<00:00,  5.20it/s]


In [137]:
# For agentic_gpt_oss_120b predictions vs original
agentic_gpt_oss_120b_alignscore = get_alignscore(agentic_gpt_oss_120b_predictions, ground_truth_original, alignscore_evaluator)
print(f"agentic_gpt_oss_120b AlignScore: {agentic_gpt_oss_120b_alignscore:.4f}")

# For agentic_gpt_oss_120b predictions vs reference
agentic_gpt_oss_120b_alignscore_reference = get_alignscore(agentic_gpt_oss_120b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"agentic_gpt_oss_120b AlignScore (reference): {agentic_gpt_oss_120b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [43:26<00:00, 26.07s/it]


agentic_gpt_oss_120b AlignScore: 0.7953


Computing AlignScore: 100%|██████████| 100/100 [31:31<00:00, 18.92s/it]

agentic_gpt_oss_120b AlignScore (reference): 0.6595





### Agentic GPT-OSS 20B

In [38]:
# Extract predictions and original with matching keys
agentic_gpt_oss_20b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        agentic_gpt_oss_20b_predictions.append(agentic_gpt_oss_20b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between agentic_gpt_oss_20b and original")

Found 100 matching entries between agentic_gpt_oss_20b and original


In [39]:
# Calculate BERTScore using the function
agentic_gpt_oss_20b_bertscore = get_bertscore(agentic_gpt_oss_20b_predictions, ground_truth_original, bertscore)
print(f"agentic_gpt_oss_20b BERTScore F1: {agentic_gpt_oss_20b_bertscore:.4f}")

agentic_gpt_oss_20b_meaningbert = get_meaningbert_score(agentic_gpt_oss_20b_predictions, ground_truth_original, meaning_bert)
print(f"agentic_gpt_oss_20b MeaningBERT score: {agentic_gpt_oss_20b_meaningbert:.4f}")

agentic_gpt_oss_20b_bertscore_reference = get_bertscore(agentic_gpt_oss_20b_predictions, ground_truth_reference, bertscore)
print(f"agentic_gpt_oss_20b BERTScore F1 (reference): {agentic_gpt_oss_20b_bertscore_reference:.4f}")

agentic_gpt_oss_20b_meaningbert_reference = get_meaningbert_score(agentic_gpt_oss_20b_predictions, ground_truth_reference, meaning_bert)
print(f"agentic_gpt_oss_20b MeaningBERT score (reference): {agentic_gpt_oss_20b_meaningbert_reference:.4f}")

agentic_gpt_oss_20b_pls = get_pls_evaluation(agentic_gpt_oss_20b_predictions)

agentic_gpt_oss_20b BERTScore F1: 0.8422
agentic_gpt_oss_20b MeaningBERT score: 0.6680
agentic_gpt_oss_20b BERTScore F1 (reference): 0.8615
agentic_gpt_oss_20b MeaningBERT score (reference): 0.7534


Evaluating with PLS API: 100%|██████████| 100/100 [00:18<00:00,  5.44it/s]


In [51]:
# For agentic_gpt_oss_20b predictions vs original
agentic_gpt_oss_20b_alignscore = get_alignscore(agentic_gpt_oss_20b_predictions, ground_truth_original, alignscore_evaluator)
print(f"agentic_gpt_oss_20b AlignScore: {agentic_gpt_oss_20b_alignscore:.4f}")

# For agentic_gpt_oss_20b predictions vs reference
agentic_gpt_oss_20b_alignscore_reference = get_alignscore(agentic_gpt_oss_20b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"agentic_gpt_oss_20b AlignScore (reference): {agentic_gpt_oss_20b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [1:24:11<00:00, 50.52s/it]


agentic_gpt_oss_20b AlignScore: 0.8002


Computing AlignScore: 100%|██████████| 100/100 [45:52<00:00, 27.53s/it]

agentic_gpt_oss_20b AlignScore (reference): 0.6590





### Agentic Llama 3.3 70B 

In [109]:
# Extract predictions and original with matching keys
agentic_llama_3_3_70b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        agentic_llama_3_3_70b_predictions.append(agentic_llama_3_3_70b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between agentic_llama_3_3_70b and original")

Found 100 matching entries between agentic_llama_3_3_70b and original


In [110]:
# Calculate BERTScore using the function
agentic_llama_3_3_70b_bertscore = get_bertscore(agentic_llama_3_3_70b_predictions, ground_truth_original, bertscore)
print(f"agentic_llama_3_3_70b BERTScore F1: {agentic_llama_3_3_70b_bertscore:.4f}")

agentic_llama_3_3_70b_meaningbert = get_meaningbert_score(agentic_llama_3_3_70b_predictions, ground_truth_original, meaning_bert)
print(f"agentic_llama_3_3_70b MeaningBERT score: {agentic_llama_3_3_70b_meaningbert:.4f}")

agentic_llama_3_3_70b_bertscore_reference = get_bertscore(agentic_llama_3_3_70b_predictions, ground_truth_reference, bertscore)
print(f"agentic_llama_3_3_70b BERTScore F1 (reference): {agentic_llama_3_3_70b_bertscore_reference:.4f}")

agentic_llama_3_3_70b_meaningbert_reference = get_meaningbert_score(agentic_llama_3_3_70b_predictions, ground_truth_reference, meaning_bert)
print(f"agentic_llama_3_3_70b MeaningBERT score (reference): {agentic_llama_3_3_70b_meaningbert_reference:.4f}")

agentic_llama_3_3_70b_pls = get_pls_evaluation(agentic_llama_3_3_70b_predictions)

agentic_llama_3_3_70b BERTScore F1: 0.8549
agentic_llama_3_3_70b MeaningBERT score: 0.6818
agentic_llama_3_3_70b BERTScore F1 (reference): 0.8708
agentic_llama_3_3_70b MeaningBERT score (reference): 0.7446


Evaluating with PLS API: 100%|██████████| 100/100 [00:21<00:00,  4.63it/s]


In [111]:
# For agentic_llama_3_3_70b predictions vs original
agentic_llama_3_3_70b_alignscore = get_alignscore(agentic_llama_3_3_70b_predictions, ground_truth_original, alignscore_evaluator)
print(f"agentic_llama_3_3_70b AlignScore: {agentic_llama_3_3_70b_alignscore:.4f}")

# For agentic_llama_3_3_70b predictions vs reference
agentic_llama_3_3_70b_alignscore_reference = get_alignscore(agentic_llama_3_3_70b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"agentic_llama_3_3_70b AlignScore (reference): {agentic_llama_3_3_70b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [44:33<00:00, 26.74s/it] 


agentic_llama_3_3_70b AlignScore: 0.7823


Computing AlignScore: 100%|██████████| 100/100 [23:06<00:00, 13.86s/it]

agentic_llama_3_3_70b AlignScore (reference): 0.7140





### Agentic Llama 3.2 3B

In [45]:
# Extract predictions and original with matching keys
agentic_llama_3_2_3b_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        agentic_llama_3_2_3b_predictions.append(agentic_llama_3_2_3b[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between agentic_llama_3_2_3b and original")

Found 100 matching entries between agentic_llama_3_2_3b and original


In [46]:
# Calculate BERTScore using the function
agentic_llama_3_2_3b_bertscore = get_bertscore(agentic_llama_3_2_3b_predictions, ground_truth_original, bertscore)
print(f"agentic_llama_3_2_3b BERTScore F1: {agentic_llama_3_2_3b_bertscore:.4f}")

agentic_llama_3_2_3b_meaningbert = get_meaningbert_score(agentic_llama_3_2_3b_predictions, ground_truth_original, meaning_bert)
print(f"agentic_llama_3_2_3b MeaningBERT score: {agentic_llama_3_2_3b_meaningbert:.4f}")

agentic_llama_3_2_3b_bertscore_reference = get_bertscore(agentic_llama_3_2_3b_predictions, ground_truth_reference, bertscore)
print(f"agentic_llama_3_2_3b BERTScore F1 (reference): {agentic_llama_3_2_3b_bertscore_reference:.4f}")

agentic_llama_3_2_3b_meaningbert_reference = get_meaningbert_score(agentic_llama_3_2_3b_predictions, ground_truth_reference, meaning_bert)
print(f"agentic_llama_3_2_3b MeaningBERT score (reference): {agentic_llama_3_2_3b_meaningbert_reference:.4f}")

agentic_llama_3_2_3b_pls = get_pls_evaluation(agentic_llama_3_2_3b_predictions)

agentic_llama_3_2_3b BERTScore F1: 0.8551
agentic_llama_3_2_3b MeaningBERT score: 0.6952
agentic_llama_3_2_3b BERTScore F1 (reference): 0.8532
agentic_llama_3_2_3b MeaningBERT score (reference): 0.6672


Evaluating with PLS API: 100%|██████████| 100/100 [00:15<00:00,  6.28it/s]


In [52]:
# For agentic_llama_3_2_3b predictions vs original
agentic_llama_3_2_3b_alignscore = get_alignscore(agentic_llama_3_2_3b_predictions, ground_truth_original, alignscore_evaluator)
print(f"agentic_llama_3_2_3b AlignScore: {agentic_llama_3_2_3b_alignscore:.4f}")

# For agentic_llama_3_2_3b predictions vs reference
agentic_llama_3_2_3b_alignscore_reference = get_alignscore(agentic_llama_3_2_3b_predictions, ground_truth_reference, alignscore_evaluator)
print(f"agentic_llama_3_2_3b AlignScore (reference): {agentic_llama_3_2_3b_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [46:52<00:00, 28.12s/it] 


agentic_llama_3_2_3b AlignScore: 0.8403


Computing AlignScore: 100%|██████████| 100/100 [35:06<00:00, 21.07s/it]

agentic_llama_3_2_3b AlignScore (reference): 0.6706





### Agentic Llama 3.3 70B with Gemini 2.5 Flash as Evaluator

In [113]:
# Extract predictions and original with matching keys
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions.append(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between agentic_llama_3_3_70b_evaluator_gemini_2_5_flash and original")

Found 100 matching entries between agentic_llama_3_3_70b_evaluator_gemini_2_5_flash and original


In [114]:
# Calculate BERTScore using the function
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_bertscore = get_bertscore(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions, ground_truth_original, bertscore)
print(f"agentic_llama_3_3_70b_evaluator_gemini_2_5_flash BERTScore F1: {agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_bertscore:.4f}")

agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_meaningbert = get_meaningbert_score(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions, ground_truth_original, meaning_bert)
print(f"agentic_llama_3_3_70b_evaluator_gemini_2_5_flash MeaningBERT score: {agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_meaningbert:.4f}")

agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_bertscore_reference = get_bertscore(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions, ground_truth_reference, bertscore)
print(f"agentic_llama_3_3_70b_evaluator_gemini_2_5_flash BERTScore F1 (reference): {agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_bertscore_reference:.4f}")

agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_meaningbert_reference = get_meaningbert_score(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions, ground_truth_reference, meaning_bert)
print(f"agentic_llama_3_3_70b_evaluator_gemini_2_5_flash MeaningBERT score (reference): {agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_meaningbert_reference:.4f}")

agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_pls = get_pls_evaluation(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions)

agentic_llama_3_3_70b_evaluator_gemini_2_5_flash BERTScore F1: 0.8485
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash MeaningBERT score: 0.6514
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash BERTScore F1 (reference): 0.8711
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash MeaningBERT score (reference): 0.7325


Evaluating with PLS API: 100%|██████████| 100/100 [00:16<00:00,  6.15it/s]


In [115]:
# For agentic_llama_3_3_70b_evaluator_gemini_2_5_flash predictions vs original
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_alignscore = get_alignscore(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions, ground_truth_original, alignscore_evaluator)
print(f"agentic_llama_3_3_70b_evaluator_gemini_2_5_flash AlignScore: {agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_alignscore:.4f}")

# For agentic_llama_3_3_70b_evaluator_gemini_2_5_flash predictions vs reference
agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_alignscore_reference = get_alignscore(agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_predictions, ground_truth_reference, alignscore_evaluator)
print(f"agentic_llama_3_3_70b_evaluator_gemini_2_5_flash AlignScore (reference): {agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [37:12<00:00, 22.32s/it]


agentic_llama_3_3_70b_evaluator_gemini_2_5_flash AlignScore: 0.7644


Computing AlignScore: 100%|██████████| 100/100 [25:58<00:00, 15.58s/it]

agentic_llama_3_3_70b_evaluator_gemini_2_5_flash AlignScore (reference): 0.6982





In [162]:
# Evaluate each model's predictions
# print("Evaluating Reference (Gold Standard)...")
# reference_pls = get_pls_evaluation(ground_truth_reference)

# Create readability metrics table
readability_table = create_readability_table(
    model_results=[reference_pls, baseline_gemini_2_5_pro_pls, baseline_gpt_5_pls, baseline_gpt_oss_120b_pls, baseline_llama_3_3_70b_pls, agentic_gemini_2_5_pro_pls,agentic_gpt_oss_120b_pls, agentic_llama_3_3_70b_pls, agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_pls],
    model_names=['Reference', 'Baseline_gemini_2_5_pro', 'Baseline_gpt_5', 'Baseline_gpt_oss_120b', 'baseline_llama_3_3_70b', 'agentic_gemini_2_5_pro', 'Agentic_gpt_oss_120b', 'Agentic_llama_3_3_70b', 'Agentic_llama_3_3_70b_evaluator_gemini_2_5_flash']
)
# Create summary percentages table
summary_table = create_summary_table(
    model_results=[reference_pls, baseline_gemini_2_5_pro_pls, baseline_gpt_5_pls, baseline_gpt_oss_120b_pls, baseline_llama_3_3_70b_pls, agentic_gemini_2_5_pro_pls, agentic_gpt_oss_120b_pls, agentic_llama_3_3_70b_pls, agentic_llama_3_3_70b_evaluator_gemini_2_5_flash_pls],
    model_names=['Reference', 'Baseline_gemini_2_5_pro', 'Baseline_gpt_5', 'Baseline_gpt_oss_120b', 'baseline_llama_3_3_70b', 'Agentic_gpt_oss_120b', 'Agentic_llama_3_3_70b', 'Agentic_llama_3_3_70b_evaluator_gemini_2_5_flash']
)

In [None]:
# Evaluate each model's predictions
# print("Evaluating Reference (Gold Standard)...")
# reference_pls = get_pls_evaluation(ground_truth_reference)

# Create readability metrics table
readability_table = create_readability_table(
    model_results=[baseline_gpt_oss_20b_pls, baseline_llama_3_2_3b_pls, agentic_gpt_oss_20b_pls, agentic_llama_3_2_3b_pls],
    model_names=['Baseline_gpt_oss_20b', 'baseline_llama_3_2_3b', 'Agentic_gpt_oss_20b', 'Agentic_llama_3_2_3b']
)
# Create summary percentages table
summary_table = create_summary_table(
    model_results=[baseline_gpt_oss_20b_pls, baseline_llama_3_2_3b_pls, agentic_gpt_oss_20b_pls, agentic_llama_3_2_3b_pls],
    model_names=['Baseline_gpt_oss_20b', 'baseline_llama_3_2_3b', 'Agentic_gpt_oss_20b', 'Agentic_llama_3_2_3b']
)

In [55]:
# Build a consistent summary DataFrame for the selected model set.
# Use a mapping of display name -> variable prefix so we can safely fetch the variables.
model_map = [
    ('Baseline_gpt_oss_20b', 'baseline_gpt_oss_20b'),
    ('Baseline_llama_3_2_3b', 'baseline_llama_3_2_3b'),
    ('Agentic_gpt_oss_20b', 'agentic_gpt_oss_20b'),
    ('Agentic_llama_3_2_3b', 'agentic_llama_3_2_3b'),
]

def safe_get(name, default=np.nan):
    return globals().get(name, default)

models = []
bertscore_orig = []
meaningbert_orig = []
alignscore_orig = []
bertscore_ref = []
meaningbert_ref = []
alignscore_ref = []
flesch_kincaid = []
ari = []
excellence_rate = []

for display, prefix in model_map:
    models.append(display)
    b1 = safe_get(f"{prefix}_bertscore", np.nan)
    m1 = safe_get(f"{prefix}_meaningbert", np.nan)
    a1 = safe_get(f"{prefix}_alignscore", np.nan)
    bref = safe_get(f"{prefix}_bertscore_reference", np.nan)
    mref = safe_get(f"{prefix}_meaningbert_reference", np.nan)
    aref = safe_get(f"{prefix}_alignscore_reference", np.nan)
    pls = safe_get(f"{prefix}_pls", {})

    # extract readability/summary fields if pls is a dict with expected keys
    try:
        fk = pls['readability']['flesch_kincaid_grade']['mean']
    except Exception:
        fk = np.nan
    try:
        ariv = pls['readability']['automated_readability_index']['mean']
    except Exception:
        ariv = np.nan
    try:
        exr = pls['summary']['best_quartile_rate']
    except Exception:
        exr = np.nan

    bertscore_orig.append(b1)
    meaningbert_orig.append(m1)
    alignscore_orig.append(a1)
    bertscore_ref.append(bref)
    meaningbert_ref.append(mref)
    alignscore_ref.append(aref)
    flesch_kincaid.append(fk)
    ari.append(ariv)
    excellence_rate.append(exr)

df = pd.DataFrame({
    'Model': models,
    'BERTScore F1 (original)': bertscore_orig,
    'MeaningBERT (original)': meaningbert_orig,
    'AlignScore (original)': alignscore_orig,
    'BERTScore F1 (reference)': bertscore_ref,
    'MeaningBERT (reference)': meaningbert_ref,
    'AlignScore (reference)': alignscore_ref,
    'Flesch-Kincaid': flesch_kincaid,
    'ARI': ari,
    'Excellence Rate (%)': excellence_rate,
})

df

Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Excellence Rate (%)
0,Baseline_gpt_oss_20b,0.8327,0.5857,0.7396,0.8565,0.6697,0.648,10.17,8.89,71.88
1,Baseline_llama_3_2_3b,0.8477,0.6566,0.8499,0.8467,0.6302,0.6982,9.58,8.95,69.67
2,Agentic_gpt_oss_20b,0.8422,0.668,0.8002,0.8615,0.7534,0.659,10.5,10.27,59.32
3,Agentic_llama_3_2_3b,0.8551,0.6952,0.8403,0.8532,0.6672,0.6706,12.93,12.72,45.69


In [56]:
# Save df, readability_table, and summary_table to CSV files
df.to_csv("model_evaluation_summary_gpt_oss_20b_llama_3_2_3b.csv", index=False)
readability_table.to_csv("model_readability_metrics_gpt_oss_20b_llama_3_2_3b.csv", index=False)
summary_table.to_csv("model_summary_percentages_gpt_oss_20b_llama_3_2_3b.csv", index=False)

In [163]:
# Build a consistent summary DataFrame for the selected model set.
# Use a mapping of display name -> variable prefix so we can safely fetch the variables.
model_map = [
    ('Baseline_gpt_5', 'baseline_gpt_5'),
    ('Baseline_gemini_2_5_pro', 'baseline_gemini_2_5_pro'),
    ('Baseline_gpt_oss_120b', 'baseline_gpt_oss_120b'),
    ('baseline_llama_3_3_70b', 'baseline_llama_3_3_70b'),
    ('Agentic_gemini_2_5_pro', 'agentic_gemini_2_5_pro'),
    ('Agentic_gpt_oss_120b', 'agentic_gpt_oss_120b'),
    ('Agentic_llama_3_3_70b', 'agentic_llama_3_3_70b'),
    ('Agentic_llama_3_3_70b_evaluator_gemini_2_5_flash', 'agentic_llama_3_3_70b_evaluator_gemini_2_5_flash'),
]

def safe_get(name, default=np.nan):
    return globals().get(name, default)

models = []
bertscore_orig = []
meaningbert_orig = []
alignscore_orig = []
bertscore_ref = []
meaningbert_ref = []
alignscore_ref = []
flesch_kincaid = []
ari = []
excellence_rate = []

for display, prefix in model_map:
    models.append(display)
    b1 = safe_get(f"{prefix}_bertscore", np.nan)
    m1 = safe_get(f"{prefix}_meaningbert", np.nan)
    a1 = safe_get(f"{prefix}_alignscore", np.nan)
    bref = safe_get(f"{prefix}_bertscore_reference", np.nan)
    mref = safe_get(f"{prefix}_meaningbert_reference", np.nan)
    aref = safe_get(f"{prefix}_alignscore_reference", np.nan)
    pls = safe_get(f"{prefix}_pls", {})

    # extract readability/summary fields if pls is a dict with expected keys
    try:
        fk = pls['readability']['flesch_kincaid_grade']['mean']
    except Exception:
        fk = np.nan
    try:
        ariv = pls['readability']['automated_readability_index']['mean']
    except Exception:
        ariv = np.nan
    try:
        exr = pls['summary']['best_quartile_rate']
    except Exception:
        exr = np.nan

    bertscore_orig.append(b1)
    meaningbert_orig.append(m1)
    alignscore_orig.append(a1)
    bertscore_ref.append(bref)
    meaningbert_ref.append(mref)
    alignscore_ref.append(aref)
    flesch_kincaid.append(fk)
    ari.append(ariv)
    excellence_rate.append(exr)

df = pd.DataFrame({
    'Model': models,
    'BERTScore F1 (original)': bertscore_orig,
    'MeaningBERT (original)': meaningbert_orig,
    'AlignScore (original)': alignscore_orig,
    'BERTScore F1 (reference)': bertscore_ref,
    'MeaningBERT (reference)': meaningbert_ref,
    'AlignScore (reference)': alignscore_ref,
    'Flesch-Kincaid': flesch_kincaid,
    'ARI': ari,
    'Excellence Rate (%)': excellence_rate,
})

df

Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Excellence Rate (%)
0,Baseline_gpt_5,0.8342,0.6075,0.7692,0.8619,0.6873,0.6598,9.62,9.89,63.83
1,Baseline_gemini_2_5_pro,0.8352,0.5957,0.782,0.8701,0.7162,0.7002,8.26,8.01,78.44
2,Baseline_gpt_oss_120b,0.8407,0.6477,0.7696,0.865,0.7346,0.6878,9.82,9.94,73.44
3,baseline_llama_3_3_70b,0.8514,0.6985,0.7536,0.8679,0.7158,0.7076,12.01,12.31,61.56
4,Agentic_gemini_2_5_pro,0.8469,0.6136,0.7992,0.8736,0.7153,0.7219,9.4,9.47,83.22
5,Agentic_gpt_oss_120b,0.8464,0.6557,0.7953,0.8651,0.7393,0.6595,10.83,11.0,57.28
6,Agentic_llama_3_3_70b,0.8549,0.6818,0.7823,0.8708,0.7446,0.714,12.23,12.46,58.89
7,Agentic_llama_3_3_70b_evaluator_gemini_2_5_flash,0.8485,0.6514,0.7644,0.8711,0.7325,0.6982,9.97,10.13,85.28


In [170]:
# Save df, readability_table, and summary_table to CSV files
df.to_csv("model_evaluation_summary.csv", index=False)
readability_table.to_csv("model_readability_metrics.csv", index=False)
summary_table.to_csv("model_summary_percentages.csv", index=False)

In [164]:
readability_table

Unnamed: 0,model,Kincaid,ARI,Coleman-Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex
0,Reference,11.38 ± 1.67,11.46 ± 1.89,11.21 ± 1.52,49.23 ± 9.09,16.21 ± 2.06,50.35 ± 4.87,14.23 ± 1.56,6.07 ± 1.23,7.37 ± 0.62
1,Baseline_gemini_2_5_pro,8.26 ± 0.92,8.01 ± 1.07,9.07 ± 0.81,64.45 ± 4.77,12.38 ± 1.12,41.21 ± 3.04,12.74 ± 1.01,4.02 ± 0.58,6.53 ± 0.37
2,Baseline_gpt_5,9.62 ± 1.08,9.89 ± 1.18,10.46 ± 1.06,57.97 ± 6.27,13.49 ± 1.27,45.08 ± 3.69,14.62 ± 1.10,4.87 ± 0.76,6.74 ± 0.44
3,Baseline_gpt_oss_120b,9.82 ± 1.08,9.94 ± 1.25,9.74 ± 0.99,59.10 ± 5.85,14.76 ± 1.30,45.98 ± 3.37,12.96 ± 0.95,5.15 ± 0.75,6.77 ± 0.47
4,baseline_llama_3_3_70b,12.01 ± 2.08,12.31 ± 2.46,11.05 ± 1.73,48.55 ± 10.40,16.89 ± 2.34,51.89 ± 5.60,12.65 ± 1.57,6.57 ± 1.53,6.87 ± 0.64
5,agentic_gemini_2_5_pro,9.40 ± 0.84,9.47 ± 0.96,10.16 ± 0.93,58.78 ± 5.07,13.54 ± 1.09,44.64 ± 3.09,12.42 ± 0.80,4.74 ± 0.61,6.65 ± 0.41
6,Agentic_gpt_oss_120b,10.83 ± 1.08,11.00 ± 1.33,10.82 ± 1.21,52.66 ± 6.46,16.32 ± 1.28,49.21 ± 3.45,13.87 ± 1.07,5.82 ± 0.79,7.75 ± 0.46
7,Agentic_llama_3_3_70b,12.23 ± 1.76,12.46 ± 1.93,12.59 ± 1.47,42.72 ± 9.22,16.91 ± 1.96,53.08 ± 4.87,13.41 ± 1.62,6.60 ± 1.28,7.54 ± 0.54
8,Agentic_llama_3_3_70b_evaluator_gemini_2_5_flash,9.97 ± 0.87,10.13 ± 0.98,10.91 ± 1.04,54.97 ± 5.52,14.24 ± 1.16,46.69 ± 3.06,12.48 ± 0.95,5.11 ± 0.60,6.84 ± 0.44


In [165]:
summary_table

Unnamed: 0,model,Best Quartile %,P25 %,P50 %,P75 %,P90 %,P10 %,Beyond P90 %,Below P10 %
0,Reference,52.5,25.89,17.89,26.61,14.78,0.22,14.5,0.11
1,Baseline_gemini_2_5_pro,78.44,45.33,5.28,33.11,12.89,0.0,3.39,0.0
2,Baseline_gpt_5,63.83,41.33,5.56,22.5,9.61,0.0,21.0,0.0
3,Baseline_gpt_oss_120b,73.44,39.39,10.56,34.06,13.22,0.0,2.78,0.0
4,baseline_llama_3_3_70b,61.56,29.17,23.33,32.39,9.61,0.44,5.0,0.06
5,Agentic_gpt_oss_120b,83.22,44.67,9.11,38.56,7.06,0.0,0.61,0.0
6,Agentic_llama_3_3_70b,57.28,26.17,17.61,31.11,16.78,0.0,8.33,0.0
7,Agentic_llama_3_3_70b_evaluator_gemini_2_5_flash,58.89,20.78,24.22,38.11,10.39,0.17,6.06,0.28


### GPT-5

In [None]:
# Extract predictions and references with matching keys
gpt_5_predictions = []
ground_truth_original = []
matching_ids = []

for cochrane_id in original:
    if cochrane_id in original:
        gpt_5_predictions.append(gpt_5[cochrane_id])
        ground_truth_original.append(original[cochrane_id])
        matching_ids.append(cochrane_id)

print(f"Found {len(matching_ids)} matching entries between gpt-5 and original")

Found 100 matching entries between gpt-5 and original


In [None]:
# Calculate BERTScore using the function
gpt_5_bertscore = get_bertscore(gpt_5_predictions, ground_truth_original, bertscore)
print(f"GPT-5 BERTScore F1: {gpt_5_bertscore:.4f}")

# Calculate MeaningBERT score for GPT-5 and original
gpt_5_meaningbert = get_meaningbert_score(gpt_5_predictions, ground_truth_original, meaning_bert)
print(f"GPT-5 MeaningBERT score: {gpt_5_meaningbert:.4f}")

# Calculate BERTScore for GPT-5 and reference
gpt_5_bertscore_reference = get_bertscore(gpt_5_predictions, ground_truth_reference, bertscore)
print(f"GPT-5 BERTScore F1 (reference): {gpt_5_bertscore_reference:.4f}")

# Calculate MeaningBERT score for GPT-5 and reference
gpt_5_meaningbert_reference = get_meaningbert_score(gpt_5_predictions, ground_truth_reference, meaning_bert)
print(f"GPT-5 MeaningBERT score (reference): {gpt_5_meaningbert_reference:.4f}")

GPT-5 BERTScore F1: 0.8283
GPT-5 MeaningBERT score: 0.5632
GPT-5 BERTScore F1 (reference): 0.8519
GPT-5 MeaningBERT score (reference): 0.6414


In [None]:
gpt_5_alignscore = get_alignscore(gpt_5_predictions, ground_truth_original, alignscore_evaluator)
print(f"GPT-5 AlignScore: {gpt_5_alignscore:.4f}")

gpt_5_alignscore_reference = get_alignscore(gpt_5_predictions, ground_truth_reference, alignscore_evaluator)
print(f"GPT-5 AlignScore (reference): {gpt_5_alignscore_reference:.4f}")

Computing AlignScore: 100%|██████████| 100/100 [1:00:42<00:00, 36.43s/it]


GPT-5 AlignScore: 0.7647


Computing AlignScore: 100%|██████████| 100/100 [40:48<00:00, 24.49s/it]

GPT-5 AlignScore (reference): 0.6296





### Readability

In [None]:
# Evaluate each model's predictions
print("Evaluating Reference (Gold Standard)...")
reference_pls = get_pls_evaluation(ground_truth_reference)

print("Evaluating GPT-5...")
gpt5_pls = get_pls_evaluation(gpt_5_predictions)

print("Evaluating Gemini...")
gemini_pls = get_pls_evaluation(gemini_predictions)

print("Evaluating GPT-OSS-120B...")
gpt_oss_120b_pls = get_pls_evaluation(gpt_oss_120b_predictions)

# Create readability metrics table
readability_table = create_readability_table(
    model_results=[reference_pls, gpt5_pls, gemini_pls, gpt_oss_120b_pls],
    model_names=['Reference', 'GPT-5', 'Gemini', 'GPT-OSS-120B']
)

print("\nReadability Metrics:")
print(readability_table)

# Create summary percentages table
summary_table = create_summary_table(
    model_results=[reference_pls, gpt5_pls, gemini_pls, gpt_oss_120b_pls],
    model_names=['Reference', 'GPT-5', 'Gemini', 'GPT-OSS-120B']
)

print("\nSummary Percentages:")
print(summary_table)

# Access individual metrics if needed
print(f"\nGPT-5 Flesch-Kincaid Grade: {gpt5_pls['readability']['flesch_kincaid_grade']['mean']:.2f}")
print(f"GPT-5 Excellence Rate: {gpt5_pls['summary']['excellence_rate']:.2f}%")

Evaluating Reference (Gold Standard)...


Evaluating with PLS API: 100%|██████████| 100/100 [00:30<00:00,  3.26it/s]


Evaluating GPT-5...


Evaluating with PLS API: 100%|██████████| 100/100 [00:36<00:00,  2.78it/s]


Evaluating Gemini...


Evaluating with PLS API: 100%|██████████| 100/100 [00:31<00:00,  3.14it/s]


Evaluating GPT-OSS-120B...


Evaluating with PLS API: 100%|██████████| 100/100 [00:32<00:00,  3.09it/s]


Readability Metrics:
          model       Kincaid           ARI  Coleman-Liau FleschReadingEase  \
0     Reference  11.38 ± 1.67  11.46 ± 1.89  11.21 ± 1.52      49.23 ± 9.09   
1         GPT-5   9.04 ± 1.40   9.51 ± 1.50  10.23 ± 1.30      61.31 ± 7.94   
2        Gemini   8.37 ± 1.14   8.31 ± 1.22   9.33 ± 1.06      64.06 ± 6.46   
3  GPT-OSS-120B  10.52 ± 1.39  10.67 ± 1.63  10.86 ± 1.30      53.37 ± 7.52   

  GunningFogIndex           LIX     SMOGIndex          RIX DaleChallIndex  
0    16.21 ± 2.06  50.35 ± 4.87  14.23 ± 1.56  6.07 ± 1.23    7.37 ± 0.62  
1    12.93 ± 1.63  42.24 ± 4.62  13.64 ± 1.44  4.34 ± 0.92    7.15 ± 0.58  
2    12.25 ± 1.36  41.34 ± 4.03  12.00 ± 0.91  4.08 ± 0.74    6.35 ± 0.50  
3    15.84 ± 1.54  48.06 ± 4.29  14.73 ± 1.17  5.53 ± 0.99    7.82 ± 0.48  

Summary Percentages:
          model  Excellent %  Good %  Acceptable %  Poor %  Critical %  \
0     Reference        37.83   17.89         14.67   15.00       14.61   
1         GPT-5        56.61    




In [2]:
df

Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Coleman-Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,Excellence Rate (%)
0,Reference,0.8482,0.6825,0.7551,-,-,-,11.38 ± 1.67,11.46 ± 1.89,11.21 ± 1.52,49.23 ± 9.09,16.21 ± 2.06,50.35 ± 4.87,14.23 ± 1.56,6.07 ± 1.23,7.37 ± 0.62,37.83
1,GPT-5,0.8283,0.5632,0.7647,0.8519,0.6414,0.6296,9.04 ± 1.40,9.51 ± 1.50,10.23 ± 1.30,61.31 ± 7.94,12.93 ± 1.63,42.24 ± 4.62,13.64 ± 1.44,4.34 ± 0.92,7.15 ± 0.58,56.61
2,Gemini,0.8419,0.582,0.7893,0.8712,0.6850,0.7189,8.37 ± 1.14,8.31 ± 1.22,9.33 ± 1.06,64.06 ± 6.46,12.25 ± 1.36,41.34 ± 4.03,12.00 ± 0.91,4.08 ± 0.74,6.35 ± 0.50,63.78
3,GPT-OSS-120B,0.8311,0.6408,0.7925,0.8556,0.6713,0.6713,10.52 ± 1.39,10.67 ± 1.63,10.86 ± 1.30,53.37 ± 7.52,15.84 ± 1.54,48.06 ± 4.29,14.73 ± 1.17,5.53 ± 0.99,7.82 ± 0.48,43.44


In [66]:
table = pd.DataFrame({
    'Model': ['Reference', 'GPT-5', 'Gemini', 'GPT-OSS-120B'],
    'BERTScore F1 (original)': [baseline_bertscore, gpt_5_bertscore, gemini_bertscore, gpt_oss_120b_bertscore],
    'MeaningBERT (original)': [baseline_meaningbert, gpt_5_meaningbert, gemini_meaningbert, gpt_oss_120b_meaningbert],
    'AlignScore (original)': [baseline_alignscore, gpt_5_alignscore, gemini_alignscore, gpt_oss_120b_alignscore],
    'BERTScore F1 (reference)': ["-", gpt_5_bertscore_reference, gemini_bertscore_reference, gpt_oss_120b_bertscore_reference],
    'MeaningBERT (reference)': ["-", gpt_5_meaningbert_reference, gemini_meaningbert_reference, gpt_oss_120b_alignscore_reference],
    'AlignScore (reference)': ["-", gpt_5_alignscore_reference, gemini_alignscore_reference, gpt_oss_120b_alignscore_reference],
    'Flesch-Kincaid': [reference_pls['readability']['flesch_kincaid_grade']['mean'],
                        gpt5_pls['readability']['flesch_kincaid_grade']['mean'],
                        gemini_pls['readability']['flesch_kincaid_grade']['mean'],
                        gpt_oss_120b_pls['readability']['flesch_kincaid_grade']['mean']],
    'ARI': [reference_pls['readability']['automated_readability_index']['mean'],
            gpt5_pls['readability']['automated_readability_index']['mean'],
            gemini_pls['readability']['automated_readability_index']['mean'],
            gpt_oss_120b_pls['readability']['automated_readability_index']['mean']],
    'Excellence Rate (%)': [reference_pls['summary']['excellence_rate'],
                            gpt5_pls['summary']['excellence_rate'],
                            gemini_pls['summary']['excellence_rate'],
                            gpt_oss_120b_pls['summary']['excellence_rate']]
})

table

Unnamed: 0,Model,BERTScore F1 (original),MeaningBERT (original),AlignScore (original),BERTScore F1 (reference),MeaningBERT (reference),AlignScore (reference),Flesch-Kincaid,ARI,Excellence Rate (%)
0,Reference,0.8482,0.6825,0.7551,-,-,-,11.38,11.46,37.83
1,GPT-5,0.8283,0.5632,0.7647,0.8519,0.6414,0.6296,9.04,9.51,56.61
2,Gemini,0.8419,0.582,0.7893,0.8712,0.685,0.7189,8.37,8.31,63.78
3,GPT-OSS-120B,0.8311,0.6408,0.7925,0.8556,0.6713,0.6713,10.52,10.67,43.44


In [65]:
%pip install openpyxl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m[31mERROR: Could not find a version that satisfies the requirement openpyxl (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for openpyxl[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import datetime

# Create timestamp for file naming
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Save as CSV
table.to_csv(f'outputs/evaluation_results_{timestamp}.csv', index=False)
print(f"Results saved to outputs/evaluation_results_{timestamp}.csv")

# Save as Excel with formatting
with pd.ExcelWriter(f'outputs/evaluation_results_{timestamp}.xlsx', engine='openpyxl') as writer:
    table.to_excel(writer, sheet_name='Evaluation Metrics', index=False)

    # Also save the readability table in a separate sheet
    readability_table.to_excel(writer, sheet_name='Readability Details', index=False)

    # Get the workbook and worksheet to apply formatting
    workbook = writer.book
    worksheet = writer.sheets['Evaluation Metrics']

    # Auto-adjust column widths
    for column in worksheet.columns:
        max_length = 0
        column_letter = column[0].column_letter
        for cell in column:
            try:
                if len(str(cell.value)) > max_length:
                    max_length = len(str(cell.value))
            except:
                pass
        adjusted_width = min(max_length + 2, 30)
        worksheet.column_dimensions[column_letter].width = adjusted_width

print(f"Results saved to outputs/evaluation_results_{timestamp}.xlsx")

# Save as JSON for programmatic access
results_dict = table.to_dict('records')
with open(f'outputs/evaluation_results_{timestamp}.json', 'w') as f:
    json.dump(results_dict, f, indent=2)
print(f"Results saved to outputs/evaluation_results_{timestamp}.json")

# Save as Markdown for documentation
markdown_table = table.to_markdown(index=False)
with open(f'outputs/evaluation_results_{timestamp}.md', 'w') as f:
    f.write("# Evaluation Results\n\n")
    f.write(f"Generated on: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    f.write(markdown_table)
print(f"Results saved to outputs/evaluation_results_{timestamp}.md")

# Save as LaTeX for papers
latex_table = table.to_latex(index=False, caption="Model Evaluation Results", label="tab:eval_results")
with open(f'outputs/evaluation_results_{timestamp}.tex', 'w') as f:
    f.write(latex_table)
print(f"Results saved to outputs/evaluation_results_{timestamp}.tex")

# Display the table
print("\n" + "="*80)
print("EVALUATION RESULTS")
print("="*80)
print(table.to_string(index=False))

Results saved to outputs/evaluation_results_20250819_114333.csv
Results saved to outputs/evaluation_results_20250819_114333.json


ImportError: Missing optional dependency 'tabulate'.  Use pip or conda to install tabulate.