In [None]:
%%capture
!pip install rouge bert-score rouge-score

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertForMaskedLM, BertModel
from bert_score import BERTScorer
from rouge import Rouge
from rouge_score import rouge_scorer

In [None]:
# All inputs and outputs live under one project folder on the mounted drive.
_project_root = "/content/drive/MyDrive/AI/MultiClinSum 2025"

# Directory holding the reference summaries (.txt files).
references_path = _project_root + "/Data/summaries"
# Directory holding the candidate summaries to be evaluated (.txt files).
candidates_path = _project_root + "/Results/Baseline"
# Excel workbook that receives the per-file score table.
output_path = _project_root + "/Results/Paper/Baseline.xlsx"

In [None]:
import os


def _file_id(filename, token_index):
    """Return the integer ID embedded in an underscore-separated file name.

    `token_index` selects which "_"-separated token holds the number
    (-1 for candidate files, -2 for reference files). A trailing ".txt"
    on that token is removed before the conversion to int.
    """
    return int(filename.split('_')[token_index].removesuffix('.txt'))


# Candidate summaries, ordered by the number in their last "_" token.
candidates_filenames_sorted = sorted(
    (name for name in os.listdir(candidates_path) if name.endswith(".txt")),
    key=lambda name: _file_id(name, -1),
)

# Index the reference summaries by the number in their second-to-last "_" token,
# so each candidate is paired with the reference carrying the same ID rather than
# with whatever file happens to sit at the same position after a fixed offset.
references_by_id = {}
for name in os.listdir(references_path):
    if not name.endswith(".txt"):
        continue
    reference_id = _file_id(name, -2)
    if reference_id in references_by_id:
        raise ValueError(
            f"Ambiguous reference ID {reference_id}: "
            f"{references_by_id[reference_id]!r} and {name!r}"
        )
    references_by_id[reference_id] = name

candidate_ids = [_file_id(name, -1) for name in candidates_filenames_sorted]
missing_ids = [cid for cid in candidate_ids if cid not in references_by_id]
if missing_ids:
    # Fail early instead of silently scoring against the wrong reference.
    raise FileNotFoundError(
        f"No reference summary found for candidate IDs: {missing_ids}"
    )

# Same length and order as candidates_filenames_sorted, aligned by ID.
references_filenames_sorted = [references_by_id[cid] for cid in candidate_ids]

In [None]:
# Per-pair results, appended in the order of the sorted file lists.
rouge_scores = []
bert_scores = []

# NOTE(review): rouge-score's default tokenizer and stemmer appear to be
# English/ASCII-oriented; with Spanish text, accented characters may be treated
# as separators. Confirm this is acceptable before comparing against other work.
r_scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
b_scorer = BERTScorer(model_type='bert-base-multilingual-cased') # Requires ~1.5G VRAM

for reference_filename, candidate_filename in zip(references_filenames_sorted, candidates_filenames_sorted):
  print("Processing", candidate_filename)

  # Open the reference first, then the candidate; both are closed before scoring.
  with open(f'{references_path}/{reference_filename}', 'r', encoding='utf8') as reference_file, \
       open(f'{candidates_path}/{candidate_filename}', 'r', encoding='utf8') as candidate_file:
    candidate_summary = candidate_file.read()
    reference_summary = reference_file.read().strip()

  # ROUGE-L for this pair: a dict keyed by 'rougeL'.
  rouge_result = r_scorer.score(target=reference_summary, prediction=candidate_summary)
  rouge_scores.append(rouge_result)

  # BERTScore on a batch of one: a (precision, recall, F1) tuple of 1-element tensors.
  bert_result = b_scorer.score(cands=[candidate_summary], refs=[reference_summary])
  bert_scores.append(bert_result)

Processing multiclinsum_gs_es_501.txt
Processing multiclinsum_gs_es_502.txt
Processing multiclinsum_gs_es_503.txt
Processing multiclinsum_gs_es_504.txt
Processing multiclinsum_gs_es_505.txt
Processing multiclinsum_gs_es_506.txt
Processing multiclinsum_gs_es_507.txt
Processing multiclinsum_gs_es_508.txt
Processing multiclinsum_gs_es_509.txt
Processing multiclinsum_gs_es_510.txt
Processing multiclinsum_gs_es_511.txt
Processing multiclinsum_gs_es_512.txt
Processing multiclinsum_gs_es_513.txt
Processing multiclinsum_gs_es_514.txt
Processing multiclinsum_gs_es_515.txt
Processing multiclinsum_gs_es_516.txt
Processing multiclinsum_gs_es_517.txt
Processing multiclinsum_gs_es_518.txt
Processing multiclinsum_gs_es_519.txt
Processing multiclinsum_gs_es_520.txt
Processing multiclinsum_gs_es_521.txt
Processing multiclinsum_gs_es_522.txt
Processing multiclinsum_gs_es_523.txt
Processing multiclinsum_gs_es_524.txt
Processing multiclinsum_gs_es_525.txt
Processing multiclinsum_gs_es_526.txt
Processing m

In [None]:
# Preview the first few (precision, recall, F1) tensor tuples instead of
# dumping the whole list into the notebook output.
bert_scores[:5]

[(tensor([0.7441]), tensor([0.7484]), tensor([0.7462])),
 (tensor([0.7882]), tensor([0.7745]), tensor([0.7813])),
 (tensor([0.7292]), tensor([0.7474]), tensor([0.7382])),
 (tensor([0.6803]), tensor([0.7442]), tensor([0.7108])),
 (tensor([0.7369]), tensor([0.7716]), tensor([0.7539])),
 (tensor([0.6923]), tensor([0.7713]), tensor([0.7297])),
 (tensor([0.6852]), tensor([0.7353]), tensor([0.7094])),
 (tensor([0.5771]), tensor([0.6694]), tensor([0.6198])),
 (tensor([0.7549]), tensor([0.7625]), tensor([0.7587])),
 (tensor([0.7359]), tensor([0.7574]), tensor([0.7465])),
 (tensor([0.7189]), tensor([0.7845]), tensor([0.7503])),
 (tensor([0.7510]), tensor([0.7566]), tensor([0.7538])),
 (tensor([0.7038]), tensor([0.7539]), tensor([0.7280])),
 (tensor([0.7131]), tensor([0.7794]), tensor([0.7448])),
 (tensor([0.6971]), tensor([0.7581]), tensor([0.7263])),
 (tensor([0.7469]), tensor([0.7888]), tensor([0.7673])),
 (tensor([0.7237]), tensor([0.7968]), tensor([0.7585])),
 (tensor([0.6842]), tensor([0.7

In [None]:
# Pull the ROUGE-L entry out of each per-pair result once, then split it
# into one flat list per metric.
rougel_results = [result['rougeL'] for result in rouge_scores]
rougel_precision_scores = [entry.precision for entry in rougel_results]
rougel_recall_scores = [entry.recall for entry in rougel_results]
rougel_f1_scores = [entry.fmeasure for entry in rougel_results]

# Each BERTScore result is a (precision, recall, F1) tuple of one-element
# tensors; take the single element of each and convert it to a Python float.
bert_precision_scores = [precision[0].item() for precision, _, _ in bert_scores]
bert_recall_scores = [recall[0].item() for _, recall, _ in bert_scores]
bert_f1_scores = [f1[0].item() for _, _, f1 in bert_scores]

In [None]:
import pandas as pd

# One row per candidate file; columns keep the order in which they are listed.
score_columns = {
    'file_name': candidates_filenames_sorted,
    'rougel_precision': rougel_precision_scores,
    'rougel_recall': rougel_recall_scores,
    'rougel_f1': rougel_f1_scores,
    'bert_precision': bert_precision_scores,
    'bert_recall': bert_recall_scores,
    'bert_f1': bert_f1_scores,
}
df = pd.DataFrame(score_columns)

In [None]:
# Display the per-file score table (one row per candidate file).
df

Unnamed: 0,file_name,rougel_precision,rougel_recall,rougel_f1,bert_precision,bert_recall,bert_f1
0,multiclinsum_gs_es_501.txt,0.219512,0.270677,0.242424,0.744090,0.748399,0.746238
1,multiclinsum_gs_es_502.txt,0.303191,0.252212,0.275362,0.788174,0.774514,0.781285
2,multiclinsum_gs_es_503.txt,0.221818,0.273543,0.244980,0.729242,0.747385,0.738202
3,multiclinsum_gs_es_504.txt,0.107143,0.349515,0.164009,0.680330,0.744212,0.710838
4,multiclinsum_gs_es_505.txt,0.190751,0.362637,0.250000,0.736901,0.771617,0.753860
...,...,...,...,...,...,...,...
87,multiclinsum_gs_es_588.txt,0.159259,0.462366,0.236915,0.734367,0.798295,0.764998
88,multiclinsum_gs_es_589.txt,0.157233,0.434783,0.230947,0.698684,0.783234,0.738547
89,multiclinsum_gs_es_590.txt,0.182171,0.470000,0.262570,0.722556,0.783246,0.751678
90,multiclinsum_gs_es_591.txt,0.141509,0.428571,0.212766,0.691818,0.754242,0.721682


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of ROUGE-L precision.
df['rougel_precision'].describe()

Unnamed: 0,rougel_precision
count,92.0
mean,0.218984
std,0.080227
min,0.103175
25%,0.158753
50%,0.200372
75%,0.269396
max,0.52451


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of ROUGE-L recall.
df['rougel_recall'].describe()

Unnamed: 0,rougel_recall
count,92.0
mean,0.422506
std,0.103966
min,0.201342
25%,0.348938
50%,0.418815
75%,0.493548
max,0.692308


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of ROUGE-L F1.
df['rougel_f1'].describe()

Unnamed: 0,rougel_f1
count,92.0
mean,0.278332
std,0.074913
min,0.156962
25%,0.231244
50%,0.261462
75%,0.308522
max,0.534653


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of BERTScore F1.
df['bert_f1'].describe()

Unnamed: 0,bert_f1
count,92.0
mean,0.752884
std,0.031565
min,0.619807
25%,0.734307
50%,0.752369
75%,0.774761
max,0.835826


In [None]:
import os

# Make sure the destination folder exists so the export cannot fail on a
# missing directory after all the scoring work has been done.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the per-file score table to the Excel workbook, without the row index.
df.to_excel(output_path, index=False)